-rw-r--r--Documentation/DocBook/kernel-api.tmpl8
-rw-r--r--Documentation/debugging-via-ohci1394.txt179
-rw-r--r--Documentation/kernel-parameters.txt51
-rw-r--r--Documentation/x86_64/boot-options.txt8
-rw-r--r--Documentation/x86_64/uefi.txt9
-rw-r--r--arch/arm/Kconfig5
-rw-r--r--arch/ia64/Kconfig8
-rw-r--r--arch/ia64/hp/sim/simscsi.c1
-rw-r--r--arch/ia64/ia32/binfmt_elf32.c3
-rw-r--r--arch/ia64/kernel/module.c2
-rw-r--r--arch/m32r/Kconfig5
-rw-r--r--arch/mips/Kconfig5
-rw-r--r--arch/mips/kernel/i8253.c12
-rw-r--r--arch/parisc/Kconfig5
-rw-r--r--arch/powerpc/Kconfig8
-rw-r--r--arch/powerpc/kernel/ptrace.c52
-rw-r--r--arch/powerpc/kernel/vio.c13
-rw-r--r--arch/sparc64/Kconfig8
-rw-r--r--arch/um/kernel/ksyms.c4
-rw-r--r--arch/um/sys-i386/signal.c50
-rw-r--r--arch/um/sys-x86_64/signal.c70
-rw-r--r--arch/x86/Kconfig314
-rw-r--r--arch/x86/Kconfig.cpu65
-rw-r--r--arch/x86/Kconfig.debug129
-rw-r--r--arch/x86/Makefile249
-rw-r--r--arch/x86/Makefile_32175
-rw-r--r--arch/x86/Makefile_64144
-rw-r--r--arch/x86/boot/Makefile10
-rw-r--r--arch/x86/boot/apm.c3
-rw-r--r--arch/x86/boot/boot.h17
-rw-r--r--arch/x86/boot/cmdline.c65
-rw-r--r--arch/x86/boot/compressed/Makefile62
-rw-r--r--arch/x86/boot/compressed/Makefile_3250
-rw-r--r--arch/x86/boot/compressed/Makefile_6430
-rw-r--r--arch/x86/boot/compressed/misc.c (renamed from arch/x86/boot/compressed/misc_32.c)77
-rw-r--r--arch/x86/boot/compressed/misc_64.c371
-rw-r--r--arch/x86/boot/compressed/relocs.c7
-rw-r--r--arch/x86/boot/compressed/vmlinux.scr (renamed from arch/x86/boot/compressed/vmlinux_64.scr)2
-rw-r--r--arch/x86/boot/compressed/vmlinux_32.lds10
-rw-r--r--arch/x86/boot/compressed/vmlinux_32.scr10
-rw-r--r--arch/x86/boot/compressed/vmlinux_64.lds12
-rw-r--r--arch/x86/boot/edd.c13
-rw-r--r--arch/x86/boot/header.S5
-rw-r--r--arch/x86/boot/main.c31
-rw-r--r--arch/x86/boot/pm.c6
-rw-r--r--arch/x86/boot/pmjump.S54
-rw-r--r--arch/x86/boot/video-bios.c3
-rw-r--r--arch/x86/boot/video-vesa.c26
-rw-r--r--arch/x86/boot/video-vga.c20
-rw-r--r--arch/x86/boot/video.c33
-rw-r--r--arch/x86/boot/video.h3
-rw-r--r--arch/x86/boot/voyager.c4
-rw-r--r--arch/x86/configs/i386_defconfig4
-rw-r--r--arch/x86/configs/x86_64_defconfig9
-rw-r--r--arch/x86/ia32/Makefile41
-rw-r--r--arch/x86/ia32/audit.c2
-rw-r--r--arch/x86/ia32/fpu32.c183
-rw-r--r--arch/x86/ia32/ia32_aout.c246
-rw-r--r--arch/x86/ia32/ia32_binfmt.c285
-rw-r--r--arch/x86/ia32/ia32_signal.c472
-rw-r--r--arch/x86/ia32/ia32entry.S11
-rw-r--r--arch/x86/ia32/ipc32.c30
-rw-r--r--arch/x86/ia32/mmap32.c79
-rw-r--r--arch/x86/ia32/ptrace32.c404
-rw-r--r--arch/x86/ia32/sys_ia32.c504
-rw-r--r--arch/x86/ia32/syscall32.c83
-rw-r--r--arch/x86/ia32/syscall32_syscall.S17
-rw-r--r--arch/x86/ia32/tls32.c163
-rw-r--r--arch/x86/ia32/vsyscall-sigreturn.S143
-rw-r--r--arch/x86/ia32/vsyscall-sysenter.S95
-rw-r--r--arch/x86/ia32/vsyscall.lds80
-rw-r--r--arch/x86/kernel/Makefile96
-rw-r--r--arch/x86/kernel/Makefile_3288
-rw-r--r--arch/x86/kernel/Makefile_6445
-rw-r--r--arch/x86/kernel/acpi/Makefile2
-rw-r--r--arch/x86/kernel/acpi/sleep.c87
-rw-r--r--arch/x86/kernel/acpi/sleep_32.c70
-rw-r--r--arch/x86/kernel/acpi/sleep_64.c117
-rw-r--r--arch/x86/kernel/acpi/wakeup_32.S2
-rw-r--r--arch/x86/kernel/acpi/wakeup_64.S32
-rw-r--r--arch/x86/kernel/alternative.c40
-rw-r--r--arch/x86/kernel/aperture_64.c374
-rw-r--r--arch/x86/kernel/apic_32.c156
-rw-r--r--arch/x86/kernel/apic_64.c1257
-rw-r--r--arch/x86/kernel/apm_32.c379
-rw-r--r--arch/x86/kernel/asm-offsets_32.c65
-rw-r--r--arch/x86/kernel/asm-offsets_64.c56
-rw-r--r--arch/x86/kernel/bootflag.c50
-rw-r--r--arch/x86/kernel/bugs_64.c1
-rw-r--r--arch/x86/kernel/cpu/addon_cpuid_features.c2
-rw-r--r--arch/x86/kernel/cpu/amd.c23
-rw-r--r--arch/x86/kernel/cpu/bugs.c5
-rw-r--r--arch/x86/kernel/cpu/common.c179
-rw-r--r--arch/x86/kernel/cpu/cpu.h3
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c25
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c12
-rw-r--r--arch/x86/kernel/cpu/cyrix.c6
-rw-r--r--arch/x86/kernel/cpu/intel.c39
-rw-r--r--arch/x86/kernel/cpu/mcheck/k7.c25
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.h2
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_32.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c45
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd_64.c21
-rw-r--r--arch/x86/kernel/cpu/mcheck/p4.c35
-rw-r--r--arch/x86/kernel/cpu/mcheck/p5.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/p6.c23
-rw-r--r--arch/x86/kernel/cpu/mcheck/winchip.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/amd.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/cyrix.c3
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c27
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c23
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c147
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h9
-rw-r--r--arch/x86/kernel/cpu/mtrr/state.c3
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c1
-rw-r--r--arch/x86/kernel/cpu/proc.c2
-rw-r--r--arch/x86/kernel/cpuid.c2
-rw-r--r--arch/x86/kernel/doublefault_32.c19
-rw-r--r--arch/x86/kernel/ds.c464
-rw-r--r--arch/x86/kernel/e820_32.c241
-rw-r--r--arch/x86/kernel/e820_64.c428
-rw-r--r--arch/x86/kernel/early-quirks.c127
-rw-r--r--arch/x86/kernel/efi.c512
-rw-r--r--arch/x86/kernel/efi_32.c618
-rw-r--r--arch/x86/kernel/efi_64.c134
-rw-r--r--arch/x86/kernel/efi_stub_64.S109
-rw-r--r--arch/x86/kernel/entry_32.S26
-rw-r--r--arch/x86/kernel/entry_64.S101
-rw-r--r--arch/x86/kernel/genapic_64.c15
-rw-r--r--arch/x86/kernel/geode_32.c48
-rw-r--r--arch/x86/kernel/head64.c63
-rw-r--r--arch/x86/kernel/head_32.S17
-rw-r--r--arch/x86/kernel/head_64.S48
-rw-r--r--arch/x86/kernel/hpet.c60
-rw-r--r--arch/x86/kernel/i386_ksyms_32.c7
-rw-r--r--arch/x86/kernel/i387.c479
-rw-r--r--arch/x86/kernel/i387_32.c544
-rw-r--r--arch/x86/kernel/i387_64.c150
-rw-r--r--arch/x86/kernel/i8253.c72
-rw-r--r--arch/x86/kernel/i8259_32.c24
-rw-r--r--arch/x86/kernel/i8259_64.c160
-rw-r--r--arch/x86/kernel/init_task.c1
-rw-r--r--arch/x86/kernel/io_apic_32.c13
-rw-r--r--arch/x86/kernel/io_apic_64.c112
-rw-r--r--arch/x86/kernel/io_delay.c114
-rw-r--r--arch/x86/kernel/ioport.c (renamed from arch/x86/kernel/ioport_32.c)85
-rw-r--r--arch/x86/kernel/ioport_64.c117
-rw-r--r--arch/x86/kernel/irq_32.c22
-rw-r--r--arch/x86/kernel/irq_64.c30
-rw-r--r--arch/x86/kernel/kdebugfs.c65
-rw-r--r--arch/x86/kernel/kprobes.c1066
-rw-r--r--arch/x86/kernel/kprobes_32.c756
-rw-r--r--arch/x86/kernel/kprobes_64.c749
-rw-r--r--arch/x86/kernel/ldt.c (renamed from arch/x86/kernel/ldt_32.c)112
-rw-r--r--arch/x86/kernel/ldt_64.c250
-rw-r--r--arch/x86/kernel/machine_kexec_32.c4
-rw-r--r--arch/x86/kernel/machine_kexec_64.c5
-rw-r--r--arch/x86/kernel/mfgpt_32.c15
-rw-r--r--arch/x86/kernel/microcode.c12
-rw-r--r--arch/x86/kernel/mpparse_32.c39
-rw-r--r--arch/x86/kernel/mpparse_64.c28
-rw-r--r--arch/x86/kernel/nmi_32.c14
-rw-r--r--arch/x86/kernel/nmi_64.c99
-rw-r--r--arch/x86/kernel/numaq_32.c2
-rw-r--r--arch/x86/kernel/paravirt.c (renamed from arch/x86/kernel/paravirt_32.c)96
-rw-r--r--arch/x86/kernel/paravirt_patch_32.c49
-rw-r--r--arch/x86/kernel/paravirt_patch_64.c57
-rw-r--r--arch/x86/kernel/pci-calgary_64.c5
-rw-r--r--arch/x86/kernel/pci-dma_64.c3
-rw-r--r--arch/x86/kernel/pci-gart_64.c510
-rw-r--r--arch/x86/kernel/pci-swiotlb_64.c1
-rw-r--r--arch/x86/kernel/pmtimer_64.c4
-rw-r--r--arch/x86/kernel/process_32.c419
-rw-r--r--arch/x86/kernel/process_64.c342
-rw-r--r--arch/x86/kernel/ptrace.c1545
-rw-r--r--arch/x86/kernel/ptrace_32.c717
-rw-r--r--arch/x86/kernel/ptrace_64.c621
-rw-r--r--arch/x86/kernel/quirks.c2
-rw-r--r--arch/x86/kernel/reboot.c (renamed from arch/x86/kernel/reboot_32.c)284
-rw-r--r--arch/x86/kernel/reboot_64.c176
-rw-r--r--arch/x86/kernel/reboot_fixups_32.c14
-rw-r--r--arch/x86/kernel/rtc.c204
-rw-r--r--arch/x86/kernel/setup64.c59
-rw-r--r--arch/x86/kernel/setup_32.c285
-rw-r--r--arch/x86/kernel/setup_64.c555
-rw-r--r--arch/x86/kernel/signal_32.c225
-rw-r--r--arch/x86/kernel/signal_64.c133
-rw-r--r--arch/x86/kernel/smp_32.c15
-rw-r--r--arch/x86/kernel/smp_64.c91
-rw-r--r--arch/x86/kernel/smpboot_32.c61
-rw-r--r--arch/x86/kernel/smpboot_64.c79
-rw-r--r--arch/x86/kernel/smpcommon_32.c7
-rw-r--r--arch/x86/kernel/srat_32.c8
-rw-r--r--arch/x86/kernel/stacktrace.c12
-rw-r--r--arch/x86/kernel/step.c203
-rw-r--r--arch/x86/kernel/suspend_64.c30
-rw-r--r--arch/x86/kernel/suspend_asm_64.S32
-rw-r--r--arch/x86/kernel/sys_x86_64.c98
-rw-r--r--arch/x86/kernel/test_nx.c176
-rw-r--r--arch/x86/kernel/test_rodata.c86
-rw-r--r--arch/x86/kernel/time_32.c114
-rw-r--r--arch/x86/kernel/time_64.c187
-rw-r--r--arch/x86/kernel/tls.c213
-rw-r--r--arch/x86/kernel/tls.h21
-rw-r--r--arch/x86/kernel/topology.c23
-rw-r--r--arch/x86/kernel/traps_32.c341
-rw-r--r--arch/x86/kernel/traps_64.c367
-rw-r--r--arch/x86/kernel/tsc_32.c62
-rw-r--r--arch/x86/kernel/tsc_64.c100
-rw-r--r--arch/x86/kernel/tsc_sync.c30
-rw-r--r--arch/x86/kernel/vm86_32.c115
-rw-r--r--arch/x86/kernel/vmi_32.c126
-rw-r--r--arch/x86/kernel/vmiclock_32.c3
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S8
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S23
-rw-r--r--arch/x86/kernel/vsmp_64.c11
-rw-r--r--arch/x86/kernel/vsyscall_32.S15
-rw-r--r--arch/x86/kernel/vsyscall_32.lds.S67
-rw-r--r--arch/x86/kernel/vsyscall_64.c11
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c13
-rw-r--r--arch/x86/kvm/Kconfig (renamed from drivers/kvm/Kconfig)7
-rw-r--r--arch/x86/kvm/Makefile (renamed from drivers/kvm/Makefile)6
-rw-r--r--arch/x86/kvm/i8259.c (renamed from drivers/kvm/i8259.c)8
-rw-r--r--arch/x86/kvm/irq.c (renamed from drivers/kvm/irq.c)22
-rw-r--r--arch/x86/kvm/irq.h88
-rw-r--r--arch/x86/kvm/kvm_svm.h (renamed from drivers/kvm/kvm_svm.h)2
-rw-r--r--arch/x86/kvm/lapic.c (renamed from drivers/kvm/lapic.c)216
-rw-r--r--arch/x86/kvm/lapic.h50
-rw-r--r--arch/x86/kvm/mmu.c1885
-rw-r--r--arch/x86/kvm/mmu.h44
-rw-r--r--arch/x86/kvm/paging_tmpl.h484
-rw-r--r--arch/x86/kvm/segment_descriptor.h (renamed from drivers/kvm/segment_descriptor.h)12
-rw-r--r--arch/x86/kvm/svm.c (renamed from drivers/kvm/svm.c)355
-rw-r--r--arch/x86/kvm/svm.h (renamed from drivers/kvm/svm.h)3
-rw-r--r--arch/x86/kvm/vmx.c (renamed from drivers/kvm/vmx.c)1085
-rw-r--r--arch/x86/kvm/vmx.h (renamed from drivers/kvm/vmx.h)26
-rw-r--r--arch/x86/kvm/x86.c (renamed from drivers/kvm/kvm_main.c)4243
-rw-r--r--arch/x86/kvm/x86_emulate.c1912
-rw-r--r--arch/x86/lguest/Kconfig1
-rw-r--r--arch/x86/lguest/boot.c51
-rw-r--r--arch/x86/lib/Makefile26
-rw-r--r--arch/x86/lib/Makefile_3211
-rw-r--r--arch/x86/lib/Makefile_6413
-rw-r--r--arch/x86/lib/memcpy_32.c4
-rw-r--r--arch/x86/lib/memmove_64.c4
-rw-r--r--arch/x86/lib/semaphore_32.S22
-rw-r--r--arch/x86/lib/thunk_64.S2
-rw-r--r--arch/x86/mach-rdc321x/Makefile5
-rw-r--r--arch/x86/mach-rdc321x/gpio.c91
-rw-r--r--arch/x86/mach-rdc321x/platform.c68
-rw-r--r--arch/x86/mach-rdc321x/wdt.c275
-rw-r--r--arch/x86/mach-visws/mpparse.c16
-rw-r--r--arch/x86/mach-voyager/setup.c34
-rw-r--r--arch/x86/mach-voyager/voyager_basic.c132
-rw-r--r--arch/x86/mach-voyager/voyager_cat.c601
-rw-r--r--arch/x86/mach-voyager/voyager_smp.c690
-rw-r--r--arch/x86/mach-voyager/voyager_thread.c52
-rw-r--r--arch/x86/math-emu/errors.c882
-rw-r--r--arch/x86/math-emu/exception.h9
-rw-r--r--arch/x86/math-emu/fpu_arith.c150
-rw-r--r--arch/x86/math-emu/fpu_asm.h1
-rw-r--r--arch/x86/math-emu/fpu_aux.c211
-rw-r--r--arch/x86/math-emu/fpu_emu.h67
-rw-r--r--arch/x86/math-emu/fpu_entry.c1230
-rw-r--r--arch/x86/math-emu/fpu_etc.c185
-rw-r--r--arch/x86/math-emu/fpu_proto.h28
-rw-r--r--arch/x86/math-emu/fpu_tags.c94
-rw-r--r--arch/x86/math-emu/fpu_trig.c2884
-rw-r--r--arch/x86/math-emu/get_address.c650
-rw-r--r--arch/x86/math-emu/load_store.c448
-rw-r--r--arch/x86/math-emu/poly.h69
-rw-r--r--arch/x86/math-emu/poly_2xm1.c199
-rw-r--r--arch/x86/math-emu/poly_atan.c353
-rw-r--r--arch/x86/math-emu/poly_l2.c376
-rw-r--r--arch/x86/math-emu/poly_sin.c599
-rw-r--r--arch/x86/math-emu/poly_tan.c338
-rw-r--r--arch/x86/math-emu/reg_add_sub.c563
-rw-r--r--arch/x86/math-emu/reg_compare.c567
-rw-r--r--arch/x86/math-emu/reg_constant.c73
-rw-r--r--arch/x86/math-emu/reg_convert.c57
-rw-r--r--arch/x86/math-emu/reg_divide.c301
-rw-r--r--arch/x86/math-emu/reg_ld_str.c2147
-rw-r--r--arch/x86/math-emu/reg_mul.c163
-rw-r--r--arch/x86/math-emu/status_w.h8
-rw-r--r--arch/x86/mm/Makefile_323
-rw-r--r--arch/x86/mm/Makefile_643
-rw-r--r--arch/x86/mm/boot_ioremap_32.c100
-rw-r--r--arch/x86/mm/discontig_32.c110
-rw-r--r--arch/x86/mm/extable.c62
-rw-r--r--arch/x86/mm/extable_32.c35
-rw-r--r--arch/x86/mm/extable_64.c34
-rw-r--r--arch/x86/mm/fault.c986
-rw-r--r--arch/x86/mm/fault_32.c659
-rw-r--r--arch/x86/mm/fault_64.c623
-rw-r--r--arch/x86/mm/highmem_32.c47
-rw-r--r--arch/x86/mm/hugetlbpage.c3
-rw-r--r--arch/x86/mm/init_32.c425
-rw-r--r--arch/x86/mm/init_64.c418
-rw-r--r--arch/x86/mm/ioremap.c501
-rw-r--r--arch/x86/mm/ioremap_32.c274
-rw-r--r--arch/x86/mm/ioremap_64.c210
-rw-r--r--arch/x86/mm/k8topology_64.c173
-rw-r--r--arch/x86/mm/mmap.c (renamed from arch/x86/mm/mmap_32.c)86
-rw-r--r--arch/x86/mm/mmap_64.c29
-rw-r--r--arch/x86/mm/numa_64.c274
-rw-r--r--arch/x86/mm/pageattr-test.c224
-rw-r--r--arch/x86/mm/pageattr.c564
-rw-r--r--arch/x86/mm/pageattr_32.c278
-rw-r--r--arch/x86/mm/pageattr_64.c255
-rw-r--r--arch/x86/mm/pgtable_32.c145
-rw-r--r--arch/x86/mm/srat_64.c95
-rw-r--r--arch/x86/oprofile/backtrace.c12
-rw-r--r--arch/x86/oprofile/nmi_int.c212
-rw-r--r--arch/x86/pci/common.c17
-rw-r--r--arch/x86/pci/fixup.c30
-rw-r--r--arch/x86/pci/irq.c20
-rw-r--r--arch/x86/power/cpu.c18
-rw-r--r--arch/x86/vdso/.gitignore5
-rw-r--r--arch/x86/vdso/Makefile132
-rw-r--r--arch/x86/vdso/vclock_gettime.c1
-rw-r--r--arch/x86/vdso/vdso-layout.lds.S64
-rw-r--r--arch/x86/vdso/vdso-start.S2
-rw-r--r--arch/x86/vdso/vdso.lds.S94
-rw-r--r--arch/x86/vdso/vdso32-setup.c (renamed from arch/x86/kernel/sysenter_32.c)164
-rw-r--r--arch/x86/vdso/vdso32.S19
-rw-r--r--arch/x86/vdso/vdso32/.gitignore1
-rw-r--r--arch/x86/vdso/vdso32/int80.S (renamed from arch/x86/kernel/vsyscall-int80_32.S)21
-rw-r--r--arch/x86/vdso/vdso32/note.S (renamed from arch/x86/kernel/vsyscall-note_32.S)5
-rw-r--r--arch/x86/vdso/vdso32/sigreturn.S (renamed from arch/x86/kernel/vsyscall-sigreturn_32.S)87
-rw-r--r--arch/x86/vdso/vdso32/syscall.S (renamed from arch/x86/ia32/vsyscall-syscall.S)22
-rw-r--r--arch/x86/vdso/vdso32/sysenter.S (renamed from arch/x86/kernel/vsyscall-sysenter_32.S)42
-rw-r--r--arch/x86/vdso/vdso32/vdso32.lds.S37
-rw-r--r--arch/x86/vdso/vgetcpu.c4
-rw-r--r--arch/x86/vdso/vma.c18
-rw-r--r--arch/x86/vdso/voffset.h1
-rw-r--r--arch/x86/xen/Kconfig1
-rw-r--r--arch/x86/xen/enlighten.c102
-rw-r--r--arch/x86/xen/events.c4
-rw-r--r--arch/x86/xen/mmu.c23
-rw-r--r--arch/x86/xen/setup.c9
-rw-r--r--arch/x86/xen/smp.c8
-rw-r--r--arch/x86/xen/time.c2
-rw-r--r--arch/x86/xen/xen-head.S6
-rw-r--r--block/bsg.c1
-rw-r--r--drivers/Kconfig2
-rw-r--r--drivers/Makefile3
-rw-r--r--drivers/acpi/processor_idle.c34
-rw-r--r--drivers/base/bus.c41
-rw-r--r--drivers/base/class.c4
-rw-r--r--drivers/base/core.c30
-rw-r--r--drivers/char/agp/ali-agp.c2
-rw-r--r--drivers/char/agp/backend.c3
-rw-r--r--drivers/char/agp/generic.c3
-rw-r--r--drivers/char/agp/i460-agp.c2
-rw-r--r--drivers/char/agp/intel-agp.c11
-rw-r--r--drivers/char/hpet.c126
-rw-r--r--drivers/char/rtc.c253
-rw-r--r--drivers/cpufreq/cpufreq.c2
-rw-r--r--drivers/firmware/dmi_scan.c26
-rw-r--r--drivers/ieee1394/Makefile1
-rw-r--r--drivers/ieee1394/init_ohci1394_dma.c285
-rw-r--r--drivers/infiniband/ulp/srp/ib_srp.c1
-rw-r--r--drivers/input/mouse/pc110pad.c7
-rw-r--r--drivers/kvm/irq.h165
-rw-r--r--drivers/kvm/mmu.c1498
-rw-r--r--drivers/kvm/paging_tmpl.h511
-rw-r--r--drivers/kvm/x86_emulate.c1662
-rw-r--r--drivers/lguest/x86/core.c2
-rw-r--r--drivers/net/Kconfig5
-rw-r--r--drivers/net/e1000/e1000_main.c60
-rw-r--r--drivers/pnp/pnpbios/bioscalls.c5
-rw-r--r--drivers/s390/scsi/zfcp_fsf.c4
-rw-r--r--drivers/scsi/3w-9xxx.c1
-rw-r--r--drivers/scsi/3w-xxxx.c1
-rw-r--r--drivers/scsi/BusLogic.c1
-rw-r--r--drivers/scsi/Kconfig2
-rw-r--r--drivers/scsi/NCR53c406a.c1
-rw-r--r--drivers/scsi/a100u2w.c1
-rw-r--r--drivers/scsi/aacraid/commctrl.c29
-rw-r--r--drivers/scsi/aacraid/linit.c1
-rw-r--r--drivers/scsi/aha1740.c1
-rw-r--r--drivers/scsi/aic7xxx/aic79xx.h5
-rw-r--r--drivers/scsi/aic7xxx/aic79xx_core.c2
-rw-r--r--drivers/scsi/aic7xxx/aic79xx_osm.c3
-rw-r--r--drivers/scsi/aic7xxx/aic79xx_osm_pci.c33
-rw-r--r--drivers/scsi/aic7xxx/aic79xx_pci.c2
-rw-r--r--drivers/scsi/aic7xxx/aic7xxx.h4
-rw-r--r--drivers/scsi/aic7xxx/aic7xxx_core.c3
-rw-r--r--drivers/scsi/aic7xxx/aic7xxx_osm.c10
-rw-r--r--drivers/scsi/aic7xxx/aic7xxx_osm_pci.c33
-rw-r--r--drivers/scsi/aic7xxx/aic7xxx_pci.c2
-rw-r--r--drivers/scsi/aic7xxx_old.c1
-rw-r--r--drivers/scsi/arcmsr/arcmsr_hba.c1
-rw-r--r--drivers/scsi/dc395x.c1
-rw-r--r--drivers/scsi/dpt_i2o.c1
-rw-r--r--drivers/scsi/eata.c1
-rw-r--r--drivers/scsi/hosts.c1
-rw-r--r--drivers/scsi/hptiop.c3
-rw-r--r--drivers/scsi/ibmmca.c1
-rw-r--r--drivers/scsi/ibmvscsi/ibmvscsi.c1
-rw-r--r--drivers/scsi/initio.c1
-rw-r--r--drivers/scsi/iscsi_tcp.c1
-rw-r--r--drivers/scsi/libsrp.c4
-rw-r--r--drivers/scsi/lpfc/lpfc_scsi.c2
-rw-r--r--drivers/scsi/mac53c94.c1
-rw-r--r--drivers/scsi/megaraid.c1
-rw-r--r--drivers/scsi/megaraid/megaraid_mbox.c1
-rw-r--r--drivers/scsi/megaraid/megaraid_sas.c1
-rw-r--r--drivers/scsi/mesh.c1
-rw-r--r--drivers/scsi/ncr53c8xx.c2
-rw-r--r--drivers/scsi/nsp32.c1
-rw-r--r--drivers/scsi/pcmcia/sym53c500_cs.c1
-rw-r--r--drivers/scsi/qla1280.c1
-rw-r--r--drivers/scsi/qla2xxx/qla_os.c2
-rw-r--r--drivers/scsi/qla4xxx/ql4_os.c1
-rw-r--r--drivers/scsi/qlogicfas.c1
-rw-r--r--drivers/scsi/scsi.c2
-rw-r--r--drivers/scsi/scsi_debug.c174
-rw-r--r--drivers/scsi/scsi_error.c33
-rw-r--r--drivers/scsi/scsi_lib.c274
-rw-r--r--drivers/scsi/scsi_tgt_lib.c28
-rw-r--r--drivers/scsi/sd.c4
-rw-r--r--drivers/scsi/sgiwd93.c64
-rw-r--r--drivers/scsi/sr.c25
-rw-r--r--drivers/scsi/stex.c1
-rw-r--r--drivers/scsi/sym53c416.c1
-rw-r--r--drivers/scsi/sym53c8xx_2/sym_glue.c3
-rw-r--r--drivers/scsi/u14-34f.c1
-rw-r--r--drivers/scsi/ultrastor.c1
-rw-r--r--drivers/scsi/wd7000.c1
-rw-r--r--drivers/usb/storage/isd200.c8
-rw-r--r--drivers/video/vermilion/vermilion.c15
-rw-r--r--fs/Kconfig.binfmt4
-rw-r--r--fs/Makefile1
-rw-r--r--fs/aio.c2
-rw-r--r--fs/binfmt_elf.c677
-rw-r--r--fs/compat_binfmt_elf.c131
-rw-r--r--fs/dlm/dir.c76
-rw-r--r--fs/dlm/dlm_internal.h16
-rw-r--r--fs/dlm/lock.c249
-rw-r--r--fs/dlm/lock.h2
-rw-r--r--fs/dlm/lockspace.c16
-rw-r--r--fs/dlm/lowcomms.c15
-rw-r--r--fs/dlm/main.c10
-rw-r--r--fs/dlm/member.c4
-rw-r--r--fs/dlm/member.h3
-rw-r--r--fs/dlm/memory.c32
-rw-r--r--fs/dlm/memory.h16
-rw-r--r--fs/dlm/midcomms.c15
-rw-r--r--fs/dlm/rcom.c25
-rw-r--r--fs/dlm/recover.c27
-rw-r--r--fs/dlm/recoverd.c11
-rw-r--r--fs/dlm/user.c29
-rw-r--r--fs/dlm/util.c82
-rw-r--r--fs/jbd/checkpoint.c3
-rw-r--r--fs/jbd/commit.c2
-rw-r--r--fs/jbd2/checkpoint.c3
-rw-r--r--fs/jbd2/commit.c2
-rw-r--r--include/acpi/reboot.h9
-rw-r--r--include/asm-alpha/agp.h1
-rw-r--r--include/asm-generic/bug.h17
-rw-r--r--include/asm-generic/percpu.h97
-rw-r--r--include/asm-generic/tlb.h1
-rw-r--r--include/asm-generic/vmlinux.lds.h1
-rw-r--r--include/asm-ia64/acpi.h2
-rw-r--r--include/asm-ia64/agp.h1
-rw-r--r--include/asm-ia64/percpu.h24
-rw-r--r--include/asm-m32r/signal.h2
-rw-r--r--include/asm-parisc/agp.h1
-rw-r--r--include/asm-powerpc/agp.h1
-rw-r--r--include/asm-powerpc/percpu.h17
-rw-r--r--include/asm-powerpc/ptrace.h7
-rw-r--r--include/asm-s390/percpu.h20
-rw-r--r--include/asm-sparc64/agp.h1
-rw-r--r--include/asm-sparc64/percpu.h16
-rw-r--r--include/asm-um/asm.h6
-rw-r--r--include/asm-um/linkage.h1
-rw-r--r--include/asm-um/nops.h6
-rw-r--r--include/asm-x86/Kbuild5
-rw-r--r--include/asm-x86/acpi.h151
-rw-r--r--include/asm-x86/acpi_32.h143
-rw-r--r--include/asm-x86/acpi_64.h153
-rw-r--r--include/asm-x86/agp.h9
-rw-r--r--include/asm-x86/alternative.h162
-rw-r--r--include/asm-x86/alternative_32.h154
-rw-r--r--include/asm-x86/alternative_64.h159
-rw-r--r--include/asm-x86/apic.h141
-rw-r--r--include/asm-x86/apic_32.h127
-rw-r--r--include/asm-x86/apic_64.h102
-rw-r--r--include/asm-x86/apicdef.h412
-rw-r--r--include/asm-x86/apicdef_32.h375
-rw-r--r--include/asm-x86/apicdef_64.h392
-rw-r--r--include/asm-x86/arch_hooks.h5
-rw-r--r--include/asm-x86/asm.h32
-rw-r--r--include/asm-x86/bitops.h316
-rw-r--r--include/asm-x86/bitops_32.h324
-rw-r--r--include/asm-x86/bitops_64.h297
-rw-r--r--include/asm-x86/bootparam.h5
-rw-r--r--include/asm-x86/bug.h3
-rw-r--r--include/asm-x86/bugs.h3
-rw-r--r--include/asm-x86/cacheflush.h35
-rw-r--r--include/asm-x86/calling.h194
-rw-r--r--include/asm-x86/checksum_64.h2
-rw-r--r--include/asm-x86/cmpxchg_32.h122
-rw-r--r--include/asm-x86/compat.h2
-rw-r--r--include/asm-x86/cpu.h2
-rw-r--r--include/asm-x86/cpufeature.h208
-rw-r--r--include/asm-x86/cpufeature_32.h176
-rw-r--r--include/asm-x86/cpufeature_64.h30
-rw-r--r--include/asm-x86/desc.h380
-rw-r--r--include/asm-x86/desc_32.h244
-rw-r--r--include/asm-x86/desc_64.h203
-rw-r--r--include/asm-x86/desc_defs.h47
-rw-r--r--include/asm-x86/dma.h318
-rw-r--r--include/asm-x86/dma_32.h297
-rw-r--r--include/asm-x86/dma_64.h304
-rw-r--r--include/asm-x86/dmi.h10
-rw-r--r--include/asm-x86/ds.h72
-rw-r--r--include/asm-x86/e820.h6
-rw-r--r--include/asm-x86/e820_32.h9
-rw-r--r--include/asm-x86/e820_64.h16
-rw-r--r--include/asm-x86/efi.h97
-rw-r--r--include/asm-x86/elf.h212
-rw-r--r--include/asm-x86/emergency-restart.h12
-rw-r--r--include/asm-x86/fixmap_32.h24
-rw-r--r--include/asm-x86/fixmap_64.h6
-rw-r--r--include/asm-x86/fpu32.h10
-rw-r--r--include/asm-x86/futex.h138
-rw-r--r--include/asm-x86/futex_32.h135
-rw-r--r--include/asm-x86/futex_64.h125
-rw-r--r--include/asm-x86/gart.h5
-rw-r--r--include/asm-x86/geode.h12
-rw-r--r--include/asm-x86/gpio.h6
-rw-r--r--include/asm-x86/hpet.h8
-rw-r--r--include/asm-x86/hw_irq_32.h16
-rw-r--r--include/asm-x86/hw_irq_64.h2
-rw-r--r--include/asm-x86/i387.h361
-rw-r--r--include/asm-x86/i387_32.h151
-rw-r--r--include/asm-x86/i387_64.h214
-rw-r--r--include/asm-x86/i8253.h3
-rw-r--r--include/asm-x86/i8259.h20
-rw-r--r--include/asm-x86/ia32.h6
-rw-r--r--include/asm-x86/ia32_unistd.h2
-rw-r--r--include/asm-x86/ide.h2
-rw-r--r--include/asm-x86/idle.h1
-rw-r--r--include/asm-x86/io_32.h35
-rw-r--r--include/asm-x86/io_64.h57
-rw-r--r--include/asm-x86/io_apic.h158
-rw-r--r--include/asm-x86/io_apic_32.h155
-rw-r--r--include/asm-x86/io_apic_64.h138
-rw-r--r--include/asm-x86/irqflags.h246
-rw-r--r--include/asm-x86/irqflags_32.h197
-rw-r--r--include/asm-x86/irqflags_64.h176
-rw-r--r--include/asm-x86/k8.h1
-rw-r--r--include/asm-x86/kdebug.h11
-rw-r--r--include/asm-x86/kexec.h169
-rw-r--r--include/asm-x86/kexec_32.h99
-rw-r--r--include/asm-x86/kexec_64.h94
-rw-r--r--include/asm-x86/kprobes.h103
-rw-r--r--include/asm-x86/kprobes_32.h94
-rw-r--r--include/asm-x86/kprobes_64.h90
-rw-r--r--include/asm-x86/kvm.h191
-rw-r--r--include/asm-x86/kvm_host.h (renamed from drivers/kvm/kvm.h)537
-rw-r--r--include/asm-x86/kvm_para.h105
-rw-r--r--include/asm-x86/kvm_x86_emulate.h (renamed from drivers/kvm/x86_emulate.h)69
-rw-r--r--include/asm-x86/lguest.h14
-rw-r--r--include/asm-x86/linkage.h26
-rw-r--r--include/asm-x86/linkage_32.h15
-rw-r--r--include/asm-x86/linkage_64.h6
-rw-r--r--include/asm-x86/local.h243
-rw-r--r--include/asm-x86/local_32.h233
-rw-r--r--include/asm-x86/local_64.h222
-rw-r--r--include/asm-x86/mach-bigsmp/mach_apic.h12
-rw-r--r--include/asm-x86/mach-default/apm.h2
-rw-r--r--include/asm-x86/mach-default/io_ports.h25
-rw-r--r--include/asm-x86/mach-default/mach_apic.h18
-rw-r--r--include/asm-x86/mach-default/mach_time.h111
-rw-r--r--include/asm-x86/mach-default/mach_timer.h2
-rw-r--r--include/asm-x86/mach-default/mach_traps.h2
-rw-r--r--include/asm-x86/mach-es7000/mach_apic.h10
-rw-r--r--include/asm-x86/mach-generic/gpio.h15
-rw-r--r--include/asm-x86/mach-numaq/mach_apic.h10
-rw-r--r--include/asm-x86/mach-rdc321x/gpio.h56
-rw-r--r--include/asm-x86/mach-rdc321x/rdc321x_defs.h6
-rw-r--r--include/asm-x86/mach-summit/mach_apic.h18
-rw-r--r--include/asm-x86/math_emu.h5
-rw-r--r--include/asm-x86/mc146818rtc.h101
-rw-r--r--include/asm-x86/mc146818rtc_32.h97
-rw-r--r--include/asm-x86/mc146818rtc_64.h29
-rw-r--r--include/asm-x86/mce.h18
-rw-r--r--include/asm-x86/mmsegment.h8
-rw-r--r--include/asm-x86/mmu.h8
-rw-r--r--include/asm-x86/mmu_context_32.h2
-rw-r--r--include/asm-x86/mmu_context_64.h13
-rw-r--r--include/asm-x86/mmzone_32.h3
-rw-r--r--include/asm-x86/mmzone_64.h12
-rw-r--r--include/asm-x86/module.h81
-rw-r--r--include/asm-x86/module_32.h75
-rw-r--r--include/asm-x86/module_64.h10
-rw-r--r--include/asm-x86/mpspec.h116
-rw-r--r--include/asm-x86/mpspec_32.h81
-rw-r--r--include/asm-x86/mpspec_64.h233
-rw-r--r--include/asm-x86/mpspec_def.h87
-rw-r--r--include/asm-x86/msr-index.h15
-rw-r--r--include/asm-x86/msr.h299
-rw-r--r--include/asm-x86/mtrr.h14
-rw-r--r--include/asm-x86/mutex_32.h7
-rw-r--r--include/asm-x86/nmi_32.h3
-rw-r--r--include/asm-x86/nmi_64.h5
-rw-r--r--include/asm-x86/nops.h90
-rw-r--r--include/asm-x86/numa_32.h14
-rw-r--r--include/asm-x86/numa_64.h12
-rw-r--r--include/asm-x86/page.h190
-rw-r--r--include/asm-x86/page_32.h247
-rw-r--r--include/asm-x86/page_64.h110
-rw-r--r--include/asm-x86/paravirt.h615
-rw-r--r--include/asm-x86/pci.h17
-rw-r--r--include/asm-x86/pci_64.h1
-rw-r--r--include/asm-x86/pda.h73
-rw-r--r--include/asm-x86/percpu.h145
-rw-r--r--include/asm-x86/percpu_32.h154
-rw-r--r--include/asm-x86/percpu_64.h68
-rw-r--r--include/asm-x86/pgalloc_32.h85
-rw-r--r--include/asm-x86/pgtable-2level.h45
-rw-r--r--include/asm-x86/pgtable-3level.h88
-rw-r--r--include/asm-x86/pgtable.h359
-rw-r--r--include/asm-x86/pgtable_32.h291
-rw-r--r--include/asm-x86/pgtable_64.h260
-rw-r--r--include/asm-x86/processor.h841
-rw-r--r--include/asm-x86/processor_32.h786
-rw-r--r--include/asm-x86/processor_64.h452
-rw-r--r--include/asm-x86/proto.h69
-rw-r--r--include/asm-x86/ptrace-abi.h62
-rw-r--r--include/asm-x86/ptrace.h220
-rw-r--r--include/asm-x86/resume-trace.h23
-rw-r--r--include/asm-x86/resume-trace_32.h13
-rw-r--r--include/asm-x86/resume-trace_64.h13
-rw-r--r--include/asm-x86/rio.h4
-rw-r--r--include/asm-x86/rwlock.h1
-rw-r--r--include/asm-x86/rwsem.h14
-rw-r--r--include/asm-x86/scatterlist.h34
-rw-r--r--include/asm-x86/scatterlist_32.h28
-rw-r--r--include/asm-x86/scatterlist_64.h29
-rw-r--r--include/asm-x86/segment.h203
-rw-r--r--include/asm-x86/segment_32.h148
-rw-r--r--include/asm-x86/segment_64.h53
-rw-r--r--include/asm-x86/semaphore_32.h8
-rw-r--r--include/asm-x86/setup.h11
-rw-r--r--include/asm-x86/sigcontext.h42
-rw-r--r--include/asm-x86/sigcontext32.h22
-rw-r--r--include/asm-x86/signal.h11
-rw-r--r--include/asm-x86/smp_32.h119
-rw-r--r--include/asm-x86/smp_64.h137
-rw-r--r--include/asm-x86/sparsemem.h35
-rw-r--r--include/asm-x86/sparsemem_32.h31
-rw-r--r--include/asm-x86/sparsemem_64.h26
-rw-r--r--include/asm-x86/spinlock.h295
-rw-r--r--include/asm-x86/spinlock_32.h221
-rw-r--r--include/asm-x86/spinlock_64.h167
-rw-r--r--include/asm-x86/spinlock_types.h2
-rw-r--r--include/asm-x86/stacktrace.h5
-rw-r--r--include/asm-x86/suspend_32.h4
-rw-r--r--include/asm-x86/suspend_64.h11
-rw-r--r--include/asm-x86/system.h413
-rw-r--r--include/asm-x86/system_32.h320
-rw-r--r--include/asm-x86/system_64.h163
-rw-r--r--include/asm-x86/thread_info_32.h16
-rw-r--r--include/asm-x86/thread_info_64.h34
-rw-r--r--include/asm-x86/time.h26
-rw-r--r--include/asm-x86/timer.h23
-rw-r--r--include/asm-x86/timex.h2
-rw-r--r--include/asm-x86/tlbflush.h157
-rw-r--r--include/asm-x86/tlbflush_32.h168
-rw-r--r--include/asm-x86/tlbflush_64.h100
-rw-r--r--include/asm-x86/topology.h187
-rw-r--r--include/asm-x86/topology_32.h121
-rw-r--r--include/asm-x86/topology_64.h71
-rw-r--r--include/asm-x86/tsc.h40
-rw-r--r--include/asm-x86/uaccess_64.h2
-rw-r--r--include/asm-x86/unistd_32.h2
-rw-r--r--include/asm-x86/user_32.h24
-rw-r--r--include/asm-x86/user_64.h41
-rw-r--r--include/asm-x86/vdso.h28
-rw-r--r--include/asm-x86/vsyscall.h2
-rw-r--r--include/asm-x86/vsyscall32.h20
-rw-r--r--include/asm-x86/xor_32.h2
-rw-r--r--include/asm-x86/xor_64.h2
-rw-r--r--include/linux/Kbuild2
-rw-r--r--include/linux/acpi_pmtmr.h2
-rw-r--r--include/linux/audit.h2
-rw-r--r--include/linux/clocksource.h3
-rw-r--r--include/linux/compat.h15
-rw-r--r--include/linux/const.h5
-rw-r--r--include/linux/cpumask.h4
-rw-r--r--include/linux/device.h3
-rw-r--r--include/linux/elf.h1
-rw-r--r--include/linux/hpet.h5
-rw-r--r--include/linux/init_ohci1394_dma.h4
-rw-r--r--include/linux/ioport.h2
-rw-r--r--include/linux/kernel.h3
-rw-r--r--include/linux/kprobes.h10
-rw-r--r--include/linux/kvm.h203
-rw-r--r--include/linux/kvm_host.h299
-rw-r--r--include/linux/kvm_para.h82
-rw-r--r--include/linux/kvm_types.h54
-rw-r--r--include/linux/linkage.h8
-rw-r--r--include/linux/mm.h15
-rw-r--r--include/linux/pci_ids.h7
-rw-r--r--include/linux/percpu.h24
-rw-r--r--include/linux/ptrace.h75
-rw-r--r--include/linux/regset.h368
-rw-r--r--include/linux/sched.h21
-rw-r--r--include/linux/selinux.h45
-rw-r--r--include/linux/smp.h2
-rw-r--r--include/linux/spinlock.h6
-rw-r--r--include/linux/spinlock_types.h4
-rw-r--r--include/linux/spinlock_up.h2
-rw-r--r--include/linux/suspend.h3
-rw-r--r--include/linux/swap.h1
-rw-r--r--include/linux/thread_info.h10
-rw-r--r--include/linux/tick.h6
-rw-r--r--include/linux/timer.h6
-rw-r--r--include/net/netlabel.h99
-rw-r--r--include/scsi/scsi.h20
-rw-r--r--include/scsi/scsi_cmnd.h59
-rw-r--r--include/scsi/scsi_eh.h9
-rw-r--r--include/scsi/scsi_host.h17
-rw-r--r--include/xen/page.h6
-rw-r--r--init/main.c25
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/backtracetest.c48
-rw-r--r--kernel/fork.c1
-rw-r--r--kernel/irq/manage.c3
-rw-r--r--kernel/irq/proc.c21
-rw-r--r--kernel/irq/spurious.c5
-rw-r--r--kernel/kprobes.c2
-rw-r--r--kernel/module.c8
-rw-r--r--kernel/panic.c29
-rw-r--r--kernel/printk.c7
-rw-r--r--kernel/ptrace.c165
-rw-r--r--kernel/sched.c16
-rw-r--r--kernel/signal.c4
-rw-r--r--kernel/softirq.c11
-rw-r--r--kernel/spinlock.c3
-rw-r--r--kernel/sysctl.c9
-rw-r--r--kernel/test_kprobes.c216
-rw-r--r--kernel/time/clockevents.c13
-rw-r--r--kernel/time/clocksource.c31
-rw-r--r--kernel/time/tick-broadcast.c7
-rw-r--r--kernel/time/tick-internal.h2
-rw-r--r--kernel/time/tick-sched.c76
-rw-r--r--kernel/time/timekeeping.c28
-rw-r--r--kernel/time/timer_stats.c2
-rw-r--r--kernel/timer.c82
-rw-r--r--lib/Kconfig.debug52
-rw-r--r--lib/rwsem.c8
-rw-r--r--mm/memory.c39
-rw-r--r--mm/mmap.c3
-rw-r--r--net/ipv4/cipso_ipv4.c59
-rw-r--r--net/ipv6/netfilter/Kconfig2
-rw-r--r--net/netfilter/xt_SECMARK.c13
-rw-r--r--net/netlabel/netlabel_cipso_v4.c5
-rw-r--r--net/netlabel/netlabel_domainhash.c77
-rw-r--r--net/netlabel/netlabel_kapi.c21
-rw-r--r--net/netlabel/netlabel_mgmt.c63
-rw-r--r--net/netlabel/netlabel_mgmt.h7
-rw-r--r--net/netlabel/netlabel_unlabeled.c1565
-rw-r--r--net/netlabel/netlabel_unlabeled.h145
-rw-r--r--net/sunrpc/svc.c1
-rw-r--r--security/Kconfig1
-rw-r--r--security/selinux/Kconfig2
-rw-r--r--security/selinux/Makefile9
-rw-r--r--security/selinux/avc.c15
-rw-r--r--security/selinux/exports.c20
-rw-r--r--security/selinux/hooks.c667
-rw-r--r--security/selinux/include/av_perm_to_string.h9
-rw-r--r--security/selinux/include/av_permissions.h9
-rw-r--r--security/selinux/include/avc.h2
-rw-r--r--security/selinux/include/class_to_string.h7
-rw-r--r--security/selinux/include/flask.h1
-rw-r--r--security/selinux/include/netif.h4
-rw-r--r--security/selinux/include/netlabel.h11
-rw-r--r--security/selinux/include/netnode.h32
-rw-r--r--security/selinux/include/objsec.h16
-rw-r--r--security/selinux/include/security.h24
-rw-r--r--security/selinux/include/xfrm.h12
-rw-r--r--security/selinux/netif.c263
-rw-r--r--security/selinux/netlabel.c75
-rw-r--r--security/selinux/netnode.c354
-rw-r--r--security/selinux/selinuxfs.c89
-rw-r--r--security/selinux/ss/mls.c10
-rw-r--r--security/selinux/ss/policydb.c18
-rw-r--r--security/selinux/ss/policydb.h2
-rw-r--r--security/selinux/ss/services.c291
-rw-r--r--security/selinux/xfrm.c18
-rw-r--r--sound/pci/intel8x0.c8
-rw-r--r--virt/kvm/ioapic.c (renamed from drivers/kvm/ioapic.c)99
-rw-r--r--virt/kvm/ioapic.h95
-rw-r--r--virt/kvm/iodev.h63
-rw-r--r--virt/kvm/kvm_main.c1400
801 files changed, 50404 insertions, 45797 deletions
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index aa38cc5692a0..77436d735013 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -419,7 +419,13 @@ X!Edrivers/pnp/system.c
419 419
420 <chapter id="blkdev"> 420 <chapter id="blkdev">
421 <title>Block Devices</title> 421 <title>Block Devices</title>
422!Eblock/ll_rw_blk.c 422!Eblock/blk-core.c
423!Eblock/blk-map.c
424!Iblock/blk-sysfs.c
425!Eblock/blk-settings.c
426!Eblock/blk-exec.c
427!Eblock/blk-barrier.c
428!Eblock/blk-tag.c
423 </chapter> 429 </chapter>
424 430
425 <chapter id="chrdev"> 431 <chapter id="chrdev">
diff --git a/Documentation/debugging-via-ohci1394.txt b/Documentation/debugging-via-ohci1394.txt
new file mode 100644
index 000000000000..de4804e8b396
--- /dev/null
+++ b/Documentation/debugging-via-ohci1394.txt
@@ -0,0 +1,179 @@
1
2 Using physical DMA provided by OHCI-1394 FireWire controllers for debugging
3 ---------------------------------------------------------------------------
4
5Introduction
6------------
7
8Basically all FireWire controllers which are in use today are compliant
9to the OHCI-1394 specification which defines the controller to be a PCI
10bus master which uses DMA to offload data transfers from the CPU and has
11a "Physical Response Unit" which executes specific requests by employing
12PCI-Bus master DMA after applying filters defined by the OHCI-1394 driver.
13
14Once properly configured, remote machines can send these requests to
15ask the OHCI-1394 controller to perform read and write requests on
16physical system memory and, for read requests, send the result of
17the physical memory read back to the requester.
18
19With that, it is possible to debug issues by reading interesting memory
20locations such as the printk buffer or the process table.
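As a minimal sketch of what such a physical read looks like from the debug host (assuming libraw1394 is installed and a raw1394-capable stack is running; the PHY id and physical address below are placeholders supplied on the command line), a small program along these lines can dump a few quadlets of the target's memory:

/* Hypothetical sketch: read 64 bytes of the target's physical memory over
 * FireWire physical DMA.  Build with: gcc -o fwread fwread.c -lraw1394 */
#include <stdio.h>
#include <stdlib.h>
#include <libraw1394/raw1394.h>

int main(int argc, char **argv)
{
	raw1394handle_t h;
	nodeid_t target;
	nodeaddr_t addr;
	quadlet_t buf[16];
	int i;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <phy-id> <physaddr-hex>\n", argv[0]);
		return 1;
	}

	h = raw1394_new_handle();
	if (!h || raw1394_set_port(h, 0) < 0) {
		perror("raw1394");
		return 1;
	}

	/* Node IDs on the local bus are 0xffc0 + PHY id, as firescope shows. */
	target = 0xffc0 + atoi(argv[1]);
	addr = strtoull(argv[2], NULL, 16);

	/* When the physical response unit accepts the request, this reads the
	 * target's physical memory at 'addr' without involving its CPU. */
	if (raw1394_read(h, target, addr, sizeof(buf), buf) < 0) {
		perror("raw1394_read");
		return 1;
	}

	for (i = 0; i < 16; i++)
		printf("%08x%c", buf[i], (i % 4 == 3) ? '\n' : ' ');

	raw1394_destroy_handle(h);
	return 0;
}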
21
22Retrieving a full system memory dump is also possible over FireWire,
23using data transfer rates on the order of 10MB/s or more.
24
25Memory access is currently limited to the low 4 GB of physical address
26space, which can be a problem on IA64 machines where memory is located
27mostly above that limit, but it is rarely a problem on more common
28hardware such as x86, x86-64 and PowerPC based systems.
29
30Together with an early initialization of the OHCI-1394 controller for debugging,
31this facility proved most useful for examining long debug logs in the printk
32buffer in order to debug early boot problems in areas like ACPI, where the system
33fails to boot and other means of debugging (serial port) are either not
34available (notebooks) or too slow for extensive debug information (like ACPI).
35
36Drivers
37-------
38
39The OHCI-1394 drivers in drivers/firewire and drivers/ieee1394 initialize
40the OHCI-1394 controllers to a working state and can be used to enable
41physical DMA. By default you only have to load the driver, and physical
42DMA access will be granted to all remote nodes, but it can be turned off
43when using the ohci1394 driver.
44
45Because these drivers depend on the PCI enumeration to be completed, an
46initialization routine which runs very early (long before console_init(),
47which makes the printk buffer appear on the console, can be called) was written.
48
49To activate it, enable CONFIG_PROVIDE_OHCI1394_DMA_INIT (Kernel hacking menu:
50Provide code for enabling DMA over FireWire early on boot) and pass the
51parameter "ohci1394_dma=early" to the recompiled kernel on boot.
52
53Tools
54-----
55
56firescope - Originally developed by Benjamin Herrenschmidt; Andi Kleen ported
57it from PowerPC to x86 and x86_64 and added functionality. firescope can now
58be used to view the printk buffer of a remote machine, even with live update.
59
60Bernhard Kaindl enhanced firescope to support accessing 64-bit machines
61from 32-bit firescope and vice versa:
62- ftp://ftp.suse.de/private/bk/firewire/tools/firescope-0.2.2.tar.bz2
63
64and he implemented fast system dump (alpha version - read README.txt):
65- ftp://ftp.suse.de/private/bk/firewire/tools/firedump-0.1.tar.bz2
66
67There is also a gdb proxy for FireWire which allows using gdb to access
68data which can be referenced from symbols found by gdb in vmlinux:
69- ftp://ftp.suse.de/private/bk/firewire/tools/fireproxy-0.33.tar.bz2
70
71The latest version of this gdb proxy (fireproxy-0.34) can communicate (not
72yet stable) with kgdb over a memory-based communication module (kgdbom).
73
74Getting Started
75---------------
76
77The OHCI-1394 specification requires that the OHCI-1394 controller
78disable all physical DMA on each bus reset.
79
80This means that if you want to debug an issue in a system state where
81interrupts are disabled and where no polling of the OHCI-1394 controller
82for bus resets takes place, you have to establish any FireWire cable
83connections and fully initialize all FireWire hardware __before__ the
84system enters such a state.
85
86Step-by-step instructions for using firescope with early OHCI initialization:
87
881) Verify that your hardware is supported:
89
90 Load the ohci1394 or the fw-ohci module and check your kernel logs.
91 You should see a line similar to
92
93 ohci1394: fw-host0: OHCI-1394 1.1 (PCI): IRQ=[18] MMIO=[fe9ff800-fe9fffff]
94 ... Max Packet=[2048] IR/IT contexts=[4/8]
95
96 when loading the driver. If you have no supported controller, many PCI,
97 CardBus and even some Express cards which are fully compliant with the
98 OHCI-1394 specification are available. If a card requires no driver for
99 Windows operating systems, it is most likely OHCI-1394 compliant. Only
100 specialized shops have cards which are not compliant; they are based on TI
101 PCILynx chips and require drivers for Windows operating systems.
102
1032) Establish a working FireWire cable connection:
104
105 Any FireWire cable, as long as it provides an electrically and mechanically
106 stable connection and has matching connectors (there are small 4-pin and
107 large 6-pin FireWire ports), will do.
108
109 If a driver is running on both machines you should see a line like
110
111 ieee1394: Node added: ID:BUS[0-01:1023] GUID[0090270001b84bba]
112
113 on both machines in the kernel log when the cable is plugged in
114 and connects the two machines.
115
1163) Test physical DMA using firescope:
117
118 On the debug host,
119 - load the raw1394 module,
120 - make sure that /dev/raw1394 is accessible,
121 then start firescope:
122
123 $ firescope
124 Port 0 (ohci1394) opened, 2 nodes detected
125
126 FireScope
127 ---------
128 Target : <unspecified>
129 Gen : 1
130 [Ctrl-T] choose target
131 [Ctrl-H] this menu
132 [Ctrl-Q] quit
133
134 ------> Press Ctrl-T now, the output should be similar to:
135
136 2 nodes available, local node is: 0
137 0: ffc0, uuid: 00000000 00000000 [LOCAL]
138 1: ffc1, uuid: 00279000 ba4bb801
139
140 Besides the [LOCAL] node, it must show another node without an error message.
141
1424) Prepare for debugging with early OHCI-1394 initialization:
143
144 4.1) Kernel compilation and installation on debug target
145
146 Compile the kernel to be debugged with CONFIG_PROVIDE_OHCI1394_DMA_INIT
147 (Kernel hacking: Provide code for enabling DMA over FireWire early on boot)
148 enabled and install it on the machine to be debugged (debug target).
149
150 4.2) Transfer the System.map of the debugged kernel to the debug host
151
152 Copy the System.map of the kernel to be debugged to the debug host (the host
153 which is connected to the debugged machine over the FireWire cable).
154
1555) Retrieving the printk buffer contents:
156
157 With the FireWire cable connected and the OHCI-1394 driver on the debugging
158 host loaded, reboot the debugged machine, booting the kernel which has
159 CONFIG_PROVIDE_OHCI1394_DMA_INIT enabled, with the option ohci1394_dma=early.
160
161 Then, on the debugging host, run firescope, for example by using -A:
162
163 firescope -A System.map-of-debug-target-kernel
164
165 Note: -A automatically attaches to the first non-local node. It only works
166 reliably if only two machines are connected via FireWire.
167
168 After having attached to the debug target, press Ctrl-D to view the
169 complete printk buffer or Ctrl-U to enter auto update mode and get an
170 updated live view of recent kernel messages logged on the debug target.
171
172 Call "firescope -h" to get more information on firescope's options.
173
174Notes
175-----
176Documentation and specifications: ftp://ftp.suse.de/private/bk/firewire/docs
177
178FireWire is a trademark of Apple Inc. - for more information please refer to:
179http://en.wikipedia.org/wiki/FireWire
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 880f882160e2..5d171b7b8393 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -416,8 +416,21 @@ and is between 256 and 4096 characters. It is defined in the file
416 [SPARC64] tick 416 [SPARC64] tick
417 [X86-64] hpet,tsc 417 [X86-64] hpet,tsc
418 418
419 code_bytes [IA32] How many bytes of object code to print in an 419 clearcpuid=BITNUM [X86]
420 oops report. 420 Disable CPUID feature X for the kernel. See
421 include/asm-x86/cpufeature.h for the valid bit numbers.
422 Note the Linux specific bits are not necessarily
423 stable across kernel versions, but the vendor specific
424 ones should be.
425 Also note that user programs calling CPUID directly
426 or using the feature without checking anything
427 will still see it. This just prevents it from
428 being used by the kernel or shown in /proc/cpuinfo.
429 Also note the kernel might malfunction if you disable
430 some critical bits.
431
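To make the last point concrete, a tiny userspace check along the following lines (a hedged sketch; the SSE2 bit is used purely as an example) will still report the feature even after clearcpuid= has hidden it from the kernel and /proc/cpuinfo:

/* Illustration: query CPUID directly, bypassing the kernel's view of the
 * feature bits.  Build with: gcc -o cpuidcheck cpuidcheck.c */
#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
		fprintf(stderr, "CPUID leaf 1 not available\n");
		return 1;
	}
	/* EDX bit 26 of leaf 1 is the architectural SSE2 feature flag. */
	printf("SSE2 per CPUID: %s\n", (edx & (1u << 26)) ? "yes" : "no");
	return 0;
}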
432 code_bytes [IA32/X86_64] How many bytes of object code to print
433 in an oops report.
421 Range: 0 - 8192 434 Range: 0 - 8192
422 Default: 64 435 Default: 64
423 436
@@ -570,6 +583,12 @@ and is between 256 and 4096 characters. It is defined in the file
570 See drivers/char/README.epca and 583 See drivers/char/README.epca and
571 Documentation/digiepca.txt. 584 Documentation/digiepca.txt.
572 585
586 disable_mtrr_trim [X86, Intel and AMD only]
587 By default the kernel will trim any uncacheable
588 memory out of your available memory pool based on
589 MTRR settings. This parameter disables that behavior,
590 possibly causing your machine to run very slowly.
591
573 dmasound= [HW,OSS] Sound subsystem buffers 592 dmasound= [HW,OSS] Sound subsystem buffers
574 593
575 dscc4.setup= [NET] 594 dscc4.setup= [NET]
@@ -660,6 +679,10 @@ and is between 256 and 4096 characters. It is defined in the file
660 679
661 gamma= [HW,DRM] 680 gamma= [HW,DRM]
662 681
682 gart_fix_e820= [X86_64] disable the fix e820 for K8 GART
683 Format: off | on
684 default: on
685
663 gdth= [HW,SCSI] 686 gdth= [HW,SCSI]
664 See header of drivers/scsi/gdth.c. 687 See header of drivers/scsi/gdth.c.
665 688
@@ -794,6 +817,16 @@ and is between 256 and 4096 characters. It is defined in the file
794 for translation below 32 bit and if not available 817 for translation below 32 bit and if not available
795 then look in the higher range. 818 then look in the higher range.
796 819
820 io_delay= [X86-32,X86-64] I/O delay method
821 0x80
822 Standard port 0x80 based delay
823 0xed
824 Alternate port 0xed based delay (needed on some systems)
825 udelay
826 Simple two microseconds delay
827 none
828 No delay
829
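As a rough userspace illustration of the first three methods (a sketch only; the kernel's actual selection logic lives in arch/x86/kernel/io_delay.c, listed in the diffstat above, and the helper name below is made up):

/* Run as root on x86: approximates the port- and udelay-based methods. */
#include <stdio.h>
#include <unistd.h>
#include <sys/io.h>

static void port_delay(unsigned short port)
{
	outb(0, port);		/* the dummy bus write itself provides the delay */
}

int main(void)
{
	if (ioperm(0x80, 1, 1) || ioperm(0xed, 1, 1)) {
		perror("ioperm");
		return 1;
	}
	port_delay(0x80);	/* io_delay=0x80: traditional POST-code port */
	port_delay(0xed);	/* io_delay=0xed: alternate port for systems where
				   writes to port 0x80 cause problems */
	usleep(2);		/* io_delay=udelay: simple two microsecond delay */
	return 0;
}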
797 io7= [HW] IO7 for Marvel based alpha systems 830 io7= [HW] IO7 for Marvel based alpha systems
798 See comment before marvel_specify_io7 in 831 See comment before marvel_specify_io7 in
799 arch/alpha/kernel/core_marvel.c. 832 arch/alpha/kernel/core_marvel.c.
@@ -1059,6 +1092,11 @@ and is between 256 and 4096 characters. It is defined in the file
1059 Multi-Function General Purpose Timers on AMD Geode 1092 Multi-Function General Purpose Timers on AMD Geode
1060 platforms. 1093 platforms.
1061 1094
1095 mfgptfix [X86-32] Fix MFGPT timers on AMD Geode platforms when
1096 the BIOS has incorrectly applied a workaround. TinyBIOS
1097 version 0.98 is known to be affected; version 0.99 fixes the
1098 problem by letting the user disable the workaround.
1099
1062 mga= [HW,DRM] 1100 mga= [HW,DRM]
1063 1101
1064 mousedev.tap_time= 1102 mousedev.tap_time=
@@ -1159,6 +1197,8 @@ and is between 256 and 4096 characters. It is defined in the file
1159 1197
1160 nodisconnect [HW,SCSI,M68K] Disables SCSI disconnects. 1198 nodisconnect [HW,SCSI,M68K] Disables SCSI disconnects.
1161 1199
1200 noefi [X86-32,X86-64] Disable EFI runtime services support.
1201
1162 noexec [IA-64] 1202 noexec [IA-64]
1163 1203
1164 noexec [X86-32,X86-64] 1204 noexec [X86-32,X86-64]
@@ -1169,6 +1209,8 @@ and is between 256 and 4096 characters. It is defined in the file
1169 register save and restore. The kernel will only save 1209 register save and restore. The kernel will only save
1170 legacy floating-point registers on task switch. 1210 legacy floating-point registers on task switch.
1171 1211
1212 noclflush [BUGS=X86] Don't use the CLFLUSH instruction
1213
1172 nohlt [BUGS=ARM] 1214 nohlt [BUGS=ARM]
1173 1215
1174 no-hlt [BUGS=X86-32] Tells the kernel that the hlt 1216 no-hlt [BUGS=X86-32] Tells the kernel that the hlt
@@ -1978,6 +2020,11 @@ and is between 256 and 4096 characters. It is defined in the file
1978 vdso=1: enable VDSO (default) 2020 vdso=1: enable VDSO (default)
1979 vdso=0: disable VDSO mapping 2021 vdso=0: disable VDSO mapping
1980 2022
2023 vdso32= [X86-32,X86-64]
2024 vdso32=2: enable compat VDSO (default with COMPAT_VDSO)
2025 vdso32=1: enable 32-bit VDSO (default)
2026 vdso32=0: disable 32-bit VDSO mapping
2027
1981 vector= [IA-64,SMP] 2028 vector= [IA-64,SMP]
1982 vector=percpu: enable percpu vector domain 2029 vector=percpu: enable percpu vector domain
1983 2030
diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt
index 945311840a10..34abae4e9442 100644
--- a/Documentation/x86_64/boot-options.txt
+++ b/Documentation/x86_64/boot-options.txt
@@ -110,12 +110,18 @@ Idle loop
110 110
111Rebooting 111Rebooting
112 112
113 reboot=b[ios] | t[riple] | k[bd] [, [w]arm | [c]old] 113 reboot=b[ios] | t[riple] | k[bd] | a[cpi] | e[fi] [, [w]arm | [c]old]
114 bios Use the CPU reboot vector for warm reset 114 bios Use the CPU reboot vector for warm reset
115 warm Don't set the cold reboot flag 115 warm Don't set the cold reboot flag
116 cold Set the cold reboot flag 116 cold Set the cold reboot flag
117 triple Force a triple fault (init) 117 triple Force a triple fault (init)
118 kbd Use the keyboard controller. cold reset (default) 118 kbd Use the keyboard controller. cold reset (default)
119 acpi Use the ACPI RESET_REG in the FADT. If ACPI is not configured or the
120 ACPI reset does not work, the reboot path attempts the reset using
121 the keyboard controller.
122 efi Use the EFI reset_system runtime service. If EFI is not configured or the
123 EFI reset does not work, the reboot path attempts the reset using
124 the keyboard controller.
119 125
120 Using warm reset will be much faster especially on big memory 126 Using warm reset will be much faster especially on big memory
121 systems because the BIOS will not go through the memory check. 127 systems because the BIOS will not go through the memory check.
diff --git a/Documentation/x86_64/uefi.txt b/Documentation/x86_64/uefi.txt
index 91a98edfb588..7d77120a5184 100644
--- a/Documentation/x86_64/uefi.txt
+++ b/Documentation/x86_64/uefi.txt
@@ -19,6 +19,10 @@ Mechanics:
19- Build the kernel with the following configuration. 19- Build the kernel with the following configuration.
20 CONFIG_FB_EFI=y 20 CONFIG_FB_EFI=y
21 CONFIG_FRAMEBUFFER_CONSOLE=y 21 CONFIG_FRAMEBUFFER_CONSOLE=y
22 If EFI runtime services are expected, the following configuration should
23 be selected.
24 CONFIG_EFI=y
25 CONFIG_EFI_VARS=y or m # optional
22- Create a VFAT partition on the disk 26- Create a VFAT partition on the disk
23- Copy the following to the VFAT partition: 27- Copy the following to the VFAT partition:
24 elilo bootloader with x86_64 support, elilo configuration file, 28 elilo bootloader with x86_64 support, elilo configuration file,
@@ -27,3 +31,8 @@ Mechanics:
27 can be found in the elilo sourceforge project. 31 can be found in the elilo sourceforge project.
28- Boot to EFI shell and invoke elilo choosing the kernel image built 32- Boot to EFI shell and invoke elilo choosing the kernel image built
29 in first step. 33 in first step.
34- If some or all EFI runtime services don't work, you can try the following
35 kernel command line parameters to turn off some or all EFI runtime
36 services.
37 noefi turn off all EFI runtime services
38 reboot_type=k turn off EFI reboot runtime service
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index de211ac3853e..77201d3f7479 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -91,6 +91,11 @@ config GENERIC_IRQ_PROBE
91 bool 91 bool
92 default y 92 default y
93 93
94config GENERIC_LOCKBREAK
95 bool
96 default y
97 depends on SMP && PREEMPT
98
94config RWSEM_GENERIC_SPINLOCK 99config RWSEM_GENERIC_SPINLOCK
95 bool 100 bool
96 default y 101 default y
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index bef47725d4ad..5a41e75ae1fe 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -42,6 +42,11 @@ config MMU
42config SWIOTLB 42config SWIOTLB
43 bool 43 bool
44 44
45config GENERIC_LOCKBREAK
46 bool
47 default y
48 depends on SMP && PREEMPT
49
45config RWSEM_XCHGADD_ALGORITHM 50config RWSEM_XCHGADD_ALGORITHM
46 bool 51 bool
47 default y 52 default y
@@ -75,6 +80,9 @@ config GENERIC_TIME_VSYSCALL
75 bool 80 bool
76 default y 81 default y
77 82
83config ARCH_SETS_UP_PER_CPU_AREA
84 def_bool y
85
78config DMI 86config DMI
79 bool 87 bool
80 default y 88 default y
diff --git a/arch/ia64/hp/sim/simscsi.c b/arch/ia64/hp/sim/simscsi.c
index 6ef9b5219930..7661bb065fa5 100644
--- a/arch/ia64/hp/sim/simscsi.c
+++ b/arch/ia64/hp/sim/simscsi.c
@@ -360,7 +360,6 @@ static struct scsi_host_template driver_template = {
360 .max_sectors = 1024, 360 .max_sectors = 1024,
361 .cmd_per_lun = SIMSCSI_REQ_QUEUE_LEN, 361 .cmd_per_lun = SIMSCSI_REQ_QUEUE_LEN,
362 .use_clustering = DISABLE_CLUSTERING, 362 .use_clustering = DISABLE_CLUSTERING,
363 .use_sg_chaining = ENABLE_SG_CHAINING,
364}; 363};
365 364
366static int __init 365static int __init
diff --git a/arch/ia64/ia32/binfmt_elf32.c b/arch/ia64/ia32/binfmt_elf32.c
index 3e35987af458..4f0c30c38e99 100644
--- a/arch/ia64/ia32/binfmt_elf32.c
+++ b/arch/ia64/ia32/binfmt_elf32.c
@@ -222,7 +222,8 @@ elf32_set_personality (void)
222} 222}
223 223
224static unsigned long 224static unsigned long
225elf32_map (struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type) 225elf32_map(struct file *filep, unsigned long addr, struct elf_phdr *eppnt,
226 int prot, int type, unsigned long unused)
226{ 227{
227 unsigned long pgoff = (eppnt->p_vaddr) & ~IA32_PAGE_MASK; 228 unsigned long pgoff = (eppnt->p_vaddr) & ~IA32_PAGE_MASK;
228 229
diff --git a/arch/ia64/kernel/module.c b/arch/ia64/kernel/module.c
index 196287928bae..e699eb6c44be 100644
--- a/arch/ia64/kernel/module.c
+++ b/arch/ia64/kernel/module.c
@@ -947,7 +947,7 @@ percpu_modcopy (void *pcpudst, const void *src, unsigned long size)
947{ 947{
948 unsigned int i; 948 unsigned int i;
949 for_each_possible_cpu(i) { 949 for_each_possible_cpu(i) {
950 memcpy(pcpudst + __per_cpu_offset[i], src, size); 950 memcpy(pcpudst + per_cpu_offset(i), src, size);
951 } 951 }
952} 952}
953#endif /* CONFIG_SMP */ 953#endif /* CONFIG_SMP */
diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig
index ab9a264cb194..f7237c5f531e 100644
--- a/arch/m32r/Kconfig
+++ b/arch/m32r/Kconfig
@@ -235,6 +235,11 @@ config IRAM_SIZE
235# Define implied options from the CPU selection here 235# Define implied options from the CPU selection here
236# 236#
237 237
238config GENERIC_LOCKBREAK
239 bool
240 default y
241 depends on SMP && PREEMPT
242
238config RWSEM_GENERIC_SPINLOCK 243config RWSEM_GENERIC_SPINLOCK
239 bool 244 bool
240 depends on M32R 245 depends on M32R
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 6b0f85f02c79..4fad0a34b997 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -694,6 +694,11 @@ source "arch/mips/vr41xx/Kconfig"
694 694
695endmenu 695endmenu
696 696
697config GENERIC_LOCKBREAK
698 bool
699 default y
700 depends on SMP && PREEMPT
701
697config RWSEM_GENERIC_SPINLOCK 702config RWSEM_GENERIC_SPINLOCK
698 bool 703 bool
699 default y 704 default y
diff --git a/arch/mips/kernel/i8253.c b/arch/mips/kernel/i8253.c
index c2d497ceffdd..fc4aa07b6d35 100644
--- a/arch/mips/kernel/i8253.c
+++ b/arch/mips/kernel/i8253.c
@@ -24,9 +24,7 @@ DEFINE_SPINLOCK(i8253_lock);
24static void init_pit_timer(enum clock_event_mode mode, 24static void init_pit_timer(enum clock_event_mode mode,
25 struct clock_event_device *evt) 25 struct clock_event_device *evt)
26{ 26{
27 unsigned long flags; 27 spin_lock(&i8253_lock);
28
29 spin_lock_irqsave(&i8253_lock, flags);
30 28
31 switch(mode) { 29 switch(mode) {
32 case CLOCK_EVT_MODE_PERIODIC: 30 case CLOCK_EVT_MODE_PERIODIC:
@@ -55,7 +53,7 @@ static void init_pit_timer(enum clock_event_mode mode,
55 /* Nothing to do here */ 53 /* Nothing to do here */
56 break; 54 break;
57 } 55 }
58 spin_unlock_irqrestore(&i8253_lock, flags); 56 spin_unlock(&i8253_lock);
59} 57}
60 58
61/* 59/*
@@ -65,12 +63,10 @@ static void init_pit_timer(enum clock_event_mode mode,
65 */ 63 */
66static int pit_next_event(unsigned long delta, struct clock_event_device *evt) 64static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
67{ 65{
68 unsigned long flags; 66 spin_lock(&i8253_lock);
69
70 spin_lock_irqsave(&i8253_lock, flags);
71 outb_p(delta & 0xff , PIT_CH0); /* LSB */ 67 outb_p(delta & 0xff , PIT_CH0); /* LSB */
72 outb(delta >> 8 , PIT_CH0); /* MSB */ 68 outb(delta >> 8 , PIT_CH0); /* MSB */
73 spin_unlock_irqrestore(&i8253_lock, flags); 69 spin_unlock(&i8253_lock);
74 70
75 return 0; 71 return 0;
76} 72}
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index b8ef1787a191..2b649c46631c 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -19,6 +19,11 @@ config MMU
19config STACK_GROWSUP 19config STACK_GROWSUP
20 def_bool y 20 def_bool y
21 21
22config GENERIC_LOCKBREAK
23 bool
24 default y
25 depends on SMP && PREEMPT
26
22config RWSEM_GENERIC_SPINLOCK 27config RWSEM_GENERIC_SPINLOCK
23 def_bool y 28 def_bool y
24 29
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 232c298c933f..fb85f6b72fcf 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -42,6 +42,9 @@ config GENERIC_HARDIRQS
42 bool 42 bool
43 default y 43 default y
44 44
45config ARCH_SETS_UP_PER_CPU_AREA
46 def_bool PPC64
47
45config IRQ_PER_CPU 48config IRQ_PER_CPU
46 bool 49 bool
47 default y 50 default y
@@ -53,6 +56,11 @@ config RWSEM_XCHGADD_ALGORITHM
53 bool 56 bool
54 default y 57 default y
55 58
59config GENERIC_LOCKBREAK
60 bool
61 default y
62 depends on SMP && PREEMPT
63
56config ARCH_HAS_ILOG2_U32 64config ARCH_HAS_ILOG2_U32
57 bool 65 bool
58 default y 66 default y
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 3e17d154d0d4..8b056d2295cc 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -256,7 +256,7 @@ static int set_evrregs(struct task_struct *task, unsigned long *data)
256#endif /* CONFIG_SPE */ 256#endif /* CONFIG_SPE */
257 257
258 258
259static void set_single_step(struct task_struct *task) 259void user_enable_single_step(struct task_struct *task)
260{ 260{
261 struct pt_regs *regs = task->thread.regs; 261 struct pt_regs *regs = task->thread.regs;
262 262
@@ -271,7 +271,7 @@ static void set_single_step(struct task_struct *task)
271 set_tsk_thread_flag(task, TIF_SINGLESTEP); 271 set_tsk_thread_flag(task, TIF_SINGLESTEP);
272} 272}
273 273
274static void clear_single_step(struct task_struct *task) 274void user_disable_single_step(struct task_struct *task)
275{ 275{
276 struct pt_regs *regs = task->thread.regs; 276 struct pt_regs *regs = task->thread.regs;
277 277
@@ -313,7 +313,7 @@ static int ptrace_set_debugreg(struct task_struct *task, unsigned long addr,
313void ptrace_disable(struct task_struct *child) 313void ptrace_disable(struct task_struct *child)
314{ 314{
315 /* make sure the single step bit is not set. */ 315 /* make sure the single step bit is not set. */
316 clear_single_step(child); 316 user_disable_single_step(child);
317} 317}
318 318
319/* 319/*
@@ -445,52 +445,6 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
445 break; 445 break;
446 } 446 }
447 447
448 case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
449 case PTRACE_CONT: { /* restart after signal. */
450 ret = -EIO;
451 if (!valid_signal(data))
452 break;
453 if (request == PTRACE_SYSCALL)
454 set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
455 else
456 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
457 child->exit_code = data;
458 /* make sure the single step bit is not set. */
459 clear_single_step(child);
460 wake_up_process(child);
461 ret = 0;
462 break;
463 }
464
465/*
466 * make the child exit. Best I can do is send it a sigkill.
467 * perhaps it should be put in the status that it wants to
468 * exit.
469 */
470 case PTRACE_KILL: {
471 ret = 0;
472 if (child->exit_state == EXIT_ZOMBIE) /* already dead */
473 break;
474 child->exit_code = SIGKILL;
475 /* make sure the single step bit is not set. */
476 clear_single_step(child);
477 wake_up_process(child);
478 break;
479 }
480
481 case PTRACE_SINGLESTEP: { /* set the trap flag. */
482 ret = -EIO;
483 if (!valid_signal(data))
484 break;
485 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
486 set_single_step(child);
487 child->exit_code = data;
488 /* give it a chance to run. */
489 wake_up_process(child);
490 ret = 0;
491 break;
492 }
493
494 case PTRACE_GET_DEBUGREG: { 448 case PTRACE_GET_DEBUGREG: {
495 ret = -EINVAL; 449 ret = -EINVAL;
496 /* We only support one DABR and no IABRS at the moment */ 450 /* We only support one DABR and no IABRS at the moment */
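
With the renames above, powerpc provides the generic user_enable_single_step()/user_disable_single_step() hooks, which is why the PTRACE_SYSCALL, PTRACE_CONT, PTRACE_KILL and PTRACE_SINGLESTEP arms can be dropped from arch_ptrace(): the common ptrace layer resumes the child through these hooks instead. A hedged sketch of what that generic resume path looks like (the ptrace_resume() shape is an assumption, not part of this hunk):

	static int ptrace_resume(struct task_struct *child, long request, long data)
	{
		if (!valid_signal(data))
			return -EIO;

		if (request == PTRACE_SYSCALL)
			set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
		else
			clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);

		if (request == PTRACE_SINGLESTEP)
			user_enable_single_step(child);		/* arch hook, renamed above */
		else
			user_disable_single_step(child);

		child->exit_code = data;
		wake_up_process(child);
		return 0;
	}
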
diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c
index 19a5656001c0..f0bad7070fb5 100644
--- a/arch/powerpc/kernel/vio.c
+++ b/arch/powerpc/kernel/vio.c
@@ -37,8 +37,6 @@
37#include <asm/iseries/hv_call_xm.h> 37#include <asm/iseries/hv_call_xm.h>
38#include <asm/iseries/iommu.h> 38#include <asm/iseries/iommu.h>
39 39
40extern struct kset devices_subsys; /* needed for vio_find_name() */
41
42static struct bus_type vio_bus_type; 40static struct bus_type vio_bus_type;
43 41
44static struct vio_dev vio_bus_device = { /* fake "parent" device */ 42static struct vio_dev vio_bus_device = { /* fake "parent" device */
@@ -361,19 +359,16 @@ EXPORT_SYMBOL(vio_get_attribute);
361#ifdef CONFIG_PPC_PSERIES 359#ifdef CONFIG_PPC_PSERIES
362/* vio_find_name() - internal because only vio.c knows how we formatted the 360/* vio_find_name() - internal because only vio.c knows how we formatted the
363 * kobject name 361 * kobject name
364 * XXX once vio_bus_type.devices is actually used as a kset in
365 * drivers/base/bus.c, this function should be removed in favor of
366 * "device_find(kobj_name, &vio_bus_type)"
367 */ 362 */
368static struct vio_dev *vio_find_name(const char *kobj_name) 363static struct vio_dev *vio_find_name(const char *name)
369{ 364{
370 struct kobject *found; 365 struct device *found;
371 366
372 found = kset_find_obj(&devices_subsys, kobj_name); 367 found = bus_find_device_by_name(&vio_bus_type, NULL, name);
373 if (!found) 368 if (!found)
374 return NULL; 369 return NULL;
375 370
376 return to_vio_dev(container_of(found, struct device, kobj)); 371 return to_vio_dev(found);
377} 372}
378 373
379/** 374/**
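
vio_find_name() now goes through the driver core rather than peeking into the devices kset: bus_find_device_by_name() walks vio_bus_type and matches on the device name, so the helper no longer depends on how the kobject name was formatted. A small, hypothetical caller to show the intended use (not part of the patch):

	static int example_use_vio(const char *name)
	{
		struct vio_dev *vdev = vio_find_name(name);

		if (!vdev)
			return -ENODEV;

		/* bus_find_device_by_name() returned the device with a reference held */
		dev_info(&vdev->dev, "found vio device %s\n", name);
		put_device(&vdev->dev);
		return 0;
	}
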
diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig
index 10b212a1f9f5..26f5791baa33 100644
--- a/arch/sparc64/Kconfig
+++ b/arch/sparc64/Kconfig
@@ -66,6 +66,9 @@ config AUDIT_ARCH
66 bool 66 bool
67 default y 67 default y
68 68
69config ARCH_SETS_UP_PER_CPU_AREA
70 def_bool y
71
69config ARCH_NO_VIRT_TO_BUS 72config ARCH_NO_VIRT_TO_BUS
70 def_bool y 73 def_bool y
71 74
@@ -200,6 +203,11 @@ config US2E_FREQ
200 If in doubt, say N. 203 If in doubt, say N.
201 204
202# Global things across all Sun machines. 205# Global things across all Sun machines.
206config GENERIC_LOCKBREAK
207 bool
208 default y
209 depends on SMP && PREEMPT
210
203config RWSEM_GENERIC_SPINLOCK 211config RWSEM_GENERIC_SPINLOCK
204 bool 212 bool
205 213
diff --git a/arch/um/kernel/ksyms.c b/arch/um/kernel/ksyms.c
index 1b388b41d95d..7c7142ba3bd7 100644
--- a/arch/um/kernel/ksyms.c
+++ b/arch/um/kernel/ksyms.c
@@ -71,10 +71,10 @@ EXPORT_SYMBOL(dump_thread);
71 71
72/* required for SMP */ 72/* required for SMP */
73 73
74extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); 74extern void __write_lock_failed(rwlock_t *rw);
75EXPORT_SYMBOL(__write_lock_failed); 75EXPORT_SYMBOL(__write_lock_failed);
76 76
77extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); 77extern void __read_lock_failed(rwlock_t *rw);
78EXPORT_SYMBOL(__read_lock_failed); 78EXPORT_SYMBOL(__read_lock_failed);
79 79
80#endif 80#endif
diff --git a/arch/um/sys-i386/signal.c b/arch/um/sys-i386/signal.c
index 0147227ce18d..19053d46cb60 100644
--- a/arch/um/sys-i386/signal.c
+++ b/arch/um/sys-i386/signal.c
@@ -3,10 +3,10 @@
3 * Licensed under the GPL 3 * Licensed under the GPL
4 */ 4 */
5 5
6#include "linux/ptrace.h" 6#include <linux/ptrace.h>
7#include "asm/unistd.h" 7#include <asm/unistd.h>
8#include "asm/uaccess.h" 8#include <asm/uaccess.h>
9#include "asm/ucontext.h" 9#include <asm/ucontext.h>
10#include "frame_kern.h" 10#include "frame_kern.h"
11#include "skas.h" 11#include "skas.h"
12 12
@@ -18,17 +18,17 @@ void copy_sc(struct uml_pt_regs *regs, void *from)
18 REGS_FS(regs->gp) = sc->fs; 18 REGS_FS(regs->gp) = sc->fs;
19 REGS_ES(regs->gp) = sc->es; 19 REGS_ES(regs->gp) = sc->es;
20 REGS_DS(regs->gp) = sc->ds; 20 REGS_DS(regs->gp) = sc->ds;
21 REGS_EDI(regs->gp) = sc->edi; 21 REGS_EDI(regs->gp) = sc->di;
22 REGS_ESI(regs->gp) = sc->esi; 22 REGS_ESI(regs->gp) = sc->si;
23 REGS_EBP(regs->gp) = sc->ebp; 23 REGS_EBP(regs->gp) = sc->bp;
24 REGS_SP(regs->gp) = sc->esp; 24 REGS_SP(regs->gp) = sc->sp;
25 REGS_EBX(regs->gp) = sc->ebx; 25 REGS_EBX(regs->gp) = sc->bx;
26 REGS_EDX(regs->gp) = sc->edx; 26 REGS_EDX(regs->gp) = sc->dx;
27 REGS_ECX(regs->gp) = sc->ecx; 27 REGS_ECX(regs->gp) = sc->cx;
28 REGS_EAX(regs->gp) = sc->eax; 28 REGS_EAX(regs->gp) = sc->ax;
29 REGS_IP(regs->gp) = sc->eip; 29 REGS_IP(regs->gp) = sc->ip;
30 REGS_CS(regs->gp) = sc->cs; 30 REGS_CS(regs->gp) = sc->cs;
31 REGS_EFLAGS(regs->gp) = sc->eflags; 31 REGS_EFLAGS(regs->gp) = sc->flags;
32 REGS_SS(regs->gp) = sc->ss; 32 REGS_SS(regs->gp) = sc->ss;
33} 33}
34 34
@@ -229,18 +229,18 @@ static int copy_sc_to_user(struct sigcontext __user *to,
229 sc.fs = REGS_FS(regs->regs.gp); 229 sc.fs = REGS_FS(regs->regs.gp);
230 sc.es = REGS_ES(regs->regs.gp); 230 sc.es = REGS_ES(regs->regs.gp);
231 sc.ds = REGS_DS(regs->regs.gp); 231 sc.ds = REGS_DS(regs->regs.gp);
232 sc.edi = REGS_EDI(regs->regs.gp); 232 sc.di = REGS_EDI(regs->regs.gp);
233 sc.esi = REGS_ESI(regs->regs.gp); 233 sc.si = REGS_ESI(regs->regs.gp);
234 sc.ebp = REGS_EBP(regs->regs.gp); 234 sc.bp = REGS_EBP(regs->regs.gp);
235 sc.esp = sp; 235 sc.sp = sp;
236 sc.ebx = REGS_EBX(regs->regs.gp); 236 sc.bx = REGS_EBX(regs->regs.gp);
237 sc.edx = REGS_EDX(regs->regs.gp); 237 sc.dx = REGS_EDX(regs->regs.gp);
238 sc.ecx = REGS_ECX(regs->regs.gp); 238 sc.cx = REGS_ECX(regs->regs.gp);
239 sc.eax = REGS_EAX(regs->regs.gp); 239 sc.ax = REGS_EAX(regs->regs.gp);
240 sc.eip = REGS_IP(regs->regs.gp); 240 sc.ip = REGS_IP(regs->regs.gp);
241 sc.cs = REGS_CS(regs->regs.gp); 241 sc.cs = REGS_CS(regs->regs.gp);
242 sc.eflags = REGS_EFLAGS(regs->regs.gp); 242 sc.flags = REGS_EFLAGS(regs->regs.gp);
243 sc.esp_at_signal = regs->regs.gp[UESP]; 243 sc.sp_at_signal = regs->regs.gp[UESP];
244 sc.ss = regs->regs.gp[SS]; 244 sc.ss = regs->regs.gp[SS];
245 sc.cr2 = fi->cr2; 245 sc.cr2 = fi->cr2;
246 sc.err = fi->error_code; 246 sc.err = fi->error_code;
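
The register-name changes in this hunk (and in the x86_64 hunk below) follow the unified struct sigcontext in the merged asm-x86 headers, where the architecture prefix was dropped from the field names. An excerpt-style sketch of the renamed 32-bit layout for orientation (only the renamed fields are shown; the authoritative definition is in include/asm-x86/sigcontext.h):

	struct sigcontext {
		/* ... segment registers ... */
		unsigned long di;	/* was edi */
		unsigned long si;	/* was esi */
		unsigned long bp;	/* was ebp */
		unsigned long sp;	/* was esp */
		unsigned long bx;	/* was ebx */
		unsigned long dx;	/* was edx */
		unsigned long cx;	/* was ecx */
		unsigned long ax;	/* was eax */
		/* ... trapno, err ... */
		unsigned long ip;	/* was eip */
		/* ... cs, flags, sp_at_signal, ss ... */
	};
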
diff --git a/arch/um/sys-x86_64/signal.c b/arch/um/sys-x86_64/signal.c
index 1778d33808f4..7457436b433a 100644
--- a/arch/um/sys-x86_64/signal.c
+++ b/arch/um/sys-x86_64/signal.c
@@ -4,11 +4,11 @@
4 * Licensed under the GPL 4 * Licensed under the GPL
5 */ 5 */
6 6
7#include "linux/personality.h" 7#include <linux/personality.h>
8#include "linux/ptrace.h" 8#include <linux/ptrace.h>
9#include "asm/unistd.h" 9#include <asm/unistd.h>
10#include "asm/uaccess.h" 10#include <asm/uaccess.h>
11#include "asm/ucontext.h" 11#include <asm/ucontext.h>
12#include "frame_kern.h" 12#include "frame_kern.h"
13#include "skas.h" 13#include "skas.h"
14 14
@@ -27,16 +27,16 @@ void copy_sc(struct uml_pt_regs *regs, void *from)
27 GETREG(regs, R13, sc, r13); 27 GETREG(regs, R13, sc, r13);
28 GETREG(regs, R14, sc, r14); 28 GETREG(regs, R14, sc, r14);
29 GETREG(regs, R15, sc, r15); 29 GETREG(regs, R15, sc, r15);
30 GETREG(regs, RDI, sc, rdi); 30 GETREG(regs, RDI, sc, di);
31 GETREG(regs, RSI, sc, rsi); 31 GETREG(regs, RSI, sc, si);
32 GETREG(regs, RBP, sc, rbp); 32 GETREG(regs, RBP, sc, bp);
33 GETREG(regs, RBX, sc, rbx); 33 GETREG(regs, RBX, sc, bx);
34 GETREG(regs, RDX, sc, rdx); 34 GETREG(regs, RDX, sc, dx);
35 GETREG(regs, RAX, sc, rax); 35 GETREG(regs, RAX, sc, ax);
36 GETREG(regs, RCX, sc, rcx); 36 GETREG(regs, RCX, sc, cx);
37 GETREG(regs, RSP, sc, rsp); 37 GETREG(regs, RSP, sc, sp);
38 GETREG(regs, RIP, sc, rip); 38 GETREG(regs, RIP, sc, ip);
39 GETREG(regs, EFLAGS, sc, eflags); 39 GETREG(regs, EFLAGS, sc, flags);
40 GETREG(regs, CS, sc, cs); 40 GETREG(regs, CS, sc, cs);
41 41
42#undef GETREG 42#undef GETREG
@@ -61,16 +61,16 @@ static int copy_sc_from_user(struct pt_regs *regs,
61 err |= GETREG(regs, R13, from, r13); 61 err |= GETREG(regs, R13, from, r13);
62 err |= GETREG(regs, R14, from, r14); 62 err |= GETREG(regs, R14, from, r14);
63 err |= GETREG(regs, R15, from, r15); 63 err |= GETREG(regs, R15, from, r15);
64 err |= GETREG(regs, RDI, from, rdi); 64 err |= GETREG(regs, RDI, from, di);
65 err |= GETREG(regs, RSI, from, rsi); 65 err |= GETREG(regs, RSI, from, si);
66 err |= GETREG(regs, RBP, from, rbp); 66 err |= GETREG(regs, RBP, from, bp);
67 err |= GETREG(regs, RBX, from, rbx); 67 err |= GETREG(regs, RBX, from, bx);
68 err |= GETREG(regs, RDX, from, rdx); 68 err |= GETREG(regs, RDX, from, dx);
69 err |= GETREG(regs, RAX, from, rax); 69 err |= GETREG(regs, RAX, from, ax);
70 err |= GETREG(regs, RCX, from, rcx); 70 err |= GETREG(regs, RCX, from, cx);
71 err |= GETREG(regs, RSP, from, rsp); 71 err |= GETREG(regs, RSP, from, sp);
72 err |= GETREG(regs, RIP, from, rip); 72 err |= GETREG(regs, RIP, from, ip);
73 err |= GETREG(regs, EFLAGS, from, eflags); 73 err |= GETREG(regs, EFLAGS, from, flags);
74 err |= GETREG(regs, CS, from, cs); 74 err |= GETREG(regs, CS, from, cs);
75 if (err) 75 if (err)
76 return 1; 76 return 1;
@@ -108,19 +108,19 @@ static int copy_sc_to_user(struct sigcontext __user *to,
108 __put_user((regs)->regs.gp[(regno) / sizeof(unsigned long)], \ 108 __put_user((regs)->regs.gp[(regno) / sizeof(unsigned long)], \
109 &(sc)->regname) 109 &(sc)->regname)
110 110
111 err |= PUTREG(regs, RDI, to, rdi); 111 err |= PUTREG(regs, RDI, to, di);
112 err |= PUTREG(regs, RSI, to, rsi); 112 err |= PUTREG(regs, RSI, to, si);
113 err |= PUTREG(regs, RBP, to, rbp); 113 err |= PUTREG(regs, RBP, to, bp);
114 /* 114 /*
115 * Must use original RSP, which is passed in, rather than what's in 115 * Must use original RSP, which is passed in, rather than what's in
116 * the pt_regs, because that's already been updated to point at the 116 * the pt_regs, because that's already been updated to point at the
117 * signal frame. 117 * signal frame.
118 */ 118 */
119 err |= __put_user(sp, &to->rsp); 119 err |= __put_user(sp, &to->sp);
120 err |= PUTREG(regs, RBX, to, rbx); 120 err |= PUTREG(regs, RBX, to, bx);
121 err |= PUTREG(regs, RDX, to, rdx); 121 err |= PUTREG(regs, RDX, to, dx);
122 err |= PUTREG(regs, RCX, to, rcx); 122 err |= PUTREG(regs, RCX, to, cx);
123 err |= PUTREG(regs, RAX, to, rax); 123 err |= PUTREG(regs, RAX, to, ax);
124 err |= PUTREG(regs, R8, to, r8); 124 err |= PUTREG(regs, R8, to, r8);
125 err |= PUTREG(regs, R9, to, r9); 125 err |= PUTREG(regs, R9, to, r9);
126 err |= PUTREG(regs, R10, to, r10); 126 err |= PUTREG(regs, R10, to, r10);
@@ -135,8 +135,8 @@ static int copy_sc_to_user(struct sigcontext __user *to,
135 err |= __put_user(fi->error_code, &to->err); 135 err |= __put_user(fi->error_code, &to->err);
136 err |= __put_user(fi->trap_no, &to->trapno); 136 err |= __put_user(fi->trap_no, &to->trapno);
137 137
138 err |= PUTREG(regs, RIP, to, rip); 138 err |= PUTREG(regs, RIP, to, ip);
139 err |= PUTREG(regs, EFLAGS, to, eflags); 139 err |= PUTREG(regs, EFLAGS, to, flags);
140#undef PUTREG 140#undef PUTREG
141 141
142 err |= __put_user(mask, &to->oldmask); 142 err |= __put_user(mask, &to->oldmask);
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 80b7ba4056db..65b449134cf7 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -17,81 +17,69 @@ config X86_64
17 17
18### Arch settings 18### Arch settings
19config X86 19config X86
20 bool 20 def_bool y
21 default y 21
22config GENERIC_LOCKBREAK
23 def_bool n
22 24
23config GENERIC_TIME 25config GENERIC_TIME
24 bool 26 def_bool y
25 default y
26 27
27config GENERIC_CMOS_UPDATE 28config GENERIC_CMOS_UPDATE
28 bool 29 def_bool y
29 default y
30 30
31config CLOCKSOURCE_WATCHDOG 31config CLOCKSOURCE_WATCHDOG
32 bool 32 def_bool y
33 default y
34 33
35config GENERIC_CLOCKEVENTS 34config GENERIC_CLOCKEVENTS
36 bool 35 def_bool y
37 default y
38 36
39config GENERIC_CLOCKEVENTS_BROADCAST 37config GENERIC_CLOCKEVENTS_BROADCAST
40 bool 38 def_bool y
41 default y
42 depends on X86_64 || (X86_32 && X86_LOCAL_APIC) 39 depends on X86_64 || (X86_32 && X86_LOCAL_APIC)
43 40
44config LOCKDEP_SUPPORT 41config LOCKDEP_SUPPORT
45 bool 42 def_bool y
46 default y
47 43
48config STACKTRACE_SUPPORT 44config STACKTRACE_SUPPORT
49 bool 45 def_bool y
50 default y
51 46
52config SEMAPHORE_SLEEPERS 47config SEMAPHORE_SLEEPERS
53 bool 48 def_bool y
54 default y
55 49
56config MMU 50config MMU
57 bool 51 def_bool y
58 default y
59 52
60config ZONE_DMA 53config ZONE_DMA
61 bool 54 def_bool y
62 default y
63 55
64config QUICKLIST 56config QUICKLIST
65 bool 57 def_bool X86_32
66 default X86_32
67 58
68config SBUS 59config SBUS
69 bool 60 bool
70 61
71config GENERIC_ISA_DMA 62config GENERIC_ISA_DMA
72 bool 63 def_bool y
73 default y
74 64
75config GENERIC_IOMAP 65config GENERIC_IOMAP
76 bool 66 def_bool y
77 default y
78 67
79config GENERIC_BUG 68config GENERIC_BUG
80 bool 69 def_bool y
81 default y
82 depends on BUG 70 depends on BUG
83 71
84config GENERIC_HWEIGHT 72config GENERIC_HWEIGHT
85 bool 73 def_bool y
86 default y 74
75config GENERIC_GPIO
76 def_bool n
87 77
88config ARCH_MAY_HAVE_PC_FDC 78config ARCH_MAY_HAVE_PC_FDC
89 bool 79 def_bool y
90 default y
91 80
92config DMI 81config DMI
93 bool 82 def_bool y
94 default y
95 83
96config RWSEM_GENERIC_SPINLOCK 84config RWSEM_GENERIC_SPINLOCK
97 def_bool !X86_XADD 85 def_bool !X86_XADD
@@ -112,10 +100,14 @@ config GENERIC_TIME_VSYSCALL
112 bool 100 bool
113 default X86_64 101 default X86_64
114 102
103config HAVE_SETUP_PER_CPU_AREA
104 def_bool X86_64
105
115config ARCH_SUPPORTS_OPROFILE 106config ARCH_SUPPORTS_OPROFILE
116 bool 107 bool
117 default y 108 default y
118 109
110select HAVE_KVM
119 111
120config ZONE_DMA32 112config ZONE_DMA32
121 bool 113 bool
@@ -144,9 +136,17 @@ config GENERIC_PENDING_IRQ
144 136
145config X86_SMP 137config X86_SMP
146 bool 138 bool
147 depends on X86_32 && SMP && !X86_VOYAGER 139 depends on SMP && ((X86_32 && !X86_VOYAGER) || X86_64)
148 default y 140 default y
149 141
142config X86_32_SMP
143 def_bool y
144 depends on X86_32 && SMP
145
146config X86_64_SMP
147 def_bool y
148 depends on X86_64 && SMP
149
150config X86_HT 150config X86_HT
151 bool 151 bool
152 depends on SMP 152 depends on SMP
@@ -292,6 +292,18 @@ config X86_ES7000
292 Only choose this option if you have such a system, otherwise you 292 Only choose this option if you have such a system, otherwise you
293 should say N here. 293 should say N here.
294 294
295config X86_RDC321X
296 bool "RDC R-321x SoC"
297 depends on X86_32
298 select M486
299 select X86_REBOOTFIXUPS
300 select GENERIC_GPIO
301 select LEDS_GPIO
302 help
303 This option is needed for RDC R-321x system-on-chip, also known
304 as R-8610-(G).
305 If you don't have one of these chips, you should say N here.
306
295config X86_VSMP 307config X86_VSMP
296 bool "Support for ScaleMP vSMP" 308 bool "Support for ScaleMP vSMP"
297 depends on X86_64 && PCI 309 depends on X86_64 && PCI
@@ -303,8 +315,8 @@ config X86_VSMP
303endchoice 315endchoice
304 316
305config SCHED_NO_NO_OMIT_FRAME_POINTER 317config SCHED_NO_NO_OMIT_FRAME_POINTER
306 bool "Single-depth WCHAN output" 318 def_bool y
307 default y 319 prompt "Single-depth WCHAN output"
308 depends on X86_32 320 depends on X86_32
309 help 321 help
310 Calculate simpler /proc/<PID>/wchan values. If this option 322 Calculate simpler /proc/<PID>/wchan values. If this option
@@ -314,18 +326,8 @@ config SCHED_NO_NO_OMIT_FRAME_POINTER
314 326
315 If in doubt, say "Y". 327 If in doubt, say "Y".
316 328
317config PARAVIRT
318 bool
319 depends on X86_32 && !(X86_VISWS || X86_VOYAGER)
320 help
321 This changes the kernel so it can modify itself when it is run
322 under a hypervisor, potentially improving performance significantly
323 over full virtualization. However, when run without a hypervisor
324 the kernel is theoretically slower and slightly larger.
325
326menuconfig PARAVIRT_GUEST 329menuconfig PARAVIRT_GUEST
327 bool "Paravirtualized guest support" 330 bool "Paravirtualized guest support"
328 depends on X86_32
329 help 331 help
330 Say Y here to get to see options related to running Linux under 332 Say Y here to get to see options related to running Linux under
331 various hypervisors. This option alone does not add any kernel code. 333 various hypervisors. This option alone does not add any kernel code.
@@ -339,6 +341,7 @@ source "arch/x86/xen/Kconfig"
339config VMI 341config VMI
340 bool "VMI Guest support" 342 bool "VMI Guest support"
341 select PARAVIRT 343 select PARAVIRT
344 depends on X86_32
342 depends on !(X86_VISWS || X86_VOYAGER) 345 depends on !(X86_VISWS || X86_VOYAGER)
343 help 346 help
344 VMI provides a paravirtualized interface to the VMware ESX server 347 VMI provides a paravirtualized interface to the VMware ESX server
@@ -348,40 +351,43 @@ config VMI
348 351
349source "arch/x86/lguest/Kconfig" 352source "arch/x86/lguest/Kconfig"
350 353
354config PARAVIRT
355 bool "Enable paravirtualization code"
356 depends on !(X86_VISWS || X86_VOYAGER)
357 help
358 This changes the kernel so it can modify itself when it is run
359 under a hypervisor, potentially improving performance significantly
360 over full virtualization. However, when run without a hypervisor
361 the kernel is theoretically slower and slightly larger.
362
351endif 363endif
352 364
353config ACPI_SRAT 365config ACPI_SRAT
354 bool 366 def_bool y
355 default y
356 depends on X86_32 && ACPI && NUMA && (X86_SUMMIT || X86_GENERICARCH) 367 depends on X86_32 && ACPI && NUMA && (X86_SUMMIT || X86_GENERICARCH)
357 select ACPI_NUMA 368 select ACPI_NUMA
358 369
359config HAVE_ARCH_PARSE_SRAT 370config HAVE_ARCH_PARSE_SRAT
360 bool 371 def_bool y
361 default y 372 depends on ACPI_SRAT
362 depends on ACPI_SRAT
363 373
364config X86_SUMMIT_NUMA 374config X86_SUMMIT_NUMA
365 bool 375 def_bool y
366 default y
367 depends on X86_32 && NUMA && (X86_SUMMIT || X86_GENERICARCH) 376 depends on X86_32 && NUMA && (X86_SUMMIT || X86_GENERICARCH)
368 377
369config X86_CYCLONE_TIMER 378config X86_CYCLONE_TIMER
370 bool 379 def_bool y
371 default y
372 depends on X86_32 && X86_SUMMIT || X86_GENERICARCH 380 depends on X86_32 && X86_SUMMIT || X86_GENERICARCH
373 381
374config ES7000_CLUSTERED_APIC 382config ES7000_CLUSTERED_APIC
375 bool 383 def_bool y
376 default y
377 depends on SMP && X86_ES7000 && MPENTIUMIII 384 depends on SMP && X86_ES7000 && MPENTIUMIII
378 385
379source "arch/x86/Kconfig.cpu" 386source "arch/x86/Kconfig.cpu"
380 387
381config HPET_TIMER 388config HPET_TIMER
382 bool 389 def_bool X86_64
383 prompt "HPET Timer Support" if X86_32 390 prompt "HPET Timer Support" if X86_32
384 default X86_64
385 help 391 help
386 Use the IA-PC HPET (High Precision Event Timer) to manage 392 Use the IA-PC HPET (High Precision Event Timer) to manage
387 time in preference to the PIT and RTC, if a HPET is 393 time in preference to the PIT and RTC, if a HPET is
@@ -399,9 +405,8 @@ config HPET_TIMER
399 Choose N to continue using the legacy 8254 timer. 405 Choose N to continue using the legacy 8254 timer.
400 406
401config HPET_EMULATE_RTC 407config HPET_EMULATE_RTC
402 bool 408 def_bool y
403 depends on HPET_TIMER && RTC=y 409 depends on HPET_TIMER && (RTC=y || RTC=m)
404 default y
405 410
406# Mark as embedded because too many people got it wrong. 411# Mark as embedded because too many people got it wrong.
407# The code disables itself when not needed. 412# The code disables itself when not needed.
@@ -441,8 +446,8 @@ config CALGARY_IOMMU
441 If unsure, say Y. 446 If unsure, say Y.
442 447
443config CALGARY_IOMMU_ENABLED_BY_DEFAULT 448config CALGARY_IOMMU_ENABLED_BY_DEFAULT
444 bool "Should Calgary be enabled by default?" 449 def_bool y
445 default y 450 prompt "Should Calgary be enabled by default?"
446 depends on CALGARY_IOMMU 451 depends on CALGARY_IOMMU
447 help 452 help
448 Should Calgary be enabled by default? If you choose 'y', Calgary 453 Should Calgary be enabled by default? If you choose 'y', Calgary
@@ -486,9 +491,9 @@ config SCHED_SMT
486 N here. 491 N here.
487 492
488config SCHED_MC 493config SCHED_MC
489 bool "Multi-core scheduler support" 494 def_bool y
495 prompt "Multi-core scheduler support"
490 depends on (X86_64 && SMP) || (X86_32 && X86_HT) 496 depends on (X86_64 && SMP) || (X86_32 && X86_HT)
491 default y
492 help 497 help
493 Multi-core scheduler support improves the CPU scheduler's decision 498 Multi-core scheduler support improves the CPU scheduler's decision
494 making when dealing with multi-core CPU chips at a cost of slightly 499 making when dealing with multi-core CPU chips at a cost of slightly
@@ -522,19 +527,16 @@ config X86_UP_IOAPIC
522 an IO-APIC, then the kernel will still run with no slowdown at all. 527 an IO-APIC, then the kernel will still run with no slowdown at all.
523 528
524config X86_LOCAL_APIC 529config X86_LOCAL_APIC
525 bool 530 def_bool y
526 depends on X86_64 || (X86_32 && (X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER) || X86_GENERICARCH)) 531 depends on X86_64 || (X86_32 && (X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER) || X86_GENERICARCH))
527 default y
528 532
529config X86_IO_APIC 533config X86_IO_APIC
530 bool 534 def_bool y
531 depends on X86_64 || (X86_32 && (X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER)) || X86_GENERICARCH)) 535 depends on X86_64 || (X86_32 && (X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER)) || X86_GENERICARCH))
532 default y
533 536
534config X86_VISWS_APIC 537config X86_VISWS_APIC
535 bool 538 def_bool y
536 depends on X86_32 && X86_VISWS 539 depends on X86_32 && X86_VISWS
537 default y
538 540
539config X86_MCE 541config X86_MCE
540 bool "Machine Check Exception" 542 bool "Machine Check Exception"
@@ -554,17 +556,17 @@ config X86_MCE
554 the 386 and 486, so nearly everyone can say Y here. 556 the 386 and 486, so nearly everyone can say Y here.
555 557
556config X86_MCE_INTEL 558config X86_MCE_INTEL
557 bool "Intel MCE features" 559 def_bool y
560 prompt "Intel MCE features"
558 depends on X86_64 && X86_MCE && X86_LOCAL_APIC 561 depends on X86_64 && X86_MCE && X86_LOCAL_APIC
559 default y
560 help 562 help
561 Additional support for Intel-specific MCE features such as 563 Additional support for Intel-specific MCE features such as
562 the thermal monitor. 564 the thermal monitor.
563 565
564config X86_MCE_AMD 566config X86_MCE_AMD
565 bool "AMD MCE features" 567 def_bool y
568 prompt "AMD MCE features"
566 depends on X86_64 && X86_MCE && X86_LOCAL_APIC 569 depends on X86_64 && X86_MCE && X86_LOCAL_APIC
567 default y
568 help 570 help
569 Additional support for AMD specific MCE features such as 571 Additional support for AMD specific MCE features such as
570 the DRAM Error Threshold. 572 the DRAM Error Threshold.
@@ -637,9 +639,9 @@ config I8K
637 Say N otherwise. 639 Say N otherwise.
638 640
639config X86_REBOOTFIXUPS 641config X86_REBOOTFIXUPS
640 bool "Enable X86 board specific fixups for reboot" 642 def_bool n
643 prompt "Enable X86 board specific fixups for reboot"
641 depends on X86_32 && X86 644 depends on X86_32 && X86
642 default n
643 ---help--- 645 ---help---
644 This enables chipset and/or board specific fixups to be done 646 This enables chipset and/or board specific fixups to be done
645 in order to get reboot to work correctly. This is only needed on 647 in order to get reboot to work correctly. This is only needed on
@@ -648,7 +650,7 @@ config X86_REBOOTFIXUPS
648 system. 650 system.
649 651
650 Currently, the only fixup is for the Geode machines using 652 Currently, the only fixup is for the Geode machines using
651 CS5530A and CS5536 chipsets. 653 CS5530A and CS5536 chipsets and the RDC R-321x SoC.
652 654
653 Say Y if you want to enable the fixup. Currently, it's safe to 655 Say Y if you want to enable the fixup. Currently, it's safe to
654 enable this option even if you don't need it. 656 enable this option even if you don't need it.
@@ -672,9 +674,8 @@ config MICROCODE
672 module will be called microcode. 674 module will be called microcode.
673 675
674config MICROCODE_OLD_INTERFACE 676config MICROCODE_OLD_INTERFACE
675 bool 677 def_bool y
676 depends on MICROCODE 678 depends on MICROCODE
677 default y
678 679
679config X86_MSR 680config X86_MSR
680 tristate "/dev/cpu/*/msr - Model-specific register support" 681 tristate "/dev/cpu/*/msr - Model-specific register support"
@@ -798,13 +799,12 @@ config PAGE_OFFSET
798 depends on X86_32 799 depends on X86_32
799 800
800config HIGHMEM 801config HIGHMEM
801 bool 802 def_bool y
802 depends on X86_32 && (HIGHMEM64G || HIGHMEM4G) 803 depends on X86_32 && (HIGHMEM64G || HIGHMEM4G)
803 default y
804 804
805config X86_PAE 805config X86_PAE
806 bool "PAE (Physical Address Extension) Support" 806 def_bool n
807 default n 807 prompt "PAE (Physical Address Extension) Support"
808 depends on X86_32 && !HIGHMEM4G 808 depends on X86_32 && !HIGHMEM4G
809 select RESOURCES_64BIT 809 select RESOURCES_64BIT
810 help 810 help
@@ -836,10 +836,10 @@ comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
836 depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI) 836 depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI)
837 837
838config K8_NUMA 838config K8_NUMA
839 bool "Old style AMD Opteron NUMA detection" 839 def_bool y
840 depends on X86_64 && NUMA && PCI 840 prompt "Old style AMD Opteron NUMA detection"
841 default y 841 depends on X86_64 && NUMA && PCI
842 help 842 help
843 Enable K8 NUMA node topology detection. You should say Y here if 843 Enable K8 NUMA node topology detection. You should say Y here if
844 you have a multi processor AMD K8 system. This uses an old 844 you have a multi processor AMD K8 system. This uses an old
845 method to read the NUMA configuration directly from the builtin 845 method to read the NUMA configuration directly from the builtin
@@ -847,10 +847,10 @@ config K8_NUMA
847 instead, which also takes priority if both are compiled in. 847 instead, which also takes priority if both are compiled in.
848 848
849config X86_64_ACPI_NUMA 849config X86_64_ACPI_NUMA
850 bool "ACPI NUMA detection" 850 def_bool y
851 prompt "ACPI NUMA detection"
851 depends on X86_64 && NUMA && ACPI && PCI 852 depends on X86_64 && NUMA && ACPI && PCI
852 select ACPI_NUMA 853 select ACPI_NUMA
853 default y
854 help 854 help
855 Enable ACPI SRAT based node topology detection. 855 Enable ACPI SRAT based node topology detection.
856 856
@@ -864,52 +864,53 @@ config NUMA_EMU
864 864
865config NODES_SHIFT 865config NODES_SHIFT
866 int 866 int
867 range 1 15 if X86_64
867 default "6" if X86_64 868 default "6" if X86_64
868 default "4" if X86_NUMAQ 869 default "4" if X86_NUMAQ
869 default "3" 870 default "3"
870 depends on NEED_MULTIPLE_NODES 871 depends on NEED_MULTIPLE_NODES
871 872
872config HAVE_ARCH_BOOTMEM_NODE 873config HAVE_ARCH_BOOTMEM_NODE
873 bool 874 def_bool y
874 depends on X86_32 && NUMA 875 depends on X86_32 && NUMA
875 default y
876 876
877config ARCH_HAVE_MEMORY_PRESENT 877config ARCH_HAVE_MEMORY_PRESENT
878 bool 878 def_bool y
879 depends on X86_32 && DISCONTIGMEM 879 depends on X86_32 && DISCONTIGMEM
880 default y
881 880
882config NEED_NODE_MEMMAP_SIZE 881config NEED_NODE_MEMMAP_SIZE
883 bool 882 def_bool y
884 depends on X86_32 && (DISCONTIGMEM || SPARSEMEM) 883 depends on X86_32 && (DISCONTIGMEM || SPARSEMEM)
885 default y
886 884
887config HAVE_ARCH_ALLOC_REMAP 885config HAVE_ARCH_ALLOC_REMAP
888 bool 886 def_bool y
889 depends on X86_32 && NUMA 887 depends on X86_32 && NUMA
890 default y
891 888
892config ARCH_FLATMEM_ENABLE 889config ARCH_FLATMEM_ENABLE
893 def_bool y 890 def_bool y
894 depends on (X86_32 && ARCH_SELECT_MEMORY_MODEL && X86_PC) || (X86_64 && !NUMA) 891 depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && X86_PC && !NUMA
895 892
896config ARCH_DISCONTIGMEM_ENABLE 893config ARCH_DISCONTIGMEM_ENABLE
897 def_bool y 894 def_bool y
898 depends on NUMA 895 depends on NUMA && X86_32
899 896
900config ARCH_DISCONTIGMEM_DEFAULT 897config ARCH_DISCONTIGMEM_DEFAULT
901 def_bool y 898 def_bool y
902 depends on NUMA 899 depends on NUMA && X86_32
900
901config ARCH_SPARSEMEM_DEFAULT
902 def_bool y
903 depends on X86_64
903 904
904config ARCH_SPARSEMEM_ENABLE 905config ARCH_SPARSEMEM_ENABLE
905 def_bool y 906 def_bool y
906 depends on NUMA || (EXPERIMENTAL && (X86_PC || X86_64)) 907 depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC)
907 select SPARSEMEM_STATIC if X86_32 908 select SPARSEMEM_STATIC if X86_32
908 select SPARSEMEM_VMEMMAP_ENABLE if X86_64 909 select SPARSEMEM_VMEMMAP_ENABLE if X86_64
909 910
910config ARCH_SELECT_MEMORY_MODEL 911config ARCH_SELECT_MEMORY_MODEL
911 def_bool y 912 def_bool y
912 depends on X86_32 && ARCH_SPARSEMEM_ENABLE 913 depends on ARCH_SPARSEMEM_ENABLE
913 914
914config ARCH_MEMORY_PROBE 915config ARCH_MEMORY_PROBE
915 def_bool X86_64 916 def_bool X86_64
@@ -987,42 +988,32 @@ config MTRR
987 See <file:Documentation/mtrr.txt> for more information. 988 See <file:Documentation/mtrr.txt> for more information.
988 989
989config EFI 990config EFI
990 bool "Boot from EFI support" 991 def_bool n
991 depends on X86_32 && ACPI 992 prompt "EFI runtime service support"
992 default n 993 depends on ACPI
993 ---help--- 994 ---help---
994 This enables the kernel to boot on EFI platforms using 995 This enables the kernel to use EFI runtime services that are
995 system configuration information passed to it from the firmware.
996 This also enables the kernel to use any EFI runtime services that are
997 available (such as the EFI variable services). 996 available (such as the EFI variable services).
998 997
999 This option is only useful on systems that have EFI firmware 998 This option is only useful on systems that have EFI firmware.
1000 and will result in a kernel image that is ~8k larger. In addition, 999 In addition, you should use the latest ELILO loader available
1001 you must use the latest ELILO loader available at 1000 at <http://elilo.sourceforge.net> in order to take advantage
1002 <http://elilo.sourceforge.net> in order to take advantage of 1001 of EFI runtime services. However, even with this option, the
1003 kernel initialization using EFI information (neither GRUB nor LILO know 1002 resultant kernel should continue to boot on existing non-EFI
1004 anything about EFI). However, even with this option, the resultant 1003 platforms.
1005 kernel should continue to boot on existing non-EFI platforms.
1006 1004
1007config IRQBALANCE 1005config IRQBALANCE
1008 bool "Enable kernel irq balancing" 1006 def_bool y
1007 prompt "Enable kernel irq balancing"
1009 depends on X86_32 && SMP && X86_IO_APIC 1008 depends on X86_32 && SMP && X86_IO_APIC
1010 default y
1011 help 1009 help
1012 The default yes will allow the kernel to do irq load balancing. 1010 The default yes will allow the kernel to do irq load balancing.
1013 Saying no will keep the kernel from doing irq load balancing. 1011 Saying no will keep the kernel from doing irq load balancing.
1014 1012
1015# turning this on wastes a bunch of space.
1016# Summit needs it only when NUMA is on
1017config BOOT_IOREMAP
1018 bool
1019 depends on X86_32 && (((X86_SUMMIT || X86_GENERICARCH) && NUMA) || (X86 && EFI))
1020 default y
1021
1022config SECCOMP 1013config SECCOMP
1023 bool "Enable seccomp to safely compute untrusted bytecode" 1014 def_bool y
1015 prompt "Enable seccomp to safely compute untrusted bytecode"
1024 depends on PROC_FS 1016 depends on PROC_FS
1025 default y
1026 help 1017 help
1027 This kernel feature is useful for number crunching applications 1018 This kernel feature is useful for number crunching applications
1028 that may need to compute untrusted bytecode during their 1019 that may need to compute untrusted bytecode during their
@@ -1189,11 +1180,11 @@ config HOTPLUG_CPU
1189 suspend. 1180 suspend.
1190 1181
1191config COMPAT_VDSO 1182config COMPAT_VDSO
1192 bool "Compat VDSO support" 1183 def_bool y
1193 default y 1184 prompt "Compat VDSO support"
1194 depends on X86_32 1185 depends on X86_32 || IA32_EMULATION
1195 help 1186 help
1196 Map the VDSO to the predictable old-style address too. 1187 Map the 32-bit VDSO to the predictable old-style address too.
1197 ---help--- 1188 ---help---
1198 Say N here if you are running a sufficiently recent glibc 1189 Say N here if you are running a sufficiently recent glibc
1199 version (2.3.3 or later), to remove the high-mapped 1190 version (2.3.3 or later), to remove the high-mapped
@@ -1207,30 +1198,26 @@ config ARCH_ENABLE_MEMORY_HOTPLUG
1207 def_bool y 1198 def_bool y
1208 depends on X86_64 || (X86_32 && HIGHMEM) 1199 depends on X86_64 || (X86_32 && HIGHMEM)
1209 1200
1210config MEMORY_HOTPLUG_RESERVE
1211 def_bool X86_64
1212 depends on (MEMORY_HOTPLUG && DISCONTIGMEM)
1213
1214config HAVE_ARCH_EARLY_PFN_TO_NID 1201config HAVE_ARCH_EARLY_PFN_TO_NID
1215 def_bool X86_64 1202 def_bool X86_64
1216 depends on NUMA 1203 depends on NUMA
1217 1204
1218config OUT_OF_LINE_PFN_TO_PAGE
1219 def_bool X86_64
1220 depends on DISCONTIGMEM
1221
1222menu "Power management options" 1205menu "Power management options"
1223 depends on !X86_VOYAGER 1206 depends on !X86_VOYAGER
1224 1207
1225config ARCH_HIBERNATION_HEADER 1208config ARCH_HIBERNATION_HEADER
1226 bool 1209 def_bool y
1227 depends on X86_64 && HIBERNATION 1210 depends on X86_64 && HIBERNATION
1228 default y
1229 1211
1230source "kernel/power/Kconfig" 1212source "kernel/power/Kconfig"
1231 1213
1232source "drivers/acpi/Kconfig" 1214source "drivers/acpi/Kconfig"
1233 1215
1216config X86_APM_BOOT
1217 bool
1218 default y
1219 depends on APM || APM_MODULE
1220
1234menuconfig APM 1221menuconfig APM
1235 tristate "APM (Advanced Power Management) BIOS support" 1222 tristate "APM (Advanced Power Management) BIOS support"
1236 depends on X86_32 && PM_SLEEP && !X86_VISWS 1223 depends on X86_32 && PM_SLEEP && !X86_VISWS
@@ -1371,7 +1358,7 @@ menu "Bus options (PCI etc.)"
1371config PCI 1358config PCI
1372 bool "PCI support" if !X86_VISWS 1359 bool "PCI support" if !X86_VISWS
1373 depends on !X86_VOYAGER 1360 depends on !X86_VOYAGER
1374 default y if X86_VISWS 1361 default y
1375 select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC) 1362 select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC)
1376 help 1363 help
1377 Find out whether you have a PCI motherboard. PCI is the name of a 1364 Find out whether you have a PCI motherboard. PCI is the name of a
@@ -1418,25 +1405,21 @@ config PCI_GOANY
1418endchoice 1405endchoice
1419 1406
1420config PCI_BIOS 1407config PCI_BIOS
1421 bool 1408 def_bool y
1422 depends on X86_32 && !X86_VISWS && PCI && (PCI_GOBIOS || PCI_GOANY) 1409 depends on X86_32 && !X86_VISWS && PCI && (PCI_GOBIOS || PCI_GOANY)
1423 default y
1424 1410
1425# x86-64 doesn't support PCI BIOS access from long mode so always go direct. 1411# x86-64 doesn't support PCI BIOS access from long mode so always go direct.
1426config PCI_DIRECT 1412config PCI_DIRECT
1427 bool 1413 def_bool y
1428 depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY) || X86_VISWS) 1414 depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY) || X86_VISWS)
1429 default y
1430 1415
1431config PCI_MMCONFIG 1416config PCI_MMCONFIG
1432 bool 1417 def_bool y
1433 depends on X86_32 && PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY) 1418 depends on X86_32 && PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY)
1434 default y
1435 1419
1436config PCI_DOMAINS 1420config PCI_DOMAINS
1437 bool 1421 def_bool y
1438 depends on PCI 1422 depends on PCI
1439 default y
1440 1423
1441config PCI_MMCONFIG 1424config PCI_MMCONFIG
1442 bool "Support mmconfig PCI config space access" 1425 bool "Support mmconfig PCI config space access"
@@ -1453,9 +1436,9 @@ config DMAR
1453 remapping devices. 1436 remapping devices.
1454 1437
1455config DMAR_GFX_WA 1438config DMAR_GFX_WA
1456 bool "Support for Graphics workaround" 1439 def_bool y
1440 prompt "Support for Graphics workaround"
1457 depends on DMAR 1441 depends on DMAR
1458 default y
1459 help 1442 help
1460 Current Graphics drivers tend to use physical address 1443 Current Graphics drivers tend to use physical address
1461 for DMA and avoid using DMA APIs. Setting this config 1444 for DMA and avoid using DMA APIs. Setting this config
@@ -1464,9 +1447,8 @@ config DMAR_GFX_WA
1464 to use physical addresses for DMA. 1447 to use physical addresses for DMA.
1465 1448
1466config DMAR_FLOPPY_WA 1449config DMAR_FLOPPY_WA
1467 bool 1450 def_bool y
1468 depends on DMAR 1451 depends on DMAR
1469 default y
1470 help 1452 help
1471 Floppy disk drivers are known to bypass DMA API calls 1453 Floppy disk drivers are known to bypass DMA API calls
1472 thereby failing to work when IOMMU is enabled. This 1454 thereby failing to work when IOMMU is enabled. This
@@ -1479,8 +1461,7 @@ source "drivers/pci/Kconfig"
1479 1461
1480# x86_64 has no ISA slots, but does have ISA-style DMA. 1462# x86_64 has no ISA slots, but does have ISA-style DMA.
1481config ISA_DMA_API 1463config ISA_DMA_API
1482 bool 1464 def_bool y
1483 default y
1484 1465
1485if X86_32 1466if X86_32
1486 1467
@@ -1546,9 +1527,9 @@ config SCx200HR_TIMER
1546 other workaround is idle=poll boot option. 1527 other workaround is idle=poll boot option.
1547 1528
1548config GEODE_MFGPT_TIMER 1529config GEODE_MFGPT_TIMER
1549 bool "Geode Multi-Function General Purpose Timer (MFGPT) events" 1530 def_bool y
1531 prompt "Geode Multi-Function General Purpose Timer (MFGPT) events"
1550 depends on MGEODE_LX && GENERIC_TIME && GENERIC_CLOCKEVENTS 1532 depends on MGEODE_LX && GENERIC_TIME && GENERIC_CLOCKEVENTS
1551 default y
1552 help 1533 help
1553 This driver provides a clock event source based on the MFGPT 1534 This driver provides a clock event source based on the MFGPT
1554 timer(s) in the CS5535 and CS5536 companion chip for the geode. 1535 timer(s) in the CS5535 and CS5536 companion chip for the geode.
@@ -1575,6 +1556,7 @@ source "fs/Kconfig.binfmt"
1575config IA32_EMULATION 1556config IA32_EMULATION
1576 bool "IA32 Emulation" 1557 bool "IA32 Emulation"
1577 depends on X86_64 1558 depends on X86_64
1559 select COMPAT_BINFMT_ELF
1578 help 1560 help
1579 Include code to run 32-bit programs under a 64-bit kernel. You should 1561 Include code to run 32-bit programs under a 64-bit kernel. You should
1580 likely turn this on, unless you're 100% sure that you don't have any 1562 likely turn this on, unless you're 100% sure that you don't have any
@@ -1587,18 +1569,16 @@ config IA32_AOUT
1587 Support old a.out binaries in the 32bit emulation. 1569 Support old a.out binaries in the 32bit emulation.
1588 1570
1589config COMPAT 1571config COMPAT
1590 bool 1572 def_bool y
1591 depends on IA32_EMULATION 1573 depends on IA32_EMULATION
1592 default y
1593 1574
1594config COMPAT_FOR_U64_ALIGNMENT 1575config COMPAT_FOR_U64_ALIGNMENT
1595 def_bool COMPAT 1576 def_bool COMPAT
1596 depends on X86_64 1577 depends on X86_64
1597 1578
1598config SYSVIPC_COMPAT 1579config SYSVIPC_COMPAT
1599 bool 1580 def_bool y
1600 depends on X86_64 && COMPAT && SYSVIPC 1581 depends on X86_64 && COMPAT && SYSVIPC
1601 default y
1602 1582
1603endmenu 1583endmenu
1604 1584
@@ -1619,4 +1599,6 @@ source "security/Kconfig"
1619 1599
1620source "crypto/Kconfig" 1600source "crypto/Kconfig"
1621 1601
1602source "arch/x86/kvm/Kconfig"
1603
1622source "lib/Kconfig" 1604source "lib/Kconfig"
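
Besides the many bool/default-y to def_bool conversions, the arch/x86/Kconfig hunk above splits SMP support into X86_32_SMP and X86_64_SMP so shared code can select bitness-specific SMP paths without repeating the full dependency expressions. An illustrative use from C (the function is hypothetical):

	void example_smp_setup(void)
	{
	#ifdef CONFIG_X86_32_SMP
		/* 32-bit-only SMP initialization would go here. */
	#endif
	#ifdef CONFIG_X86_64_SMP
		/* 64-bit-only SMP initialization would go here. */
	#endif
	}
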
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index c30162202dc4..e09a6b73a1aa 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -219,10 +219,10 @@ config MGEODEGX1
219 Select this for a Geode GX1 (Cyrix MediaGX) chip. 219 Select this for a Geode GX1 (Cyrix MediaGX) chip.
220 220
221config MGEODE_LX 221config MGEODE_LX
222 bool "Geode GX/LX" 222 bool "Geode GX/LX"
223 depends on X86_32 223 depends on X86_32
224 help 224 help
225 Select this for AMD Geode GX and LX processors. 225 Select this for AMD Geode GX and LX processors.
226 226
227config MCYRIXIII 227config MCYRIXIII
228 bool "CyrixIII/VIA-C3" 228 bool "CyrixIII/VIA-C3"
@@ -258,7 +258,7 @@ config MPSC
258 Optimize for Intel Pentium 4, Pentium D and older Nocona/Dempsey 258 Optimize for Intel Pentium 4, Pentium D and older Nocona/Dempsey
259 Xeon CPUs with Intel 64bit which is compatible with x86-64. 259 Xeon CPUs with Intel 64bit which is compatible with x86-64.
260 Note that the latest Xeons (Xeon 51xx and 53xx) are not based on the 260 Note that the latest Xeons (Xeon 51xx and 53xx) are not based on the
261 Netburst core and shouldn't use this option. You can distinguish them 261 Netburst core and shouldn't use this option. You can distinguish them
262 using the cpu family field 262 using the cpu family field
263 in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one. 263 in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one.
264 264
@@ -317,81 +317,75 @@ config X86_L1_CACHE_SHIFT
317 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7 317 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7
318 318
319config X86_XADD 319config X86_XADD
320 bool 320 def_bool y
321 depends on X86_32 && !M386 321 depends on X86_32 && !M386
322 default y
323 322
324config X86_PPRO_FENCE 323config X86_PPRO_FENCE
325 bool 324 bool "PentiumPro memory ordering errata workaround"
326 depends on M686 || M586MMX || M586TSC || M586 || M486 || M386 || MGEODEGX1 325 depends on M686 || M586MMX || M586TSC || M586 || M486 || M386 || MGEODEGX1
327 default y 326 help
327 Old PentiumPro multiprocessor systems had errata that could cause memory
328 operations to violate the x86 ordering standard in rare cases. Enabling this
329 option will attempt to work around some (but not all) occurrences of
330 this problem, at the cost of much heavier spinlock and memory barrier
331 operations.
332
333 If unsure, say n here. Even distro kernels should think twice before enabling
334 this: there are few systems, and an unlikely bug.
328 335
329config X86_F00F_BUG 336config X86_F00F_BUG
330 bool 337 def_bool y
331 depends on M586MMX || M586TSC || M586 || M486 || M386 338 depends on M586MMX || M586TSC || M586 || M486 || M386
332 default y
333 339
334config X86_WP_WORKS_OK 340config X86_WP_WORKS_OK
335 bool 341 def_bool y
336 depends on X86_32 && !M386 342 depends on X86_32 && !M386
337 default y
338 343
339config X86_INVLPG 344config X86_INVLPG
340 bool 345 def_bool y
341 depends on X86_32 && !M386 346 depends on X86_32 && !M386
342 default y
343 347
344config X86_BSWAP 348config X86_BSWAP
345 bool 349 def_bool y
346 depends on X86_32 && !M386 350 depends on X86_32 && !M386
347 default y
348 351
349config X86_POPAD_OK 352config X86_POPAD_OK
350 bool 353 def_bool y
351 depends on X86_32 && !M386 354 depends on X86_32 && !M386
352 default y
353 355
354config X86_ALIGNMENT_16 356config X86_ALIGNMENT_16
355 bool 357 def_bool y
356 depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 358 depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
357 default y
358 359
359config X86_GOOD_APIC 360config X86_GOOD_APIC
360 bool 361 def_bool y
361 depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON || MCORE2 || MVIAC7 || X86_64 362 depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON || MCORE2 || MVIAC7 || X86_64
362 default y
363 363
364config X86_INTEL_USERCOPY 364config X86_INTEL_USERCOPY
365 bool 365 def_bool y
366 depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 366 depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2
367 default y
368 367
369config X86_USE_PPRO_CHECKSUM 368config X86_USE_PPRO_CHECKSUM
370 bool 369 def_bool y
371 depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2 370 depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2
372 default y
373 371
374config X86_USE_3DNOW 372config X86_USE_3DNOW
375 bool 373 def_bool y
376 depends on (MCYRIXIII || MK7 || MGEODE_LX) && !UML 374 depends on (MCYRIXIII || MK7 || MGEODE_LX) && !UML
377 default y
378 375
379config X86_OOSTORE 376config X86_OOSTORE
380 bool 377 def_bool y
381 depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR 378 depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR
382 default y
383 379
384config X86_TSC 380config X86_TSC
385 bool 381 def_bool y
386 depends on ((MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64 382 depends on ((MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64
387 default y
388 383
389# this should be set for all -march=.. options where the compiler 384# this should be set for all -march=.. options where the compiler
390# generates cmov. 385# generates cmov.
391config X86_CMOV 386config X86_CMOV
392 bool 387 def_bool y
393 depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7) 388 depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7)
394 default y
395 389
396config X86_MINIMUM_CPU_FAMILY 390config X86_MINIMUM_CPU_FAMILY
397 int 391 int
@@ -399,3 +393,6 @@ config X86_MINIMUM_CPU_FAMILY
399 default "4" if X86_32 && (X86_XADD || X86_CMPXCHG || X86_BSWAP || X86_WP_WORKS_OK) 393 default "4" if X86_32 && (X86_XADD || X86_CMPXCHG || X86_BSWAP || X86_WP_WORKS_OK)
400 default "3" 394 default "3"
401 395
396config X86_DEBUGCTLMSR
397 def_bool y
398 depends on !(M586MMX || M586TSC || M586 || M486 || M386)
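
X86_PPRO_FENCE is now a user-visible prompt with the help text above: on the affected PentiumPro SMP systems it makes the SMP memory barriers real fences rather than plain compiler barriers. An illustrative sketch of the kind of conditional the symbol drives (macro name and bodies are assumptions; the real definitions live in the asm-x86 barrier headers):

	#ifdef CONFIG_X86_PPRO_FENCE
	# define example_smp_rmb()	asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
	#else
	# define example_smp_rmb()	asm volatile("" ::: "memory")	/* compiler barrier only */
	#endif
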
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 761ca7b5f120..2e1e3af28c3a 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -6,7 +6,7 @@ config TRACE_IRQFLAGS_SUPPORT
6source "lib/Kconfig.debug" 6source "lib/Kconfig.debug"
7 7
8config EARLY_PRINTK 8config EARLY_PRINTK
9 bool "Early printk" if EMBEDDED && DEBUG_KERNEL && X86_32 9 bool "Early printk" if EMBEDDED
10 default y 10 default y
11 help 11 help
12 Write kernel log output directly into the VGA buffer or to a serial 12 Write kernel log output directly into the VGA buffer or to a serial
@@ -40,22 +40,49 @@ comment "Page alloc debug is incompatible with Software Suspend on i386"
40 40
41config DEBUG_PAGEALLOC 41config DEBUG_PAGEALLOC
42 bool "Debug page memory allocations" 42 bool "Debug page memory allocations"
43 depends on DEBUG_KERNEL && !HIBERNATION && !HUGETLBFS 43 depends on DEBUG_KERNEL && X86_32
44 depends on X86_32
45 help 44 help
46 Unmap pages from the kernel linear mapping after free_pages(). 45 Unmap pages from the kernel linear mapping after free_pages().
47 This results in a large slowdown, but helps to find certain types 46 This results in a large slowdown, but helps to find certain types
48 of memory corruptions. 47 of memory corruptions.
49 48
49config DEBUG_PER_CPU_MAPS
50 bool "Debug access to per_cpu maps"
51 depends on DEBUG_KERNEL
52 depends on X86_64_SMP
53 default n
54 help
55 Say Y to verify that the per_cpu map being accessed has
56 been setup. Adds a fair amount of code to kernel memory
57 and decreases performance.
58
59 Say N if unsure.
60
50config DEBUG_RODATA 61config DEBUG_RODATA
51 bool "Write protect kernel read-only data structures" 62 bool "Write protect kernel read-only data structures"
63 default y
52 depends on DEBUG_KERNEL 64 depends on DEBUG_KERNEL
53 help 65 help
54 Mark the kernel read-only data as write-protected in the pagetables, 66 Mark the kernel read-only data as write-protected in the pagetables,
55 in order to catch accidental (and incorrect) writes to such const 67 in order to catch accidental (and incorrect) writes to such const
56 data. This option may have a slight performance impact because a 68 data. This is recommended so that we can catch kernel bugs sooner.
57 portion of the kernel code won't be covered by a 2MB TLB anymore. 69 If in doubt, say "Y".
58 If in doubt, say "N". 70
71config DEBUG_RODATA_TEST
72 bool "Testcase for the DEBUG_RODATA feature"
73 depends on DEBUG_RODATA
74 help
75 This option enables a testcase for the DEBUG_RODATA
76 feature as well as for the change_page_attr() infrastructure.
77 If in doubt, say "N"
78
79config DEBUG_NX_TEST
80 tristate "Testcase for the NX non-executable stack feature"
81 depends on DEBUG_KERNEL && m
82 help
83 This option enables a testcase for the CPU NX capability
84 and the software setup of this feature.
85 If in doubt, say "N"
59 86
60config 4KSTACKS 87config 4KSTACKS
61 bool "Use 4Kb for kernel stacks instead of 8Kb" 88 bool "Use 4Kb for kernel stacks instead of 8Kb"
@@ -75,8 +102,7 @@ config X86_FIND_SMP_CONFIG
75 102
76config X86_MPPARSE 103config X86_MPPARSE
77 def_bool y 104 def_bool y
78 depends on X86_LOCAL_APIC && !X86_VISWS 105 depends on (X86_32 && (X86_LOCAL_APIC && !X86_VISWS)) || X86_64
79 depends on X86_32
80 106
81config DOUBLEFAULT 107config DOUBLEFAULT
82 default y 108 default y
@@ -112,4 +138,91 @@ config IOMMU_LEAK
112 Add a simple leak tracer to the IOMMU code. This is useful when you 138 Add a simple leak tracer to the IOMMU code. This is useful when you
113 are debugging a buggy device driver that leaks IOMMU mappings. 139 are debugging a buggy device driver that leaks IOMMU mappings.
114 140
141#
142# IO delay types:
143#
144
145config IO_DELAY_TYPE_0X80
146 int
147 default "0"
148
149config IO_DELAY_TYPE_0XED
150 int
151 default "1"
152
153config IO_DELAY_TYPE_UDELAY
154 int
155 default "2"
156
157config IO_DELAY_TYPE_NONE
158 int
159 default "3"
160
161choice
162 prompt "IO delay type"
163 default IO_DELAY_0XED
164
165config IO_DELAY_0X80
166 bool "port 0x80 based port-IO delay [recommended]"
167 help
168 This is the traditional Linux IO delay used for in/out_p.
169 It is the most tested and hence the safest selection here.
170
171config IO_DELAY_0XED
172 bool "port 0xed based port-IO delay"
173 help
174 Use port 0xed as the IO delay. This frees up port 0x80 which is
175 often used as a hardware-debug port.
176
177config IO_DELAY_UDELAY
178 bool "udelay based port-IO delay"
179 help
180 Use udelay(2) as the IO delay method. This provides the delay
181 while not having any side-effect on the IO port space.
182
183config IO_DELAY_NONE
184 bool "no port-IO delay"
185 help
186 No port-IO delay. Will break on old boxes that require port-IO
187 delay for certain operations. Should work on most new machines.
188
189endchoice
190
191if IO_DELAY_0X80
192config DEFAULT_IO_DELAY_TYPE
193 int
194 default IO_DELAY_TYPE_0X80
195endif
196
197if IO_DELAY_0XED
198config DEFAULT_IO_DELAY_TYPE
199 int
200 default IO_DELAY_TYPE_0XED
201endif
202
203if IO_DELAY_UDELAY
204config DEFAULT_IO_DELAY_TYPE
205 int
206 default IO_DELAY_TYPE_UDELAY
207endif
208
209if IO_DELAY_NONE
210config DEFAULT_IO_DELAY_TYPE
211 int
212 default IO_DELAY_TYPE_NONE
213endif
214
215config DEBUG_BOOT_PARAMS
216 bool "Debug boot parameters"
217 depends on DEBUG_KERNEL
218 depends on DEBUG_FS
219 help
220 This option will cause struct boot_params to be exported via debugfs.
221
222config CPA_DEBUG
223 bool "CPA self test code"
224 depends on DEBUG_KERNEL
225 help
226 Do change_page_attr self tests at boot.
227
115endmenu 228endmenu
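
The IO_DELAY_* block above gives each port-IO delay method a numeric IO_DELAY_TYPE_* ID and selects one of them as DEFAULT_IO_DELAY_TYPE, which lets a run-time dispatcher default to the configured method while still allowing it to be overridden. A hedged sketch of such a dispatcher (names are illustrative; the actual implementation in this series may differ in detail):

	static int example_io_delay_type = CONFIG_DEFAULT_IO_DELAY_TYPE;

	void example_io_delay(void)
	{
		switch (example_io_delay_type) {
		case CONFIG_IO_DELAY_TYPE_0X80:
			asm volatile("outb %al, $0x80");	/* traditional delay port */
			break;
		case CONFIG_IO_DELAY_TYPE_0XED:
			asm volatile("outb %al, $0xed");	/* leaves 0x80 free for debug cards */
			break;
		case CONFIG_IO_DELAY_TYPE_UDELAY:
			udelay(2);				/* from linux/delay.h; no port-space side effects */
			break;
		case CONFIG_IO_DELAY_TYPE_NONE:
		default:
			break;					/* no delay at all */
		}
	}
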
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 7aa1dc6d67c8..da8f4129780b 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -7,13 +7,254 @@ else
7 KBUILD_DEFCONFIG := $(ARCH)_defconfig 7 KBUILD_DEFCONFIG := $(ARCH)_defconfig
8endif 8endif
9 9
10# No need to remake these files 10core-$(CONFIG_KVM) += arch/x86/kvm/
11$(srctree)/arch/x86/Makefile%: ; 11
12# BITS is used as extension for files which are available in a 32 bit
13# and a 64 bit version to simplify shared Makefiles.
14# e.g.: obj-y += foo_$(BITS).o
15export BITS
12 16
13ifeq ($(CONFIG_X86_32),y) 17ifeq ($(CONFIG_X86_32),y)
18 BITS := 32
14 UTS_MACHINE := i386 19 UTS_MACHINE := i386
15 include $(srctree)/arch/x86/Makefile_32 20 CHECKFLAGS += -D__i386__
21
22 biarch := $(call cc-option,-m32)
23 KBUILD_AFLAGS += $(biarch)
24 KBUILD_CFLAGS += $(biarch)
25
26 ifdef CONFIG_RELOCATABLE
27 LDFLAGS_vmlinux := --emit-relocs
28 endif
29
30 KBUILD_CFLAGS += -msoft-float -mregparm=3 -freg-struct-return
31
32 # prevent gcc from keeping the stack 16 byte aligned
33 KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2)
34
35 # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use
36 # a lot more stack due to the lack of sharing of stacklots:
37 KBUILD_CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then \
38 echo $(call cc-option,-fno-unit-at-a-time); fi ;)
39
40 # CPU-specific tuning. Anything which can be shared with UML should go here.
41 include $(srctree)/arch/x86/Makefile_32.cpu
42 KBUILD_CFLAGS += $(cflags-y)
43
44 # temporary until string.h is fixed
45 KBUILD_CFLAGS += -ffreestanding
16else 46else
47 BITS := 64
17 UTS_MACHINE := x86_64 48 UTS_MACHINE := x86_64
18 include $(srctree)/arch/x86/Makefile_64 49 CHECKFLAGS += -D__x86_64__ -m64
50
51 KBUILD_AFLAGS += -m64
52 KBUILD_CFLAGS += -m64
53
54 # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu)
55 cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
56 cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
57
58 cflags-$(CONFIG_MCORE2) += \
59 $(call cc-option,-march=core2,$(call cc-option,-mtune=generic))
60 cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic)
61 KBUILD_CFLAGS += $(cflags-y)
62
63 KBUILD_CFLAGS += -mno-red-zone
64 KBUILD_CFLAGS += -mcmodel=kernel
65
66 # -funit-at-a-time shrinks the kernel .text considerably
67 # unfortunately it makes reading oopses harder.
68 KBUILD_CFLAGS += $(call cc-option,-funit-at-a-time)
69
70 # this works around some issues with generating unwind tables in older gccs
71 # newer gccs do it by default
72 KBUILD_CFLAGS += -maccumulate-outgoing-args
73
74 stackp := $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh
75 stackp-$(CONFIG_CC_STACKPROTECTOR) := $(shell $(stackp) \
76 "$(CC)" -fstack-protector )
77 stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(stackp) \
78 "$(CC)" -fstack-protector-all )
79
80 KBUILD_CFLAGS += $(stackp-y)
19endif 81endif
82
83# The stack pointer is addressed differently for 32-bit and 64-bit x86
84sp-$(CONFIG_X86_32) := esp
85sp-$(CONFIG_X86_64) := rsp
86
87# does binutils support CFI?
88cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_endproc,-DCONFIG_AS_CFI=1)
89# is .cfi_signal_frame supported too?
90cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1)
91KBUILD_AFLAGS += $(cfi) $(cfi-sigframe)
92KBUILD_CFLAGS += $(cfi) $(cfi-sigframe)
93
94LDFLAGS := -m elf_$(UTS_MACHINE)
95OBJCOPYFLAGS := -O binary -R .note -R .comment -S
96
97# Speed up the build
98KBUILD_CFLAGS += -pipe
99# Workaround for a gcc prerelease that unfortunately was shipped in a SUSE release
100KBUILD_CFLAGS += -Wno-sign-compare
101#
102KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
103# prevent gcc from generating any FP code by mistake
104KBUILD_CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
105
106###
107# Sub architecture support
108# fcore-y is linked before mcore-y files.
109
110# Default subarch .c files
111mcore-y := arch/x86/mach-default/
112
113# Voyager subarch support
114mflags-$(CONFIG_X86_VOYAGER) := -Iinclude/asm-x86/mach-voyager
115mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager/
116
117# VISWS subarch support
118mflags-$(CONFIG_X86_VISWS) := -Iinclude/asm-x86/mach-visws
119mcore-$(CONFIG_X86_VISWS) := arch/x86/mach-visws/
120
121# NUMAQ subarch support
122mflags-$(CONFIG_X86_NUMAQ) := -Iinclude/asm-x86/mach-numaq
123mcore-$(CONFIG_X86_NUMAQ) := arch/x86/mach-default/
124
125# BIGSMP subarch support
126mflags-$(CONFIG_X86_BIGSMP) := -Iinclude/asm-x86/mach-bigsmp
127mcore-$(CONFIG_X86_BIGSMP) := arch/x86/mach-default/
128
129#Summit subarch support
130mflags-$(CONFIG_X86_SUMMIT) := -Iinclude/asm-x86/mach-summit
131mcore-$(CONFIG_X86_SUMMIT) := arch/x86/mach-default/
132
133# generic subarchitecture
134mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic
135fcore-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/
136mcore-$(CONFIG_X86_GENERICARCH) := arch/x86/mach-default/
137
138
139# ES7000 subarch support
140mflags-$(CONFIG_X86_ES7000) := -Iinclude/asm-x86/mach-es7000
141fcore-$(CONFIG_X86_ES7000) := arch/x86/mach-es7000/
142mcore-$(CONFIG_X86_ES7000) := arch/x86/mach-default/
143
144# RDC R-321x subarch support
145mflags-$(CONFIG_X86_RDC321X) := -Iinclude/asm-x86/mach-rdc321x
146mcore-$(CONFIG_X86_RDC321X) := arch/x86/mach-default
147core-$(CONFIG_X86_RDC321X) += arch/x86/mach-rdc321x/
148
149# default subarch .h files
150mflags-y += -Iinclude/asm-x86/mach-default
151
152# 64 bit does not have subarch support - clear the subarch variables
153fcore-$(CONFIG_X86_64) :=
154mcore-$(CONFIG_X86_64) :=
155mflags-$(CONFIG_X86_64) :=
156
157KBUILD_CFLAGS += $(mflags-y)
158KBUILD_AFLAGS += $(mflags-y)
159
160###
161# Kernel objects
162
163head-y := arch/x86/kernel/head_$(BITS).o
164head-$(CONFIG_X86_64) += arch/x86/kernel/head64.o
165head-y += arch/x86/kernel/init_task.o
166
167libs-y += arch/x86/lib/
168
169# Sub architecture files that needs linking first
170core-y += $(fcore-y)
171
172# Xen paravirtualization support
173core-$(CONFIG_XEN) += arch/x86/xen/
174
175# lguest paravirtualization support
176core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/
177
178core-y += arch/x86/kernel/
179core-y += arch/x86/mm/
180
181# Remaining sub architecture files
182core-y += $(mcore-y)
183
184core-y += arch/x86/crypto/
185core-y += arch/x86/vdso/
186core-$(CONFIG_IA32_EMULATION) += arch/x86/ia32/
187
188# drivers-y are linked after core-y
189drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/
190drivers-$(CONFIG_PCI) += arch/x86/pci/
191
192# must be linked after kernel/
193drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/
194
195ifeq ($(CONFIG_X86_32),y)
196drivers-$(CONFIG_PM) += arch/x86/power/
197drivers-$(CONFIG_FB) += arch/x86/video/
198endif
199
200####
201# boot loader support. Several targets are kept for legacy purposes
202
203boot := arch/x86/boot
204
205PHONY += zImage bzImage compressed zlilo bzlilo \
206 zdisk bzdisk fdimage fdimage144 fdimage288 isoimage install
207
208# Default kernel to build
209all: bzImage
210
211# KBUILD_IMAGE specifies the target image being built
212 KBUILD_IMAGE := $(boot)/bzImage
213zImage zlilo zdisk: KBUILD_IMAGE := arch/x86/boot/zImage
214
215zImage bzImage: vmlinux
216 $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
217 $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
218 $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/bzImage
219
220compressed: zImage
221
222zlilo bzlilo: vmlinux
223 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zlilo
224
225zdisk bzdisk: vmlinux
226 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zdisk
227
228fdimage fdimage144 fdimage288 isoimage: vmlinux
229 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) $@
230
231install: vdso_install
232 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install
233
234PHONY += vdso_install
235vdso_install:
236 $(Q)$(MAKE) $(build)=arch/x86/vdso $@
237
238archclean:
239 $(Q)rm -rf $(objtree)/arch/i386
240 $(Q)rm -rf $(objtree)/arch/x86_64
241 $(Q)$(MAKE) $(clean)=$(boot)
242
243define archhelp
244 echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)'
245 echo ' install - Install kernel using'
246 echo ' (your) ~/bin/installkernel or'
247 echo ' (distribution) /sbin/installkernel or'
248 echo ' install to $$(INSTALL_PATH) and run lilo'
249 echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
250 echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
251 echo ' fdimage288 - Create 2.8MB boot floppy image (arch/x86/boot/fdimage)'
252 echo ' isoimage - Create a boot CD-ROM image (arch/x86/boot/image.iso)'
253 echo ' bzdisk/fdimage*/isoimage also accept:'
254 echo ' FDARGS="..." arguments for the booted kernel'
255 echo ' FDINITRD=file initrd for the booted kernel'
256endef
257
258CLEAN_FILES += arch/x86/boot/fdimage \
259 arch/x86/boot/image.iso \
260 arch/x86/boot/mtools.conf
diff --git a/arch/x86/Makefile_32 b/arch/x86/Makefile_32
deleted file mode 100644
index 50394da2f6c1..000000000000
--- a/arch/x86/Makefile_32
+++ /dev/null
@@ -1,175 +0,0 @@
1#
2# i386 Makefile
3#
4# This file is included by the global makefile so that you can add your own
5# architecture-specific flags and dependencies. Remember to have actions
6# for "archclean" cleaning up for this architecture.
7#
8# This file is subject to the terms and conditions of the GNU General Public
9# License. See the file "COPYING" in the main directory of this archive
10# for more details.
11#
12# Copyright (C) 1994 by Linus Torvalds
13#
14# 19990713 Artur Skawina <skawina@geocities.com>
15# Added '-march' and '-mpreferred-stack-boundary' support
16#
17# 20050320 Kianusch Sayah Karadji <kianusch@sk-tech.net>
18# Added support for GEODE CPU
19
20# BITS is used as extension for files which are available in a 32 bit
21# and a 64 bit version to simplify shared Makefiles.
22# e.g.: obj-y += foo_$(BITS).o
23BITS := 32
24export BITS
25
26HAS_BIARCH := $(call cc-option-yn, -m32)
27ifeq ($(HAS_BIARCH),y)
28AS := $(AS) --32
29LD := $(LD) -m elf_i386
30CC := $(CC) -m32
31endif
32
33LDFLAGS := -m elf_i386
34OBJCOPYFLAGS := -O binary -R .note -R .comment -S
35ifdef CONFIG_RELOCATABLE
36LDFLAGS_vmlinux := --emit-relocs
37endif
38CHECKFLAGS += -D__i386__
39
40KBUILD_CFLAGS += -pipe -msoft-float -mregparm=3 -freg-struct-return
41
42# prevent gcc from keeping the stack 16 byte aligned
43KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2)
44
45# CPU-specific tuning. Anything which can be shared with UML should go here.
46include $(srctree)/arch/x86/Makefile_32.cpu
47
48# temporary until string.h is fixed
49cflags-y += -ffreestanding
50
51# this works around some issues with generating unwind tables in older gccs
52# newer gccs do it by default
53cflags-y += -maccumulate-outgoing-args
54
55# Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use
56# a lot more stack due to the lack of sharing of stacklots:
57KBUILD_CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then echo $(call cc-option,-fno-unit-at-a-time); fi ;)
58
59# do binutils support CFI?
60cflags-y += $(call as-instr,.cfi_startproc\n.cfi_rel_offset esp${comma}0\n.cfi_endproc,-DCONFIG_AS_CFI=1,)
61KBUILD_AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_rel_offset esp${comma}0\n.cfi_endproc,-DCONFIG_AS_CFI=1,)
62
63# is .cfi_signal_frame supported too?
64cflags-y += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,)
65KBUILD_AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,)
66
67KBUILD_CFLAGS += $(cflags-y)
68
69# Default subarch .c files
70mcore-y := arch/x86/mach-default
71
72# Voyager subarch support
73mflags-$(CONFIG_X86_VOYAGER) := -Iinclude/asm-x86/mach-voyager
74mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager
75
76# VISWS subarch support
77mflags-$(CONFIG_X86_VISWS) := -Iinclude/asm-x86/mach-visws
78mcore-$(CONFIG_X86_VISWS) := arch/x86/mach-visws
79
80# NUMAQ subarch support
81mflags-$(CONFIG_X86_NUMAQ) := -Iinclude/asm-x86/mach-numaq
82mcore-$(CONFIG_X86_NUMAQ) := arch/x86/mach-default
83
84# BIGSMP subarch support
85mflags-$(CONFIG_X86_BIGSMP) := -Iinclude/asm-x86/mach-bigsmp
86mcore-$(CONFIG_X86_BIGSMP) := arch/x86/mach-default
87
88#Summit subarch support
89mflags-$(CONFIG_X86_SUMMIT) := -Iinclude/asm-x86/mach-summit
90mcore-$(CONFIG_X86_SUMMIT) := arch/x86/mach-default
91
92# generic subarchitecture
93mflags-$(CONFIG_X86_GENERICARCH) := -Iinclude/asm-x86/mach-generic
94mcore-$(CONFIG_X86_GENERICARCH) := arch/x86/mach-default
95core-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/
96
97# ES7000 subarch support
98mflags-$(CONFIG_X86_ES7000) := -Iinclude/asm-x86/mach-es7000
99mcore-$(CONFIG_X86_ES7000) := arch/x86/mach-default
100core-$(CONFIG_X86_ES7000) := arch/x86/mach-es7000/
101
102# Xen paravirtualization support
103core-$(CONFIG_XEN) += arch/x86/xen/
104
105# lguest paravirtualization support
106core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/
107
108# default subarch .h files
109mflags-y += -Iinclude/asm-x86/mach-default
110
111head-y := arch/x86/kernel/head_32.o arch/x86/kernel/init_task.o
112
113libs-y += arch/x86/lib/
114core-y += arch/x86/kernel/ \
115 arch/x86/mm/ \
116 $(mcore-y)/ \
117 arch/x86/crypto/
118drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/
119drivers-$(CONFIG_PCI) += arch/x86/pci/
120# must be linked after kernel/
121drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/
122drivers-$(CONFIG_PM) += arch/x86/power/
123drivers-$(CONFIG_FB) += arch/x86/video/
124
125KBUILD_CFLAGS += $(mflags-y)
126KBUILD_AFLAGS += $(mflags-y)
127
128boot := arch/x86/boot
129
130PHONY += zImage bzImage compressed zlilo bzlilo \
131 zdisk bzdisk fdimage fdimage144 fdimage288 isoimage install
132
133all: bzImage
134
135# KBUILD_IMAGE specify target image being built
136 KBUILD_IMAGE := $(boot)/bzImage
137zImage zlilo zdisk: KBUILD_IMAGE := arch/x86/boot/zImage
138
139zImage bzImage: vmlinux
140 $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
141 $(Q)mkdir -p $(objtree)/arch/i386/boot
142 $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/i386/boot/bzImage
143
144compressed: zImage
145
146zlilo bzlilo: vmlinux
147 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zlilo
148
149zdisk bzdisk: vmlinux
150 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zdisk
151
152fdimage fdimage144 fdimage288 isoimage: vmlinux
153 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) $@
154
155install:
156 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install
157
158archclean:
159 $(Q)rm -rf $(objtree)/arch/i386/boot
160 $(Q)$(MAKE) $(clean)=arch/x86/boot
161
162define archhelp
163 echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)'
164 echo ' install - Install kernel using'
165 echo ' (your) ~/bin/installkernel or'
166 echo ' (distribution) /sbin/installkernel or'
167 echo ' install to $$(INSTALL_PATH) and run lilo'
168 echo ' bzdisk - Create a boot floppy in /dev/fd0'
169 echo ' fdimage - Create a boot floppy image'
170 echo ' isoimage - Create a boot CD-ROM image'
171endef
172
173CLEAN_FILES += arch/x86/boot/fdimage \
174 arch/x86/boot/image.iso \
175 arch/x86/boot/mtools.conf
diff --git a/arch/x86/Makefile_64 b/arch/x86/Makefile_64
deleted file mode 100644
index a804860022e6..000000000000
--- a/arch/x86/Makefile_64
+++ /dev/null
@@ -1,144 +0,0 @@
1#
2# x86_64 Makefile
3#
4# This file is included by the global makefile so that you can add your own
5# architecture-specific flags and dependencies. Remember to have actions
6# for "archclean" and "archdep" for cleaning up and making dependencies for
7# this architecture
8#
9# This file is subject to the terms and conditions of the GNU General Public
10# License. See the file "COPYING" in the main directory of this archive
11# for more details.
12#
13# Copyright (C) 1994 by Linus Torvalds
14#
15# 19990713 Artur Skawina <skawina@geocities.com>
16# Added '-march' and '-mpreferred-stack-boundary' support
17# 20000913 Pavel Machek <pavel@suse.cz>
18# Converted for x86_64 architecture
19# 20010105 Andi Kleen, add IA32 compiler.
20# ....and later removed it again....
21#
22# $Id: Makefile,v 1.31 2002/03/22 15:56:07 ak Exp $
23
24# BITS is used as extension for files which are available in a 32 bit
25# and a 64 bit version to simplify shared Makefiles.
26# e.g.: obj-y += foo_$(BITS).o
27BITS := 64
28export BITS
29
30LDFLAGS := -m elf_x86_64
31OBJCOPYFLAGS := -O binary -R .note -R .comment -S
32LDFLAGS_vmlinux :=
33CHECKFLAGS += -D__x86_64__ -m64
34
35cflags-y :=
36cflags-kernel-y :=
37cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
38cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
39# gcc doesn't support -march=core2 yet as of gcc 4.3, but I hope it
40# will eventually. Use -mtune=generic as fallback
41cflags-$(CONFIG_MCORE2) += \
42 $(call cc-option,-march=core2,$(call cc-option,-mtune=generic))
43cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic)
44
45cflags-y += -m64
46cflags-y += -mno-red-zone
47cflags-y += -mcmodel=kernel
48cflags-y += -pipe
49cflags-y += -Wno-sign-compare
50cflags-y += -fno-asynchronous-unwind-tables
51ifneq ($(CONFIG_DEBUG_INFO),y)
52# -fweb shrinks the kernel a bit, but the difference is very small
53# it also messes up debugging, so don't use it for now.
54#cflags-y += $(call cc-option,-fweb)
55endif
56# -funit-at-a-time shrinks the kernel .text considerably
57# unfortunately it makes reading oopses harder.
58cflags-y += $(call cc-option,-funit-at-a-time)
59# prevent gcc from generating any FP code by mistake
60cflags-y += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
61# this works around some issues with generating unwind tables in older gccs
62# newer gccs do it by default
63cflags-y += -maccumulate-outgoing-args
64
65# do binutils support CFI?
66cflags-y += $(call as-instr,.cfi_startproc\n.cfi_rel_offset rsp${comma}0\n.cfi_endproc,-DCONFIG_AS_CFI=1,)
67KBUILD_AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_rel_offset rsp${comma}0\n.cfi_endproc,-DCONFIG_AS_CFI=1,)
68
69# is .cfi_signal_frame supported too?
70cflags-y += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,)
71KBUILD_AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,)
72
73cflags-$(CONFIG_CC_STACKPROTECTOR) += $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh "$(CC)" -fstack-protector )
74cflags-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh "$(CC)" -fstack-protector-all )
75
76KBUILD_CFLAGS += $(cflags-y)
77CFLAGS_KERNEL += $(cflags-kernel-y)
78KBUILD_AFLAGS += -m64
79
80head-y := arch/x86/kernel/head_64.o arch/x86/kernel/head64.o arch/x86/kernel/init_task.o
81
82libs-y += arch/x86/lib/
83core-y += arch/x86/kernel/ \
84 arch/x86/mm/ \
85 arch/x86/crypto/ \
86 arch/x86/vdso/
87core-$(CONFIG_IA32_EMULATION) += arch/x86/ia32/
88drivers-$(CONFIG_PCI) += arch/x86/pci/
89drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/
90
91boot := arch/x86/boot
92
93PHONY += bzImage bzlilo install archmrproper \
94 fdimage fdimage144 fdimage288 isoimage archclean
95
96#Default target when executing "make"
97all: bzImage
98
99BOOTIMAGE := arch/x86/boot/bzImage
100KBUILD_IMAGE := $(BOOTIMAGE)
101
102bzImage: vmlinux
103 $(Q)$(MAKE) $(build)=$(boot) $(BOOTIMAGE)
104 $(Q)mkdir -p $(objtree)/arch/x86_64/boot
105 $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/x86_64/boot/bzImage
106
107bzlilo: vmlinux
108 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) zlilo
109
110bzdisk: vmlinux
111 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) zdisk
112
113fdimage fdimage144 fdimage288 isoimage: vmlinux
114 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) $@
115
116install: vdso_install
117 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) $@
118
119vdso_install:
120ifeq ($(CONFIG_IA32_EMULATION),y)
121 $(Q)$(MAKE) $(build)=arch/x86/ia32 $@
122endif
123 $(Q)$(MAKE) $(build)=arch/x86/vdso $@
124
125archclean:
126 $(Q)rm -rf $(objtree)/arch/x86_64/boot
127 $(Q)$(MAKE) $(clean)=$(boot)
128
129define archhelp
130 echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)'
131 echo ' install - Install kernel using'
132 echo ' (your) ~/bin/installkernel or'
133 echo ' (distribution) /sbin/installkernel or'
134 echo ' install to $$(INSTALL_PATH) and run lilo'
135 echo ' bzdisk - Create a boot floppy in /dev/fd0'
136 echo ' fdimage - Create a boot floppy image'
137 echo ' isoimage - Create a boot CD-ROM image'
138endef
139
140CLEAN_FILES += arch/x86/boot/fdimage \
141 arch/x86/boot/image.iso \
142 arch/x86/boot/mtools.conf
143
144
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 7a3116ccf387..349b81a39c40 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -28,9 +28,11 @@ SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
28targets := vmlinux.bin setup.bin setup.elf zImage bzImage 28targets := vmlinux.bin setup.bin setup.elf zImage bzImage
29subdir- := compressed 29subdir- := compressed
30 30
31setup-y += a20.o apm.o cmdline.o copy.o cpu.o cpucheck.o edd.o 31setup-y += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o
32setup-y += header.o main.o mca.o memory.o pm.o pmjump.o 32setup-y += header.o main.o mca.o memory.o pm.o pmjump.o
33setup-y += printf.o string.o tty.o video.o version.o voyager.o 33setup-y += printf.o string.o tty.o video.o version.o
34setup-$(CONFIG_X86_APM_BOOT) += apm.o
35setup-$(CONFIG_X86_VOYAGER) += voyager.o
34 36
35# The link order of the video-*.o modules can matter. In particular, 37# The link order of the video-*.o modules can matter. In particular,
36# video-vga.o *must* be listed first, followed by video-vesa.o. 38# video-vga.o *must* be listed first, followed by video-vesa.o.
@@ -49,10 +51,7 @@ HOSTCFLAGS_build.o := $(LINUXINCLUDE)
49 51
50# How to compile the 16-bit code. Note we always compile for -march=i386, 52# How to compile the 16-bit code. Note we always compile for -march=i386,
51# that way we can complain to the user if the CPU is insufficient. 53# that way we can complain to the user if the CPU is insufficient.
52cflags-$(CONFIG_X86_32) :=
53cflags-$(CONFIG_X86_64) := -m32
54KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \ 54KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
55 $(cflags-y) \
56 -Wall -Wstrict-prototypes \ 55 -Wall -Wstrict-prototypes \
57 -march=i386 -mregparm=3 \ 56 -march=i386 -mregparm=3 \
58 -include $(srctree)/$(src)/code16gcc.h \ 57 -include $(srctree)/$(src)/code16gcc.h \
@@ -62,6 +61,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
62 $(call cc-option, -fno-unit-at-a-time)) \ 61 $(call cc-option, -fno-unit-at-a-time)) \
63 $(call cc-option, -fno-stack-protector) \ 62 $(call cc-option, -fno-stack-protector) \
64 $(call cc-option, -mpreferred-stack-boundary=2) 63 $(call cc-option, -mpreferred-stack-boundary=2)
64KBUILD_CFLAGS += $(call cc-option,-m32)
65KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ 65KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
66 66
67$(obj)/zImage: IMAGE_OFFSET := 0x1000 67$(obj)/zImage: IMAGE_OFFSET := 0x1000
diff --git a/arch/x86/boot/apm.c b/arch/x86/boot/apm.c
index eab50c55a3a5..c117c7fb859c 100644
--- a/arch/x86/boot/apm.c
+++ b/arch/x86/boot/apm.c
@@ -19,8 +19,6 @@
19 19
20#include "boot.h" 20#include "boot.h"
21 21
22#if defined(CONFIG_APM) || defined(CONFIG_APM_MODULE)
23
24int query_apm_bios(void) 22int query_apm_bios(void)
25{ 23{
26 u16 ax, bx, cx, dx, di; 24 u16 ax, bx, cx, dx, di;
@@ -95,4 +93,3 @@ int query_apm_bios(void)
95 return 0; 93 return 0;
96} 94}
97 95
98#endif
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index d2b5adf46512..7822a4983da2 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -109,7 +109,7 @@ typedef unsigned int addr_t;
109static inline u8 rdfs8(addr_t addr) 109static inline u8 rdfs8(addr_t addr)
110{ 110{
111 u8 v; 111 u8 v;
112 asm volatile("movb %%fs:%1,%0" : "=r" (v) : "m" (*(u8 *)addr)); 112 asm volatile("movb %%fs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr));
113 return v; 113 return v;
114} 114}
115static inline u16 rdfs16(addr_t addr) 115static inline u16 rdfs16(addr_t addr)
@@ -127,21 +127,21 @@ static inline u32 rdfs32(addr_t addr)
127 127
128static inline void wrfs8(u8 v, addr_t addr) 128static inline void wrfs8(u8 v, addr_t addr)
129{ 129{
130 asm volatile("movb %1,%%fs:%0" : "+m" (*(u8 *)addr) : "r" (v)); 130 asm volatile("movb %1,%%fs:%0" : "+m" (*(u8 *)addr) : "qi" (v));
131} 131}
132static inline void wrfs16(u16 v, addr_t addr) 132static inline void wrfs16(u16 v, addr_t addr)
133{ 133{
134 asm volatile("movw %1,%%fs:%0" : "+m" (*(u16 *)addr) : "r" (v)); 134 asm volatile("movw %1,%%fs:%0" : "+m" (*(u16 *)addr) : "ri" (v));
135} 135}
136static inline void wrfs32(u32 v, addr_t addr) 136static inline void wrfs32(u32 v, addr_t addr)
137{ 137{
138 asm volatile("movl %1,%%fs:%0" : "+m" (*(u32 *)addr) : "r" (v)); 138 asm volatile("movl %1,%%fs:%0" : "+m" (*(u32 *)addr) : "ri" (v));
139} 139}
140 140
141static inline u8 rdgs8(addr_t addr) 141static inline u8 rdgs8(addr_t addr)
142{ 142{
143 u8 v; 143 u8 v;
144 asm volatile("movb %%gs:%1,%0" : "=r" (v) : "m" (*(u8 *)addr)); 144 asm volatile("movb %%gs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr));
145 return v; 145 return v;
146} 146}
147static inline u16 rdgs16(addr_t addr) 147static inline u16 rdgs16(addr_t addr)
@@ -159,15 +159,15 @@ static inline u32 rdgs32(addr_t addr)
159 159
160static inline void wrgs8(u8 v, addr_t addr) 160static inline void wrgs8(u8 v, addr_t addr)
161{ 161{
162 asm volatile("movb %1,%%gs:%0" : "+m" (*(u8 *)addr) : "r" (v)); 162 asm volatile("movb %1,%%gs:%0" : "+m" (*(u8 *)addr) : "qi" (v));
163} 163}
164static inline void wrgs16(u16 v, addr_t addr) 164static inline void wrgs16(u16 v, addr_t addr)
165{ 165{
166 asm volatile("movw %1,%%gs:%0" : "+m" (*(u16 *)addr) : "r" (v)); 166 asm volatile("movw %1,%%gs:%0" : "+m" (*(u16 *)addr) : "ri" (v));
167} 167}
168static inline void wrgs32(u32 v, addr_t addr) 168static inline void wrgs32(u32 v, addr_t addr)
169{ 169{
170 asm volatile("movl %1,%%gs:%0" : "+m" (*(u32 *)addr) : "r" (v)); 170 asm volatile("movl %1,%%gs:%0" : "+m" (*(u32 *)addr) : "ri" (v));
171} 171}
172 172
173/* Note: these only return true/false, not a signed return value! */ 173/* Note: these only return true/false, not a signed return value! */
@@ -241,6 +241,7 @@ int query_apm_bios(void);
241 241
242/* cmdline.c */ 242/* cmdline.c */
243int cmdline_find_option(const char *option, char *buffer, int bufsize); 243int cmdline_find_option(const char *option, char *buffer, int bufsize);
244int cmdline_find_option_bool(const char *option);
244 245
245/* cpu.c, cpucheck.c */ 246/* cpu.c, cpucheck.c */
246int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr); 247int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr);
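
The constraint changes in this hunk swap "r" for "q" on the byte loads and allow immediates ("qi"/"ri") on the stores. On 32-bit x86 a plain "r" can hand gcc %esi or %edi, which have no 8-bit subregister, and the movb then fails to assemble; "q" restricts the choice to registers with byte forms. A minimal standalone sketch of the load side, assuming a GCC-compatible compiler (load_byte is an invented name, not a kernel helper):

/* Illustrative only: not the kernel's rdfs8()/rdgs8(), just the constraint. */
static unsigned char load_byte(const unsigned char *p)
{
	unsigned char v;

	/* "=q": on 32-bit x86 only %eax..%edx qualify, so movb can encode it */
	asm("movb %1,%0" : "=q" (v) : "m" (*p));
	return v;
}
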
diff --git a/arch/x86/boot/cmdline.c b/arch/x86/boot/cmdline.c
index 34bb778c4357..680408a0f463 100644
--- a/arch/x86/boot/cmdline.c
+++ b/arch/x86/boot/cmdline.c
@@ -95,3 +95,68 @@ int cmdline_find_option(const char *option, char *buffer, int bufsize)
95 95
96 return len; 96 return len;
97} 97}
98
99/*
100 * Find a boolean option (like quiet,noapic,nosmp....)
101 *
102 * Returns the position of that option (starts counting with 1)
103 * or 0 on not found
104 */
105int cmdline_find_option_bool(const char *option)
106{
107 u32 cmdline_ptr = boot_params.hdr.cmd_line_ptr;
108 addr_t cptr;
109 char c;
110 int pos = 0, wstart = 0;
111 const char *opptr = NULL;
112 enum {
113 st_wordstart, /* Start of word/after whitespace */
114 st_wordcmp, /* Comparing this word */
115 st_wordskip, /* Miscompare, skip */
116 } state = st_wordstart;
117
118 if (!cmdline_ptr || cmdline_ptr >= 0x100000)
119 return -1; /* No command line, or inaccessible */
120
121 cptr = cmdline_ptr & 0xf;
122 set_fs(cmdline_ptr >> 4);
123
124 while (cptr < 0x10000) {
125 c = rdfs8(cptr++);
126 pos++;
127
128 switch (state) {
129 case st_wordstart:
130 if (!c)
131 return 0;
132 else if (myisspace(c))
133 break;
134
135 state = st_wordcmp;
136 opptr = option;
137 wstart = pos;
138 /* fall through */
139
140 case st_wordcmp:
141 if (!*opptr)
142 if (!c || myisspace(c))
143 return wstart;
144 else
145 state = st_wordskip;
146 else if (!c)
147 return 0;
148 else if (c != *opptr++)
149 state = st_wordskip;
150 break;
151
152 case st_wordskip:
153 if (!c)
154 return 0;
155 else if (myisspace(c))
156 state = st_wordstart;
157 break;
158 }
159 }
160
161 return 0; /* Buffer overrun */
162}
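
cmdline_find_option_bool() above walks the real-mode command line with a three-state word matcher, so an option like "quiet" matches only as a whole word and the 1-based position of the match is returned. Below is a user-space sketch of the same state machine, assuming a plain NUL-terminated string and isspace() in place of the far-pointer rdfs8()/myisspace() used by the boot code; find_option_bool() and the main() harness are invented for illustration.

#include <ctype.h>
#include <stdio.h>

static int find_option_bool(const char *cmdline, const char *option)
{
	const char *opptr = NULL;
	int pos = 0, wstart = 0;
	enum { st_wordstart, st_wordcmp, st_wordskip } state = st_wordstart;

	for (;;) {
		char c = cmdline[pos++];

		switch (state) {
		case st_wordstart:
			if (!c)
				return 0;
			if (isspace((unsigned char)c))
				break;
			state = st_wordcmp;
			opptr = option;
			wstart = pos;
			/* fall through */
		case st_wordcmp:
			if (!*opptr) {
				if (!c || isspace((unsigned char)c))
					return wstart;	/* whole word matched */
				state = st_wordskip;
			} else if (!c) {
				return 0;
			} else if (c != *opptr++) {
				state = st_wordskip;
			}
			break;
		case st_wordskip:
			if (!c)
				return 0;
			if (isspace((unsigned char)c))
				state = st_wordstart;
			break;
		}
	}
}

int main(void)
{
	/* "quiet" starts at character 19, so 19 is printed; "qui" would give 0 */
	printf("%d\n", find_option_bool("root=/dev/sda1 ro quiet", "quiet"));
	return 0;
}
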
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 52c1db854520..fe24ceabd909 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -1,5 +1,63 @@
1#
2# linux/arch/x86/boot/compressed/Makefile
3#
4# create a compressed vmlinux image from the original vmlinux
5#
6
7targets := vmlinux vmlinux.bin vmlinux.bin.gz head_$(BITS).o misc.o piggy.o
8
9KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2
10KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
11cflags-$(CONFIG_X86_64) := -mcmodel=small
12KBUILD_CFLAGS += $(cflags-y)
13KBUILD_CFLAGS += $(call cc-option,-ffreestanding)
14KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector)
15
16KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
17
18LDFLAGS := -m elf_$(UTS_MACHINE)
19LDFLAGS_vmlinux := -T
20
21$(obj)/vmlinux: $(src)/vmlinux_$(BITS).lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE
22 $(call if_changed,ld)
23 @:
24
25$(obj)/vmlinux.bin: vmlinux FORCE
26 $(call if_changed,objcopy)
27
28
1ifeq ($(CONFIG_X86_32),y) 29ifeq ($(CONFIG_X86_32),y)
2include ${srctree}/arch/x86/boot/compressed/Makefile_32 30targets += vmlinux.bin.all vmlinux.relocs
31hostprogs-y := relocs
32
33quiet_cmd_relocs = RELOCS $@
34 cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $<
35$(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE
36 $(call if_changed,relocs)
37
38vmlinux.bin.all-y := $(obj)/vmlinux.bin
39vmlinux.bin.all-$(CONFIG_RELOCATABLE) += $(obj)/vmlinux.relocs
40quiet_cmd_relocbin = BUILD $@
41 cmd_relocbin = cat $(filter-out FORCE,$^) > $@
42$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE
43 $(call if_changed,relocbin)
44
45ifdef CONFIG_RELOCATABLE
46$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
47 $(call if_changed,gzip)
3else 48else
4include ${srctree}/arch/x86/boot/compressed/Makefile_64 49$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
50 $(call if_changed,gzip)
5endif 51endif
52LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T
53
54else
55$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
56 $(call if_changed,gzip)
57
58LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T
59endif
60
61
62$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE
63 $(call if_changed,ld)
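
The piggy.o rule above links vmlinux.bin.gz through vmlinux.scr, which places the blob in .rodata.compressed and defines input_len, input_data and input_data_end as linker symbols. The decompressor consumes them as plain externs (the removed misc_64.c further down still shows this). A hedged sketch of that consumer side; compressed_bytes() is invented for illustration:

/* The symbols come from vmlinux.scr; nothing here allocates storage. */
extern unsigned char input_data[];	/* start of the gzipped kernel image */
extern int input_len;			/* LONG(input_data_end - input_data) */

static unsigned long compressed_bytes(void)
{
	return (unsigned long)input_len;
}
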
diff --git a/arch/x86/boot/compressed/Makefile_32 b/arch/x86/boot/compressed/Makefile_32
deleted file mode 100644
index e43ff7c56e6e..000000000000
--- a/arch/x86/boot/compressed/Makefile_32
+++ /dev/null
@@ -1,50 +0,0 @@
1#
2# linux/arch/x86/boot/compressed/Makefile
3#
4# create a compressed vmlinux image from the original vmlinux
5#
6
7targets := vmlinux vmlinux.bin vmlinux.bin.gz head_32.o misc_32.o piggy.o \
8 vmlinux.bin.all vmlinux.relocs
9EXTRA_AFLAGS := -traditional
10
11LDFLAGS_vmlinux := -T
12hostprogs-y := relocs
13
14KBUILD_CFLAGS := -m32 -D__KERNEL__ $(LINUX_INCLUDE) -O2 \
15 -fno-strict-aliasing -fPIC \
16 $(call cc-option,-ffreestanding) \
17 $(call cc-option,-fno-stack-protector)
18LDFLAGS := -m elf_i386
19
20$(obj)/vmlinux: $(src)/vmlinux_32.lds $(obj)/head_32.o $(obj)/misc_32.o $(obj)/piggy.o FORCE
21 $(call if_changed,ld)
22 @:
23
24$(obj)/vmlinux.bin: vmlinux FORCE
25 $(call if_changed,objcopy)
26
27quiet_cmd_relocs = RELOCS $@
28 cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $<
29$(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE
30 $(call if_changed,relocs)
31
32vmlinux.bin.all-y := $(obj)/vmlinux.bin
33vmlinux.bin.all-$(CONFIG_RELOCATABLE) += $(obj)/vmlinux.relocs
34quiet_cmd_relocbin = BUILD $@
35 cmd_relocbin = cat $(filter-out FORCE,$^) > $@
36$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE
37 $(call if_changed,relocbin)
38
39ifdef CONFIG_RELOCATABLE
40$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
41 $(call if_changed,gzip)
42else
43$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
44 $(call if_changed,gzip)
45endif
46
47LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T
48
49$(obj)/piggy.o: $(src)/vmlinux_32.scr $(obj)/vmlinux.bin.gz FORCE
50 $(call if_changed,ld)
diff --git a/arch/x86/boot/compressed/Makefile_64 b/arch/x86/boot/compressed/Makefile_64
deleted file mode 100644
index 7801e8dd90b2..000000000000
--- a/arch/x86/boot/compressed/Makefile_64
+++ /dev/null
@@ -1,30 +0,0 @@
1#
2# linux/arch/x86/boot/compressed/Makefile
3#
4# create a compressed vmlinux image from the original vmlinux
5#
6
7targets := vmlinux vmlinux.bin vmlinux.bin.gz head_64.o misc_64.o piggy.o
8
9KBUILD_CFLAGS := -m64 -D__KERNEL__ $(LINUXINCLUDE) -O2 \
10 -fno-strict-aliasing -fPIC -mcmodel=small \
11 $(call cc-option, -ffreestanding) \
12 $(call cc-option, -fno-stack-protector)
13KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
14LDFLAGS := -m elf_x86_64
15
16LDFLAGS_vmlinux := -T
17$(obj)/vmlinux: $(src)/vmlinux_64.lds $(obj)/head_64.o $(obj)/misc_64.o $(obj)/piggy.o FORCE
18 $(call if_changed,ld)
19 @:
20
21$(obj)/vmlinux.bin: vmlinux FORCE
22 $(call if_changed,objcopy)
23
24$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
25 $(call if_changed,gzip)
26
27LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T
28
29$(obj)/piggy.o: $(obj)/vmlinux_64.scr $(obj)/vmlinux.bin.gz FORCE
30 $(call if_changed,ld)
diff --git a/arch/x86/boot/compressed/misc_32.c b/arch/x86/boot/compressed/misc.c
index b74d60d1b2fa..8182e32c1b42 100644
--- a/arch/x86/boot/compressed/misc_32.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * misc.c 2 * misc.c
3 * 3 *
4 * This is a collection of several routines from gzip-1.0.3 4 * This is a collection of several routines from gzip-1.0.3
5 * adapted for Linux. 5 * adapted for Linux.
6 * 6 *
7 * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994 7 * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994
@@ -9,9 +9,18 @@
9 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 9 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
10 */ 10 */
11 11
12/*
13 * We have to be careful, because no indirections are allowed here, and
14 * paravirt_ops is a kind of one. As this code will only run on bare metal
15 * anyway, we just keep it from happening.
16 */
12#undef CONFIG_PARAVIRT 17#undef CONFIG_PARAVIRT
18#ifdef CONFIG_X86_64
19#define _LINUX_STRING_H_ 1
20#define __LINUX_BITMAP_H 1
21#endif
22
13#include <linux/linkage.h> 23#include <linux/linkage.h>
14#include <linux/vmalloc.h>
15#include <linux/screen_info.h> 24#include <linux/screen_info.h>
16#include <asm/io.h> 25#include <asm/io.h>
17#include <asm/page.h> 26#include <asm/page.h>
@@ -186,10 +195,20 @@ static void *memcpy(void *dest, const void *src, unsigned n);
186 195
187static void putstr(const char *); 196static void putstr(const char *);
188 197
189static unsigned long free_mem_ptr; 198#ifdef CONFIG_X86_64
190static unsigned long free_mem_end_ptr; 199#define memptr long
200#else
201#define memptr unsigned
202#endif
203
204static memptr free_mem_ptr;
205static memptr free_mem_end_ptr;
191 206
207#ifdef CONFIG_X86_64
208#define HEAP_SIZE 0x7000
209#else
192#define HEAP_SIZE 0x4000 210#define HEAP_SIZE 0x4000
211#endif
193 212
194static char *vidmem = (char *)0xb8000; 213static char *vidmem = (char *)0xb8000;
195static int vidport; 214static int vidport;
@@ -230,7 +249,7 @@ static void gzip_mark(void **ptr)
230 249
231static void gzip_release(void **ptr) 250static void gzip_release(void **ptr)
232{ 251{
233 free_mem_ptr = (unsigned long) *ptr; 252 free_mem_ptr = (memptr) *ptr;
234} 253}
235 254
236static void scroll(void) 255static void scroll(void)
@@ -247,8 +266,10 @@ static void putstr(const char *s)
247 int x,y,pos; 266 int x,y,pos;
248 char c; 267 char c;
249 268
269#ifdef CONFIG_X86_32
250 if (RM_SCREEN_INFO.orig_video_mode == 0 && lines == 0 && cols == 0) 270 if (RM_SCREEN_INFO.orig_video_mode == 0 && lines == 0 && cols == 0)
251 return; 271 return;
272#endif
252 273
253 x = RM_SCREEN_INFO.orig_x; 274 x = RM_SCREEN_INFO.orig_x;
254 y = RM_SCREEN_INFO.orig_y; 275 y = RM_SCREEN_INFO.orig_y;
@@ -261,7 +282,7 @@ static void putstr(const char *s)
261 y--; 282 y--;
262 } 283 }
263 } else { 284 } else {
264 vidmem [ ( x + cols * y ) * 2 ] = c; 285 vidmem [(x + cols * y) * 2] = c;
265 if ( ++x >= cols ) { 286 if ( ++x >= cols ) {
266 x = 0; 287 x = 0;
267 if ( ++y >= lines ) { 288 if ( ++y >= lines ) {
@@ -276,16 +297,16 @@ static void putstr(const char *s)
276 RM_SCREEN_INFO.orig_y = y; 297 RM_SCREEN_INFO.orig_y = y;
277 298
278 pos = (x + cols * y) * 2; /* Update cursor position */ 299 pos = (x + cols * y) * 2; /* Update cursor position */
279 outb_p(14, vidport); 300 outb(14, vidport);
280 outb_p(0xff & (pos >> 9), vidport+1); 301 outb(0xff & (pos >> 9), vidport+1);
281 outb_p(15, vidport); 302 outb(15, vidport);
282 outb_p(0xff & (pos >> 1), vidport+1); 303 outb(0xff & (pos >> 1), vidport+1);
283} 304}
284 305
285static void* memset(void* s, int c, unsigned n) 306static void* memset(void* s, int c, unsigned n)
286{ 307{
287 int i; 308 int i;
288 char *ss = (char*)s; 309 char *ss = s;
289 310
290 for (i=0;i<n;i++) ss[i] = c; 311 for (i=0;i<n;i++) ss[i] = c;
291 return s; 312 return s;
@@ -294,7 +315,8 @@ static void* memset(void* s, int c, unsigned n)
294static void* memcpy(void* dest, const void* src, unsigned n) 315static void* memcpy(void* dest, const void* src, unsigned n)
295{ 316{
296 int i; 317 int i;
297 char *d = (char *)dest, *s = (char *)src; 318 const char *s = src;
319 char *d = dest;
298 320
299 for (i=0;i<n;i++) d[i] = s[i]; 321 for (i=0;i<n;i++) d[i] = s[i];
300 return dest; 322 return dest;
@@ -339,11 +361,13 @@ static void error(char *x)
339 putstr(x); 361 putstr(x);
340 putstr("\n\n -- System halted"); 362 putstr("\n\n -- System halted");
341 363
342 while(1); /* Halt */ 364 while (1)
365 asm("hlt");
343} 366}
344 367
345asmlinkage void decompress_kernel(void *rmode, unsigned long end, 368asmlinkage void decompress_kernel(void *rmode, memptr heap,
346 uch *input_data, unsigned long input_len, uch *output) 369 uch *input_data, unsigned long input_len,
370 uch *output)
347{ 371{
348 real_mode = rmode; 372 real_mode = rmode;
349 373
@@ -358,25 +382,32 @@ asmlinkage void decompress_kernel(void *rmode, unsigned long end,
358 lines = RM_SCREEN_INFO.orig_video_lines; 382 lines = RM_SCREEN_INFO.orig_video_lines;
359 cols = RM_SCREEN_INFO.orig_video_cols; 383 cols = RM_SCREEN_INFO.orig_video_cols;
360 384
361 window = output; /* Output buffer (Normally at 1M) */ 385 window = output; /* Output buffer (Normally at 1M) */
362 free_mem_ptr = end; /* Heap */ 386 free_mem_ptr = heap; /* Heap */
363 free_mem_end_ptr = end + HEAP_SIZE; 387 free_mem_end_ptr = heap + HEAP_SIZE;
364 inbuf = input_data; /* Input buffer */ 388 inbuf = input_data; /* Input buffer */
365 insize = input_len; 389 insize = input_len;
366 inptr = 0; 390 inptr = 0;
367 391
392#ifdef CONFIG_X86_64
393 if ((ulg)output & (__KERNEL_ALIGN - 1))
394 error("Destination address not 2M aligned");
395 if ((ulg)output >= 0xffffffffffUL)
396 error("Destination address too large");
397#else
368 if ((u32)output & (CONFIG_PHYSICAL_ALIGN -1)) 398 if ((u32)output & (CONFIG_PHYSICAL_ALIGN -1))
369 error("Destination address not CONFIG_PHYSICAL_ALIGN aligned"); 399 error("Destination address not CONFIG_PHYSICAL_ALIGN aligned");
370 if (end > ((-__PAGE_OFFSET-(512 <<20)-1) & 0x7fffffff)) 400 if (heap > ((-__PAGE_OFFSET-(512<<20)-1) & 0x7fffffff))
371 error("Destination address too large"); 401 error("Destination address too large");
372#ifndef CONFIG_RELOCATABLE 402#ifndef CONFIG_RELOCATABLE
373 if ((u32)output != LOAD_PHYSICAL_ADDR) 403 if ((u32)output != LOAD_PHYSICAL_ADDR)
374 error("Wrong destination address"); 404 error("Wrong destination address");
375#endif 405#endif
406#endif
376 407
377 makecrc(); 408 makecrc();
378 putstr("Uncompressing Linux... "); 409 putstr("\nDecompressing Linux... ");
379 gunzip(); 410 gunzip();
380 putstr("Ok, booting the kernel.\n"); 411 putstr("done.\nBooting the kernel.\n");
381 return; 412 return;
382} 413}
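
In the unified misc.c the 64-bit branch of decompress_kernel() rejects an output address that is not __KERNEL_ALIGN (2 MB) aligned or that lies above 0xffffffffff, while the 32-bit branch keeps the CONFIG_PHYSICAL_ALIGN check. The alignment test is the usual power-of-two mask trick; a small sketch, with is_misaligned() and the example values invented here:

static int is_misaligned(unsigned long addr, unsigned long align)
{
	/* align must be a power of two; any low bit set means "not aligned" */
	return (addr & (align - 1)) != 0;
}

/* e.g. is_misaligned(0x200000, 0x200000) == 0, is_misaligned(0x210000, 0x200000) == 1 */
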
diff --git a/arch/x86/boot/compressed/misc_64.c b/arch/x86/boot/compressed/misc_64.c
deleted file mode 100644
index 6ea015aa65e4..000000000000
--- a/arch/x86/boot/compressed/misc_64.c
+++ /dev/null
@@ -1,371 +0,0 @@
1/*
2 * misc.c
3 *
4 * This is a collection of several routines from gzip-1.0.3
5 * adapted for Linux.
6 *
7 * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994
8 * puts by Nick Holloway 1993, better puts by Martin Mares 1995
9 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
10 */
11
12#define _LINUX_STRING_H_ 1
13#define __LINUX_BITMAP_H 1
14
15#include <linux/linkage.h>
16#include <linux/screen_info.h>
17#include <asm/io.h>
18#include <asm/page.h>
19
20/* WARNING!!
21 * This code is compiled with -fPIC and it is relocated dynamically
22 * at run time, but no relocation processing is performed.
23 * This means that it is not safe to place pointers in static structures.
24 */
25
26/*
27 * Getting to provable safe in place decompression is hard.
28 * Worst case behaviours need to be analyzed.
29 * Background information:
30 *
31 * The file layout is:
32 * magic[2]
33 * method[1]
34 * flags[1]
35 * timestamp[4]
36 * extraflags[1]
37 * os[1]
38 * compressed data blocks[N]
39 * crc[4] orig_len[4]
40 *
41 * resulting in 18 bytes of non compressed data overhead.
42 *
43 * Files divided into blocks
44 * 1 bit (last block flag)
45 * 2 bits (block type)
46 *
47 * 1 block occurs every 32K -1 bytes or when 50% compression has been achieved.
48 * The smallest block type encoding is always used.
49 *
50 * stored:
51 * 32 bits length in bytes.
52 *
53 * fixed:
54 * magic fixed tree.
55 * symbols.
56 *
57 * dynamic:
58 * dynamic tree encoding.
59 * symbols.
60 *
61 *
62 * The buffer for decompression in place is the length of the
63 * uncompressed data, plus a small amount extra to keep the algorithm safe.
64 * The compressed data is placed at the end of the buffer. The output
65 * pointer is placed at the start of the buffer and the input pointer
66 * is placed where the compressed data starts. Problems will occur
67 * when the output pointer overruns the input pointer.
68 *
69 * The output pointer can only overrun the input pointer if the input
70 * pointer is moving faster than the output pointer. A condition only
71 * triggered by data whose compressed form is larger than the uncompressed
72 * form.
73 *
74 * The worst case at the block level is a growth of the compressed data
75 * of 5 bytes per 32767 bytes.
76 *
77 * The worst case internal to a compressed block is very hard to figure.
78 * The worst case can at least be bounded by having one bit that represents
79 * 32764 bytes and then all of the rest of the bytes representing the very
80 * very last byte.
81 *
82 * All of which is enough to compute an amount of extra data that is required
83 * to be safe. To avoid problems at the block level allocating 5 extra bytes
84 * per 32767 bytes of data is sufficient. To avoid problems internal to a block
85 * adding an extra 32767 bytes (the worst case uncompressed block size) is
86 * sufficient, to ensure that in the worst case the decompressed data for
87 * block will stop the byte before the compressed data for a block begins.
88 * To avoid problems with the compressed data's meta information an extra 18
89 * bytes are needed. Leading to the formula:
90 *
91 * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size.
92 *
93 * Adding 8 bytes per 32K is a bit excessive but much easier to calculate.
94 * Adding 32768 instead of 32767 just makes for round numbers.
95 * Adding the decompressor_size is necessary as it must live after all
96 * of the data as well. Last I measured the decompressor is about 14K.
97 * 10K of actual data and 4K of bss.
98 *
99 */
100
101/*
102 * gzip declarations
103 */
104
105#define OF(args) args
106#define STATIC static
107
108#undef memset
109#undef memcpy
110#define memzero(s, n) memset ((s), 0, (n))
111
112typedef unsigned char uch;
113typedef unsigned short ush;
114typedef unsigned long ulg;
115
116#define WSIZE 0x80000000 /* Window size must be at least 32k,
117 * and a power of two
118 * We don't actually have a window just
119 * a huge output buffer so I report
120 * a 2G windows size, as that should
121 * always be larger than our output buffer.
122 */
123
124static uch *inbuf; /* input buffer */
125static uch *window; /* Sliding window buffer, (and final output buffer) */
126
127static unsigned insize; /* valid bytes in inbuf */
128static unsigned inptr; /* index of next byte to be processed in inbuf */
129static unsigned outcnt; /* bytes in output buffer */
130
131/* gzip flag byte */
132#define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */
133#define CONTINUATION 0x02 /* bit 1 set: continuation of multi-part gzip file */
134#define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */
135#define ORIG_NAME 0x08 /* bit 3 set: original file name present */
136#define COMMENT 0x10 /* bit 4 set: file comment present */
137#define ENCRYPTED 0x20 /* bit 5 set: file is encrypted */
138#define RESERVED 0xC0 /* bit 6,7: reserved */
139
140#define get_byte() (inptr < insize ? inbuf[inptr++] : fill_inbuf())
141
142/* Diagnostic functions */
143#ifdef DEBUG
144# define Assert(cond,msg) {if(!(cond)) error(msg);}
145# define Trace(x) fprintf x
146# define Tracev(x) {if (verbose) fprintf x ;}
147# define Tracevv(x) {if (verbose>1) fprintf x ;}
148# define Tracec(c,x) {if (verbose && (c)) fprintf x ;}
149# define Tracecv(c,x) {if (verbose>1 && (c)) fprintf x ;}
150#else
151# define Assert(cond,msg)
152# define Trace(x)
153# define Tracev(x)
154# define Tracevv(x)
155# define Tracec(c,x)
156# define Tracecv(c,x)
157#endif
158
159static int fill_inbuf(void);
160static void flush_window(void);
161static void error(char *m);
162static void gzip_mark(void **);
163static void gzip_release(void **);
164
165/*
166 * This is set up by the setup-routine at boot-time
167 */
168static unsigned char *real_mode; /* Pointer to real-mode data */
169
170#define RM_EXT_MEM_K (*(unsigned short *)(real_mode + 0x2))
171#ifndef STANDARD_MEMORY_BIOS_CALL
172#define RM_ALT_MEM_K (*(unsigned long *)(real_mode + 0x1e0))
173#endif
174#define RM_SCREEN_INFO (*(struct screen_info *)(real_mode+0))
175
176extern unsigned char input_data[];
177extern int input_len;
178
179static long bytes_out = 0;
180
181static void *malloc(int size);
182static void free(void *where);
183
184static void *memset(void *s, int c, unsigned n);
185static void *memcpy(void *dest, const void *src, unsigned n);
186
187static void putstr(const char *);
188
189static long free_mem_ptr;
190static long free_mem_end_ptr;
191
192#define HEAP_SIZE 0x7000
193
194static char *vidmem = (char *)0xb8000;
195static int vidport;
196static int lines, cols;
197
198#include "../../../../lib/inflate.c"
199
200static void *malloc(int size)
201{
202 void *p;
203
204 if (size <0) error("Malloc error");
205 if (free_mem_ptr <= 0) error("Memory error");
206
207 free_mem_ptr = (free_mem_ptr + 3) & ~3; /* Align */
208
209 p = (void *)free_mem_ptr;
210 free_mem_ptr += size;
211
212 if (free_mem_ptr >= free_mem_end_ptr)
213 error("Out of memory");
214
215 return p;
216}
217
218static void free(void *where)
219{ /* Don't care */
220}
221
222static void gzip_mark(void **ptr)
223{
224 *ptr = (void *) free_mem_ptr;
225}
226
227static void gzip_release(void **ptr)
228{
229 free_mem_ptr = (long) *ptr;
230}
231
232static void scroll(void)
233{
234 int i;
235
236 memcpy ( vidmem, vidmem + cols * 2, ( lines - 1 ) * cols * 2 );
237 for ( i = ( lines - 1 ) * cols * 2; i < lines * cols * 2; i += 2 )
238 vidmem[i] = ' ';
239}
240
241static void putstr(const char *s)
242{
243 int x,y,pos;
244 char c;
245
246 x = RM_SCREEN_INFO.orig_x;
247 y = RM_SCREEN_INFO.orig_y;
248
249 while ( ( c = *s++ ) != '\0' ) {
250 if ( c == '\n' ) {
251 x = 0;
252 if ( ++y >= lines ) {
253 scroll();
254 y--;
255 }
256 } else {
257 vidmem [ ( x + cols * y ) * 2 ] = c;
258 if ( ++x >= cols ) {
259 x = 0;
260 if ( ++y >= lines ) {
261 scroll();
262 y--;
263 }
264 }
265 }
266 }
267
268 RM_SCREEN_INFO.orig_x = x;
269 RM_SCREEN_INFO.orig_y = y;
270
271 pos = (x + cols * y) * 2; /* Update cursor position */
272 outb_p(14, vidport);
273 outb_p(0xff & (pos >> 9), vidport+1);
274 outb_p(15, vidport);
275 outb_p(0xff & (pos >> 1), vidport+1);
276}
277
278static void* memset(void* s, int c, unsigned n)
279{
280 int i;
281 char *ss = (char*)s;
282
283 for (i=0;i<n;i++) ss[i] = c;
284 return s;
285}
286
287static void* memcpy(void* dest, const void* src, unsigned n)
288{
289 int i;
290 char *d = (char *)dest, *s = (char *)src;
291
292 for (i=0;i<n;i++) d[i] = s[i];
293 return dest;
294}
295
296/* ===========================================================================
297 * Fill the input buffer. This is called only when the buffer is empty
298 * and at least one byte is really needed.
299 */
300static int fill_inbuf(void)
301{
302 error("ran out of input data");
303 return 0;
304}
305
306/* ===========================================================================
307 * Write the output window window[0..outcnt-1] and update crc and bytes_out.
308 * (Used for the decompressed data only.)
309 */
310static void flush_window(void)
311{
312 /* With my window equal to my output buffer
313 * I only need to compute the crc here.
314 */
315 ulg c = crc; /* temporary variable */
316 unsigned n;
317 uch *in, ch;
318
319 in = window;
320 for (n = 0; n < outcnt; n++) {
321 ch = *in++;
322 c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
323 }
324 crc = c;
325 bytes_out += (ulg)outcnt;
326 outcnt = 0;
327}
328
329static void error(char *x)
330{
331 putstr("\n\n");
332 putstr(x);
333 putstr("\n\n -- System halted");
334
335 while(1); /* Halt */
336}
337
338asmlinkage void decompress_kernel(void *rmode, unsigned long heap,
339 uch *input_data, unsigned long input_len, uch *output)
340{
341 real_mode = rmode;
342
343 if (RM_SCREEN_INFO.orig_video_mode == 7) {
344 vidmem = (char *) 0xb0000;
345 vidport = 0x3b4;
346 } else {
347 vidmem = (char *) 0xb8000;
348 vidport = 0x3d4;
349 }
350
351 lines = RM_SCREEN_INFO.orig_video_lines;
352 cols = RM_SCREEN_INFO.orig_video_cols;
353
354 window = output; /* Output buffer (Normally at 1M) */
355 free_mem_ptr = heap; /* Heap */
356 free_mem_end_ptr = heap + HEAP_SIZE;
357 inbuf = input_data; /* Input buffer */
358 insize = input_len;
359 inptr = 0;
360
361 if ((ulg)output & (__KERNEL_ALIGN - 1))
362 error("Destination address not 2M aligned");
363 if ((ulg)output >= 0xffffffffffUL)
364 error("Destination address too large");
365
366 makecrc();
367 putstr(".\nDecompressing Linux...");
368 gunzip();
369 putstr("done.\nBooting the kernel.\n");
370 return;
371}
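
The header comment of the removed file derives a safety margin for in-place decompression: extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size. A worked example with invented sizes; only the formula itself comes from the comment above.

#include <stdio.h>

int main(void)
{
	unsigned long uncompressed_size = 8UL << 20;	/* assume an 8 MiB vmlinux */
	unsigned long decompressor_size = 14UL << 10;	/* "about 14K" per the comment */

	unsigned long extra_bytes = (uncompressed_size >> 12)
				    + 32768 + 18 + decompressor_size;

	/* (8 MiB >> 12) = 2048, so extra_bytes = 2048 + 32768 + 18 + 14336 = 49170 */
	printf("extra_bytes = %lu\n", extra_bytes);
	return 0;
}
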
diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c
index 7a0d00b2cf28..d01ea42187e6 100644
--- a/arch/x86/boot/compressed/relocs.c
+++ b/arch/x86/boot/compressed/relocs.c
@@ -27,11 +27,6 @@ static unsigned long *relocs;
27 * absolute relocations present w.r.t these symbols. 27 * absolute relocations present w.r.t these symbols.
28 */ 28 */
29static const char* safe_abs_relocs[] = { 29static const char* safe_abs_relocs[] = {
30 "__kernel_vsyscall",
31 "__kernel_rt_sigreturn",
32 "__kernel_sigreturn",
33 "SYSENTER_RETURN",
34 "VDSO_NOTE_MASK",
35 "xen_irq_disable_direct_reloc", 30 "xen_irq_disable_direct_reloc",
36 "xen_save_fl_direct_reloc", 31 "xen_save_fl_direct_reloc",
37}; 32};
@@ -45,6 +40,8 @@ static int is_safe_abs_reloc(const char* sym_name)
45 /* Match found */ 40 /* Match found */
46 return 1; 41 return 1;
47 } 42 }
43 if (strncmp(sym_name, "VDSO", 4) == 0)
44 return 1;
48 if (strncmp(sym_name, "__crc_", 6) == 0) 45 if (strncmp(sym_name, "__crc_", 6) == 0)
49 return 1; 46 return 1;
50 return 0; 47 return 0;
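
Instead of whitelisting each vsyscall/vDSO symbol by name, the relocs tool now accepts any absolute relocation against a symbol whose name starts with "VDSO", which covers the dropped VDSO_NOTE_MASK entry among others. A trivial restatement of that prefix rule, with is_vdso_abs() invented for illustration:

#include <string.h>

static int is_vdso_abs(const char *sym_name)
{
	/* matches e.g. "VDSO_NOTE_MASK" from the old explicit whitelist */
	return strncmp(sym_name, "VDSO", 4) == 0;
}
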
diff --git a/arch/x86/boot/compressed/vmlinux_64.scr b/arch/x86/boot/compressed/vmlinux.scr
index bd1429ce193e..f02382ae5c48 100644
--- a/arch/x86/boot/compressed/vmlinux_64.scr
+++ b/arch/x86/boot/compressed/vmlinux.scr
@@ -1,6 +1,6 @@
1SECTIONS 1SECTIONS
2{ 2{
3 .text.compressed : { 3 .rodata.compressed : {
4 input_len = .; 4 input_len = .;
5 LONG(input_data_end - input_data) input_data = .; 5 LONG(input_data_end - input_data) input_data = .;
6 *(.data) 6 *(.data)
diff --git a/arch/x86/boot/compressed/vmlinux_32.lds b/arch/x86/boot/compressed/vmlinux_32.lds
index cc4854f6c6c1..bb3c48379c40 100644
--- a/arch/x86/boot/compressed/vmlinux_32.lds
+++ b/arch/x86/boot/compressed/vmlinux_32.lds
@@ -3,17 +3,17 @@ OUTPUT_ARCH(i386)
3ENTRY(startup_32) 3ENTRY(startup_32)
4SECTIONS 4SECTIONS
5{ 5{
6 /* Be careful parts of head.S assume startup_32 is at 6 /* Be careful parts of head_32.S assume startup_32 is at
7 * address 0. 7 * address 0.
8 */ 8 */
9 . = 0 ; 9 . = 0;
10 .text.head : { 10 .text.head : {
11 _head = . ; 11 _head = . ;
12 *(.text.head) 12 *(.text.head)
13 _ehead = . ; 13 _ehead = . ;
14 } 14 }
15 .data.compressed : { 15 .rodata.compressed : {
16 *(.data.compressed) 16 *(.rodata.compressed)
17 } 17 }
18 .text : { 18 .text : {
19 _text = .; /* Text */ 19 _text = .; /* Text */
diff --git a/arch/x86/boot/compressed/vmlinux_32.scr b/arch/x86/boot/compressed/vmlinux_32.scr
deleted file mode 100644
index 707a88f7f29e..000000000000
--- a/arch/x86/boot/compressed/vmlinux_32.scr
+++ /dev/null
@@ -1,10 +0,0 @@
1SECTIONS
2{
3 .data.compressed : {
4 input_len = .;
5 LONG(input_data_end - input_data) input_data = .;
6 *(.data)
7 output_len = . - 4;
8 input_data_end = .;
9 }
10}
diff --git a/arch/x86/boot/compressed/vmlinux_64.lds b/arch/x86/boot/compressed/vmlinux_64.lds
index 94c13e557fb4..f6e5b445f457 100644
--- a/arch/x86/boot/compressed/vmlinux_64.lds
+++ b/arch/x86/boot/compressed/vmlinux_64.lds
@@ -3,15 +3,19 @@ OUTPUT_ARCH(i386:x86-64)
3ENTRY(startup_64) 3ENTRY(startup_64)
4SECTIONS 4SECTIONS
5{ 5{
6 /* Be careful parts of head.S assume startup_32 is at 6 /* Be careful parts of head_64.S assume startup_64 is at
7 * address 0. 7 * address 0.
8 */ 8 */
9 . = 0; 9 . = 0;
10 .text : { 10 .text.head : {
11 _head = . ; 11 _head = . ;
12 *(.text.head) 12 *(.text.head)
13 _ehead = . ; 13 _ehead = . ;
14 *(.text.compressed) 14 }
15 .rodata.compressed : {
16 *(.rodata.compressed)
17 }
18 .text : {
15 _text = .; /* Text */ 19 _text = .; /* Text */
16 *(.text) 20 *(.text)
17 *(.text.*) 21 *(.text.*)
diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c
index bd138e442ec2..8721dc46a0b6 100644
--- a/arch/x86/boot/edd.c
+++ b/arch/x86/boot/edd.c
@@ -129,6 +129,7 @@ void query_edd(void)
129 char eddarg[8]; 129 char eddarg[8];
130 int do_mbr = 1; 130 int do_mbr = 1;
131 int do_edd = 1; 131 int do_edd = 1;
132 int be_quiet;
132 int devno; 133 int devno;
133 struct edd_info ei, *edp; 134 struct edd_info ei, *edp;
134 u32 *mbrptr; 135 u32 *mbrptr;
@@ -140,12 +141,21 @@ void query_edd(void)
140 do_edd = 0; 141 do_edd = 0;
141 } 142 }
142 143
144 be_quiet = cmdline_find_option_bool("quiet");
145
143 edp = boot_params.eddbuf; 146 edp = boot_params.eddbuf;
144 mbrptr = boot_params.edd_mbr_sig_buffer; 147 mbrptr = boot_params.edd_mbr_sig_buffer;
145 148
146 if (!do_edd) 149 if (!do_edd)
147 return; 150 return;
148 151
152 /* Bugs in on-board or add-on card BIOSes may hang the EDD probe,
153 * so give a hint if this happens.
154 */
155
156 if (!be_quiet)
157 printf("Probing EDD (edd=off to disable)... ");
158
149 for (devno = 0x80; devno < 0x80+EDD_MBR_SIG_MAX; devno++) { 159 for (devno = 0x80; devno < 0x80+EDD_MBR_SIG_MAX; devno++) {
150 /* 160 /*
151 * Scan the BIOS-supported hard disks and query EDD 161 * Scan the BIOS-supported hard disks and query EDD
@@ -162,6 +172,9 @@ void query_edd(void)
162 if (do_mbr && !read_mbr_sig(devno, &ei, mbrptr++)) 172 if (do_mbr && !read_mbr_sig(devno, &ei, mbrptr++))
163 boot_params.edd_mbr_sig_buf_entries = devno-0x80+1; 173 boot_params.edd_mbr_sig_buf_entries = devno-0x80+1;
164 } 174 }
175
176 if (!be_quiet)
177 printf("ok\n");
165} 178}
166 179
167#endif 180#endif
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 4cc5b0411db5..64ad9016585a 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -195,10 +195,13 @@ cmd_line_ptr: .long 0 # (Header version 0x0202 or later)
195 # can be located anywhere in 195 # can be located anywhere in
196 # low memory 0x10000 or higher. 196 # low memory 0x10000 or higher.
197 197
198ramdisk_max: .long (-__PAGE_OFFSET-(512 << 20)-1) & 0x7fffffff 198ramdisk_max: .long 0x7fffffff
199 # (Header version 0x0203 or later) 199 # (Header version 0x0203 or later)
200 # The highest safe address for 200 # The highest safe address for
201 # the contents of an initrd 201 # the contents of an initrd
202 # The current kernel allows up to 4 GB,
203 # but leave it at 2 GB to avoid
204 # possible bootloader bugs.
202 205
203kernel_alignment: .long CONFIG_PHYSICAL_ALIGN #physical addr alignment 206kernel_alignment: .long CONFIG_PHYSICAL_ALIGN #physical addr alignment
204 #required for protected mode 207 #required for protected mode
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index 1f95750ede28..7828da5cfd07 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -100,20 +100,32 @@ static void set_bios_mode(void)
100#endif 100#endif
101} 101}
102 102
103void main(void) 103static void init_heap(void)
104{ 104{
105 /* First, copy the boot header into the "zeropage" */ 105 char *stack_end;
106 copy_boot_params();
107 106
108 /* End of heap check */
109 if (boot_params.hdr.loadflags & CAN_USE_HEAP) { 107 if (boot_params.hdr.loadflags & CAN_USE_HEAP) {
110 heap_end = (char *)(boot_params.hdr.heap_end_ptr 108 asm("leal %P1(%%esp),%0"
111 +0x200-STACK_SIZE); 109 : "=r" (stack_end) : "i" (-STACK_SIZE));
110
111 heap_end = (char *)
112 ((size_t)boot_params.hdr.heap_end_ptr + 0x200);
113 if (heap_end > stack_end)
114 heap_end = stack_end;
112 } else { 115 } else {
113 /* Boot protocol 2.00 only, no heap available */ 116 /* Boot protocol 2.00 only, no heap available */
114 puts("WARNING: Ancient bootloader, some functionality " 117 puts("WARNING: Ancient bootloader, some functionality "
115 "may be limited!\n"); 118 "may be limited!\n");
116 } 119 }
120}
121
122void main(void)
123{
124 /* First, copy the boot header into the "zeropage" */
125 copy_boot_params();
126
127 /* End of heap check */
128 init_heap();
117 129
118 /* Make sure we have all the proper CPU support */ 130 /* Make sure we have all the proper CPU support */
119 if (validate_cpu()) { 131 if (validate_cpu()) {
@@ -131,9 +143,6 @@ void main(void)
131 /* Set keyboard repeat rate (why?) */ 143 /* Set keyboard repeat rate (why?) */
132 keyboard_set_repeat(); 144 keyboard_set_repeat();
133 145
134 /* Set the video mode */
135 set_video();
136
137 /* Query MCA information */ 146 /* Query MCA information */
138 query_mca(); 147 query_mca();
139 148
@@ -154,6 +163,10 @@ void main(void)
154#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) 163#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
155 query_edd(); 164 query_edd();
156#endif 165#endif
166
167 /* Set the video mode */
168 set_video();
169
157 /* Do the last things and invoke protected mode */ 170 /* Do the last things and invoke protected mode */
158 go_to_protected_mode(); 171 go_to_protected_mode();
159} 172}
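
init_heap() above computes the usable heap end from heap_end_ptr (plus the 0x200 bias used in the patch) and clamps it so the heap can never grow into the setup stack, whose end it derives from %esp with the leal. A flat-parameter sketch of just the clamping, with clamp_heap_end() invented here:

static unsigned long clamp_heap_end(unsigned long heap_end_ptr,
				    unsigned long stack_end)
{
	unsigned long heap_end = heap_end_ptr + 0x200;	/* bias used above */

	if (heap_end > stack_end)	/* never let the heap overlap the stack */
		heap_end = stack_end;
	return heap_end;
}
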
diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c
index 09fb342cc62e..1a0f936c160b 100644
--- a/arch/x86/boot/pm.c
+++ b/arch/x86/boot/pm.c
@@ -104,7 +104,7 @@ static void reset_coprocessor(void)
104 (((u64)(base & 0xff000000) << 32) | \ 104 (((u64)(base & 0xff000000) << 32) | \
105 ((u64)flags << 40) | \ 105 ((u64)flags << 40) | \
106 ((u64)(limit & 0x00ff0000) << 32) | \ 106 ((u64)(limit & 0x00ff0000) << 32) | \
107 ((u64)(base & 0x00ffff00) << 16) | \ 107 ((u64)(base & 0x00ffffff) << 16) | \
108 ((u64)(limit & 0x0000ffff))) 108 ((u64)(limit & 0x0000ffff)))
109 109
110struct gdt_ptr { 110struct gdt_ptr {
@@ -121,6 +121,10 @@ static void setup_gdt(void)
121 [GDT_ENTRY_BOOT_CS] = GDT_ENTRY(0xc09b, 0, 0xfffff), 121 [GDT_ENTRY_BOOT_CS] = GDT_ENTRY(0xc09b, 0, 0xfffff),
122 /* DS: data, read/write, 4 GB, base 0 */ 122 /* DS: data, read/write, 4 GB, base 0 */
123 [GDT_ENTRY_BOOT_DS] = GDT_ENTRY(0xc093, 0, 0xfffff), 123 [GDT_ENTRY_BOOT_DS] = GDT_ENTRY(0xc093, 0, 0xfffff),
124 /* TSS: 32-bit tss, 104 bytes, base 4096 */
125 /* We only have a TSS here to keep Intel VT happy;
126 we don't actually use it for anything. */
127 [GDT_ENTRY_BOOT_TSS] = GDT_ENTRY(0x0089, 4096, 103),
124 }; 128 };
125 /* Xen HVM incorrectly stores a pointer to the gdt_ptr, instead 129 /* Xen HVM incorrectly stores a pointer to the gdt_ptr, instead
126 of the gdt_ptr contents. Thus, make it static so it will 130 of the gdt_ptr contents. Thus, make it static so it will
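
The pm.c hunk both fixes the descriptor packing (base bits 23..0, not 23..8, belong in descriptor bits 39..16) and adds a do-nothing TSS so Intel VT sees a valid task register. A user-space sketch that reuses the corrected macro to show the exact 64-bit descriptor the new GDT_ENTRY(0x0089, 4096, 103) entry produces; only the u64 typedef and the printf are additions:

#include <stdio.h>

typedef unsigned long long u64;

/* Same packing as GDT_ENTRY() in pm.c, with the base & 0x00ffffff fix.
 * The old 0x00ffff00 mask dropped the low 8 base bits for any descriptor
 * whose base is not 256-byte aligned. */
#define GDT_ENTRY(flags, base, limit)			\
	(((u64)((base)  & 0xff000000) << 32) |		\
	 ((u64)(flags) << 40) |				\
	 ((u64)((limit) & 0x00ff0000) << 32) |		\
	 ((u64)((base)  & 0x00ffffff) << 16) |		\
	 ((u64)((limit) & 0x0000ffff)))

int main(void)
{
	/* The new boot TSS: type 0x89 (32-bit available TSS), base 4096, limit 103 */
	printf("%#018llx\n", GDT_ENTRY(0x0089, 4096, 103));	/* 0x0000890010000067 */
	return 0;
}
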
diff --git a/arch/x86/boot/pmjump.S b/arch/x86/boot/pmjump.S
index fa6bed1fac14..f5402d51f7c3 100644
--- a/arch/x86/boot/pmjump.S
+++ b/arch/x86/boot/pmjump.S
@@ -15,6 +15,7 @@
15 */ 15 */
16 16
17#include <asm/boot.h> 17#include <asm/boot.h>
18#include <asm/processor-flags.h>
18#include <asm/segment.h> 19#include <asm/segment.h>
19 20
20 .text 21 .text
@@ -29,28 +30,55 @@
29 */ 30 */
30protected_mode_jump: 31protected_mode_jump:
31 movl %edx, %esi # Pointer to boot_params table 32 movl %edx, %esi # Pointer to boot_params table
32 movl %eax, 2f # Patch ljmpl instruction 33
34 xorl %ebx, %ebx
35 movw %cs, %bx
36 shll $4, %ebx
37 addl %ebx, 2f
33 38
34 movw $__BOOT_DS, %cx 39 movw $__BOOT_DS, %cx
35 xorl %ebx, %ebx # Per the 32-bit boot protocol 40 movw $__BOOT_TSS, %di
36 xorl %ebp, %ebp # Per the 32-bit boot protocol
37 xorl %edi, %edi # Per the 32-bit boot protocol
38 41
39 movl %cr0, %edx 42 movl %cr0, %edx
40 orb $1, %dl # Protected mode (PE) bit 43 orb $X86_CR0_PE, %dl # Protected mode
41 movl %edx, %cr0 44 movl %edx, %cr0
42 jmp 1f # Short jump to serialize on 386/486 45 jmp 1f # Short jump to serialize on 386/486
431: 461:
44 47
45 movw %cx, %ds 48 # Transition to 32-bit mode
46 movw %cx, %es
47 movw %cx, %fs
48 movw %cx, %gs
49 movw %cx, %ss
50
51 # Jump to the 32-bit entrypoint
52 .byte 0x66, 0xea # ljmpl opcode 49 .byte 0x66, 0xea # ljmpl opcode
532: .long 0 # offset 502: .long in_pm32 # offset
54 .word __BOOT_CS # segment 51 .word __BOOT_CS # segment
55 52
56 .size protected_mode_jump, .-protected_mode_jump 53 .size protected_mode_jump, .-protected_mode_jump
54
55 .code32
56 .type in_pm32, @function
57in_pm32:
58 # Set up data segments for flat 32-bit mode
59 movl %ecx, %ds
60 movl %ecx, %es
61 movl %ecx, %fs
62 movl %ecx, %gs
63 movl %ecx, %ss
64 # The 32-bit code sets up its own stack, but this way we do have
65 # a valid stack if some debugging hack wants to use it.
66 addl %ebx, %esp
67
68 # Set up TR to make Intel VT happy
69 ltr %di
70
71 # Clear registers to allow for future extensions to the
72 # 32-bit boot protocol
73 xorl %ecx, %ecx
74 xorl %edx, %edx
75 xorl %ebx, %ebx
76 xorl %ebp, %ebp
77 xorl %edi, %edi
78
79 # Set up LDTR to make Intel VT happy
80 lldt %cx
81
82 jmpl *%eax # Jump to the 32-bit entrypoint
83
84 .size in_pm32, .-in_pm32
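
protected_mode_jump no longer patches the ljmpl with a caller-supplied address; it targets the local in_pm32 stub and only fixes the stored offset up by the real-mode segment base (shll $4, %ebx; addl %ebx, 2f), and in_pm32 then loads the flat data segments, TR and LDTR before jumping to the 32-bit entry point still held in %eax. A sketch of that fixup arithmetic; the cs and offset values are invented:

#include <stdio.h>
#include <stdint.h>

/* Real-mode address translation: linear = segment * 16 + offset. */
static uint32_t real_mode_linear(uint16_t seg, uint32_t offset)
{
	return ((uint32_t)seg << 4) + offset;
}

int main(void)
{
	uint16_t cs      = 0x9020;	/* hypothetical real-mode %cs of the setup code */
	uint32_t in_pm32 = 0x01a4;	/* hypothetical assembled offset of in_pm32 */

	printf("patched ljmpl target: %#x\n", real_mode_linear(cs, in_pm32));
	return 0;
}
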
diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c
index ed0672a81870..ff664a117096 100644
--- a/arch/x86/boot/video-bios.c
+++ b/arch/x86/boot/video-bios.c
@@ -104,6 +104,7 @@ static int bios_probe(void)
104 104
105 mi = GET_HEAP(struct mode_info, 1); 105 mi = GET_HEAP(struct mode_info, 1);
106 mi->mode = VIDEO_FIRST_BIOS+mode; 106 mi->mode = VIDEO_FIRST_BIOS+mode;
107 mi->depth = 0; /* text */
107 mi->x = rdfs16(0x44a); 108 mi->x = rdfs16(0x44a);
108 mi->y = rdfs8(0x484)+1; 109 mi->y = rdfs8(0x484)+1;
109 nmodes++; 110 nmodes++;
@@ -116,7 +117,7 @@ static int bios_probe(void)
116 117
117__videocard video_bios = 118__videocard video_bios =
118{ 119{
119 .card_name = "BIOS (scanned)", 120 .card_name = "BIOS",
120 .probe = bios_probe, 121 .probe = bios_probe,
121 .set_mode = bios_set_mode, 122 .set_mode = bios_set_mode,
122 .unsafe = 1, 123 .unsafe = 1,
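
Each BIOS text mode is now recorded with depth 0 so the menu can tell it apart from framebuffer modes; the geometry still comes from the BIOS Data Area. A tiny sketch of what the two rdfs reads mean, with example values rather than real hardware reads:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* The BIOS Data Area fields bios_probe() samples after a mode set:
	 * word 0x44a holds the column count, byte 0x484 the row count minus one. */
	uint16_t bda_columns = 80;	/* example value of rdfs16(0x44a) */
	uint8_t  bda_rows_m1 = 24;	/* example value of rdfs8(0x484) */

	printf("text mode geometry: %ux%u\n", bda_columns, bda_rows_m1 + 1);
	return 0;
}
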
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
index 4716b9a96357..662dd2f13068 100644
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -79,20 +79,28 @@ static int vesa_probe(void)
79 /* Text Mode, TTY BIOS supported, 79 /* Text Mode, TTY BIOS supported,
80 supported by hardware */ 80 supported by hardware */
81 mi = GET_HEAP(struct mode_info, 1); 81 mi = GET_HEAP(struct mode_info, 1);
82 mi->mode = mode + VIDEO_FIRST_VESA; 82 mi->mode = mode + VIDEO_FIRST_VESA;
83 mi->x = vminfo.h_res; 83 mi->depth = 0; /* text */
84 mi->y = vminfo.v_res; 84 mi->x = vminfo.h_res;
85 mi->y = vminfo.v_res;
85 nmodes++; 86 nmodes++;
86 } else if ((vminfo.mode_attr & 0x99) == 0x99) { 87 } else if ((vminfo.mode_attr & 0x99) == 0x99 &&
88 (vminfo.memory_layout == 4 ||
89 vminfo.memory_layout == 6) &&
90 vminfo.memory_planes == 1) {
87#ifdef CONFIG_FB 91#ifdef CONFIG_FB
88 /* Graphics mode, color, linear frame buffer 92 /* Graphics mode, color, linear frame buffer
89 supported -- register the mode but hide from 93 supported. Only register the mode if
90 the menu. Only do this if framebuffer is 94 if framebuffer is configured, however,
91 configured, however, otherwise the user will 95 otherwise the user will be left without a screen.
92 be left without a screen. */ 96 We don't require CONFIG_FB_VESA, however, since
97 some of the other framebuffer drivers can use
98 this mode-setting, too. */
93 mi = GET_HEAP(struct mode_info, 1); 99 mi = GET_HEAP(struct mode_info, 1);
94 mi->mode = mode + VIDEO_FIRST_VESA; 100 mi->mode = mode + VIDEO_FIRST_VESA;
95 mi->x = mi->y = 0; 101 mi->depth = vminfo.bpp;
102 mi->x = vminfo.h_res;
103 mi->y = vminfo.v_res;
96 nmodes++; 104 nmodes++;
97#endif 105#endif
98 } 106 }
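
The VESA probe is now stricter: besides the attribute bits it insists on a single-plane packed-pixel or direct-color memory model, and it records the real resolution and depth instead of hiding graphics modes behind x = y = 0. A hedged sketch of that filter as a standalone predicate; the struct and the sample values are illustrative, the boot code reads these fields from the VBE mode info block:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct vbe_mode {			/* stand-in for the VBE mode info fields used above */
	uint16_t mode_attr;
	uint8_t  memory_layout;
	uint8_t  memory_planes;
};

/* 0x99 = supported by hardware (bit 0) + color (bit 3) + graphics (bit 4)
 * + linear framebuffer available (bit 7). */
static bool usable_lfb_mode(const struct vbe_mode *m)
{
	return (m->mode_attr & 0x99) == 0x99 &&
	       (m->memory_layout == 4 ||	/* packed pixel */
		m->memory_layout == 6) &&	/* direct color */
	       m->memory_planes == 1;
}

int main(void)
{
	struct vbe_mode lfb  = { 0x009b, 6, 1 };	/* e.g. a direct-color LFB mode */
	struct vbe_mode text = { 0x000f, 0, 1 };	/* fails the graphics/LFB bits */

	printf("%d %d\n", usable_lfb_mode(&lfb), usable_lfb_mode(&text));	/* 1 0 */
	return 0;
}
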
diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c
index aef02f9ec0c1..7259387b7d19 100644
--- a/arch/x86/boot/video-vga.c
+++ b/arch/x86/boot/video-vga.c
@@ -18,22 +18,22 @@
18#include "video.h" 18#include "video.h"
19 19
20static struct mode_info vga_modes[] = { 20static struct mode_info vga_modes[] = {
21 { VIDEO_80x25, 80, 25 }, 21 { VIDEO_80x25, 80, 25, 0 },
22 { VIDEO_8POINT, 80, 50 }, 22 { VIDEO_8POINT, 80, 50, 0 },
23 { VIDEO_80x43, 80, 43 }, 23 { VIDEO_80x43, 80, 43, 0 },
24 { VIDEO_80x28, 80, 28 }, 24 { VIDEO_80x28, 80, 28, 0 },
25 { VIDEO_80x30, 80, 30 }, 25 { VIDEO_80x30, 80, 30, 0 },
26 { VIDEO_80x34, 80, 34 }, 26 { VIDEO_80x34, 80, 34, 0 },
27 { VIDEO_80x60, 80, 60 }, 27 { VIDEO_80x60, 80, 60, 0 },
28}; 28};
29 29
30static struct mode_info ega_modes[] = { 30static struct mode_info ega_modes[] = {
31 { VIDEO_80x25, 80, 25 }, 31 { VIDEO_80x25, 80, 25, 0 },
32 { VIDEO_8POINT, 80, 43 }, 32 { VIDEO_8POINT, 80, 43, 0 },
33}; 33};
34 34
35static struct mode_info cga_modes[] = { 35static struct mode_info cga_modes[] = {
36 { VIDEO_80x25, 80, 25 }, 36 { VIDEO_80x25, 80, 25, 0 },
37}; 37};
38 38
39__videocard video_vga; 39__videocard video_vga;
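
Every row of the VGA/EGA/CGA tables gains a trailing 0 because struct mode_info grew a depth member and these are all text modes. As an aside, a C99 designated-initializer sketch shows how such a field can default to zero without editing every entry; this is only an illustration of the idiom, not how the boot code is written, and the mode numbers are hypothetical:

#include <stdio.h>

struct mode_info_alt {			/* illustrative copy of the boot code's struct */
	unsigned short mode, x, y, depth;
};

static const struct mode_info_alt vga_modes_alt[] = {
	{ .mode = 0x0f00, .x = 80, .y = 25 },	/* hypothetical VIDEO_80x25 value */
	{ .mode = 0x0f01, .x = 80, .y = 50 },	/* hypothetical VIDEO_8POINT value */
	/* .depth is implicitly zero-initialized for text modes */
};

int main(void)
{
	printf("%zu entries\n", sizeof(vga_modes_alt) / sizeof(vga_modes_alt[0]));
	return 0;
}
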
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index ad9712f01739..696d08f3843c 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -293,13 +293,28 @@ static void display_menu(void)
293 struct mode_info *mi; 293 struct mode_info *mi;
294 char ch; 294 char ch;
295 int i; 295 int i;
296 int nmodes;
297 int modes_per_line;
298 int col;
296 299
297 puts("Mode: COLSxROWS:\n"); 300 nmodes = 0;
301 for (card = video_cards; card < video_cards_end; card++)
302 nmodes += card->nmodes;
298 303
304 modes_per_line = 1;
305 if (nmodes >= 20)
306 modes_per_line = 3;
307
308 for (col = 0; col < modes_per_line; col++)
309 puts("Mode: Resolution: Type: ");
310 putchar('\n');
311
312 col = 0;
299 ch = '0'; 313 ch = '0';
300 for (card = video_cards; card < video_cards_end; card++) { 314 for (card = video_cards; card < video_cards_end; card++) {
301 mi = card->modes; 315 mi = card->modes;
302 for (i = 0; i < card->nmodes; i++, mi++) { 316 for (i = 0; i < card->nmodes; i++, mi++) {
317 char resbuf[32];
303 int visible = mi->x && mi->y; 318 int visible = mi->x && mi->y;
304 u16 mode_id = mi->mode ? mi->mode : 319 u16 mode_id = mi->mode ? mi->mode :
305 (mi->y << 8)+mi->x; 320 (mi->y << 8)+mi->x;
@@ -307,8 +322,18 @@ static void display_menu(void)
307 if (!visible) 322 if (!visible)
308 continue; /* Hidden mode */ 323 continue; /* Hidden mode */
309 324
310 printf("%c %04X %3dx%-3d %s\n", 325 if (mi->depth)
311 ch, mode_id, mi->x, mi->y, card->card_name); 326 sprintf(resbuf, "%dx%d", mi->y, mi->depth);
327 else
328 sprintf(resbuf, "%d", mi->y);
329
330 printf("%c %03X %4dx%-7s %-6s",
331 ch, mode_id, mi->x, resbuf, card->card_name);
332 col++;
333 if (col >= modes_per_line) {
334 putchar('\n');
335 col = 0;
336 }
312 337
313 if (ch == '9') 338 if (ch == '9')
314 ch = 'a'; 339 ch = 'a';
@@ -318,6 +343,8 @@ static void display_menu(void)
318 ch++; 343 ch++;
319 } 344 }
320 } 345 }
346 if (col)
347 putchar('\n');
321} 348}
322 349
323#define H(x) ((x)-'a'+10) 350#define H(x) ((x)-'a'+10)
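
display_menu() now sizes the table at run time: with 20 or more modes it prints three entries per line, and each entry shows the mode id plus a resolution string that carries the colour depth for framebuffer modes. A user-space sketch of just the per-entry formatting; the two mode entries and the card name are made up:

#include <stdio.h>

struct mode_info_demo { unsigned mode; int x, y, depth; };	/* illustrative */

int main(void)
{
	/* A text mode (depth 0) and a VESA framebuffer mode, both invented. */
	struct mode_info_demo mi[] = {
		{ 0x0f00,   80,  25,  0 },
		{ 0x0317, 1024, 768, 16 },
	};
	char resbuf[32];
	int i;

	for (i = 0; i < 2; i++) {
		/* Same formatting as the new display_menu(): text modes print
		 * COLSxROWS, framebuffer modes COLSxROWSxBPP. */
		if (mi[i].depth)
			sprintf(resbuf, "%dx%d", mi[i].y, mi[i].depth);
		else
			sprintf(resbuf, "%d", mi[i].y);
		printf("%c %03X %4dx%-7s %-6s\n",
		       '0' + i, mi[i].mode, mi[i].x, resbuf, "VESA");
	}
	return 0;
}
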
diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h
index b92447d51213..d69347f79e8e 100644
--- a/arch/x86/boot/video.h
+++ b/arch/x86/boot/video.h
@@ -83,7 +83,8 @@ void store_screen(void);
83 83
84struct mode_info { 84struct mode_info {
85 u16 mode; /* Mode number (vga= style) */ 85 u16 mode; /* Mode number (vga= style) */
86 u8 x, y; /* Width, height */ 86 u16 x, y; /* Width, height */
87 u16 depth; /* Bits per pixel, 0 for text mode */
87}; 88};
88 89
89struct card_info { 90struct card_info {
diff --git a/arch/x86/boot/voyager.c b/arch/x86/boot/voyager.c
index 61c8fe0453be..6499e3239b41 100644
--- a/arch/x86/boot/voyager.c
+++ b/arch/x86/boot/voyager.c
@@ -16,8 +16,6 @@
16 16
17#include "boot.h" 17#include "boot.h"
18 18
19#ifdef CONFIG_X86_VOYAGER
20
21int query_voyager(void) 19int query_voyager(void)
22{ 20{
23 u8 err; 21 u8 err;
@@ -42,5 +40,3 @@ int query_voyager(void)
42 copy_from_fs(data_ptr, di, 7); /* Table is 7 bytes apparently */ 40 copy_from_fs(data_ptr, di, 7); /* Table is 7 bytes apparently */
43 return 0; 41 return 0;
44} 42}
45
46#endif /* CONFIG_X86_VOYAGER */
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 54ee1764fdae..77562e7cdab6 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -99,9 +99,9 @@ CONFIG_IOSCHED_NOOP=y
99CONFIG_IOSCHED_AS=y 99CONFIG_IOSCHED_AS=y
100CONFIG_IOSCHED_DEADLINE=y 100CONFIG_IOSCHED_DEADLINE=y
101CONFIG_IOSCHED_CFQ=y 101CONFIG_IOSCHED_CFQ=y
102CONFIG_DEFAULT_AS=y 102# CONFIG_DEFAULT_AS is not set
103# CONFIG_DEFAULT_DEADLINE is not set 103# CONFIG_DEFAULT_DEADLINE is not set
104# CONFIG_DEFAULT_CFQ is not set 104CONFIG_DEFAULT_CFQ=y
105# CONFIG_DEFAULT_NOOP is not set 105# CONFIG_DEFAULT_NOOP is not set
106CONFIG_DEFAULT_IOSCHED="anticipatory" 106CONFIG_DEFAULT_IOSCHED="anticipatory"
107 107
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 38a83f9c966f..9e2b0ef851de 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -145,15 +145,6 @@ CONFIG_K8_NUMA=y
145CONFIG_NODES_SHIFT=6 145CONFIG_NODES_SHIFT=6
146CONFIG_X86_64_ACPI_NUMA=y 146CONFIG_X86_64_ACPI_NUMA=y
147CONFIG_NUMA_EMU=y 147CONFIG_NUMA_EMU=y
148CONFIG_ARCH_DISCONTIGMEM_ENABLE=y
149CONFIG_ARCH_DISCONTIGMEM_DEFAULT=y
150CONFIG_ARCH_SPARSEMEM_ENABLE=y
151CONFIG_SELECT_MEMORY_MODEL=y
152# CONFIG_FLATMEM_MANUAL is not set
153CONFIG_DISCONTIGMEM_MANUAL=y
154# CONFIG_SPARSEMEM_MANUAL is not set
155CONFIG_DISCONTIGMEM=y
156CONFIG_FLAT_NODE_MEM_MAP=y
157CONFIG_NEED_MULTIPLE_NODES=y 148CONFIG_NEED_MULTIPLE_NODES=y
158# CONFIG_SPARSEMEM_STATIC is not set 149# CONFIG_SPARSEMEM_STATIC is not set
159CONFIG_SPLIT_PTLOCK_CPUS=4 150CONFIG_SPLIT_PTLOCK_CPUS=4
diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile
index e2edda255a84..52d0ccfcf6ea 100644
--- a/arch/x86/ia32/Makefile
+++ b/arch/x86/ia32/Makefile
@@ -2,9 +2,7 @@
2# Makefile for the ia32 kernel emulation subsystem. 2# Makefile for the ia32 kernel emulation subsystem.
3# 3#
4 4
5obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o tls32.o \ 5obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o
6 ia32_binfmt.o fpu32.o ptrace32.o syscall32.o syscall32_syscall.o \
7 mmap32.o
8 6
9sysv-$(CONFIG_SYSVIPC) := ipc32.o 7sysv-$(CONFIG_SYSVIPC) := ipc32.o
10obj-$(CONFIG_IA32_EMULATION) += $(sysv-y) 8obj-$(CONFIG_IA32_EMULATION) += $(sysv-y)
@@ -13,40 +11,3 @@ obj-$(CONFIG_IA32_AOUT) += ia32_aout.o
13 11
14audit-class-$(CONFIG_AUDIT) := audit.o 12audit-class-$(CONFIG_AUDIT) := audit.o
15obj-$(CONFIG_IA32_EMULATION) += $(audit-class-y) 13obj-$(CONFIG_IA32_EMULATION) += $(audit-class-y)
16
17$(obj)/syscall32_syscall.o: \
18 $(foreach F,sysenter syscall,$(obj)/vsyscall-$F.so)
19
20# Teach kbuild about targets
21targets := $(foreach F,$(addprefix vsyscall-,sysenter syscall),\
22 $F.o $F.so $F.so.dbg)
23
24# The DSO images are built using a special linker script
25quiet_cmd_syscall = SYSCALL $@
26 cmd_syscall = $(CC) -m32 -nostdlib -shared \
27 $(call ld-option, -Wl$(comma)--hash-style=sysv) \
28 -Wl,-soname=linux-gate.so.1 -o $@ \
29 -Wl,-T,$(filter-out FORCE,$^)
30
31$(obj)/%.so: OBJCOPYFLAGS := -S
32$(obj)/%.so: $(obj)/%.so.dbg FORCE
33 $(call if_changed,objcopy)
34
35$(obj)/vsyscall-sysenter.so.dbg $(obj)/vsyscall-syscall.so.dbg: \
36$(obj)/vsyscall-%.so.dbg: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE
37 $(call if_changed,syscall)
38
39AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32
40AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32
41
42vdsos := vdso32-sysenter.so vdso32-syscall.so
43
44quiet_cmd_vdso_install = INSTALL $@
45 cmd_vdso_install = cp $(@:vdso32-%.so=$(obj)/vsyscall-%.so.dbg) \
46 $(MODLIB)/vdso/$@
47
48$(vdsos):
49 @mkdir -p $(MODLIB)/vdso
50 $(call cmd,vdso_install)
51
52vdso_install: $(vdsos)
diff --git a/arch/x86/ia32/audit.c b/arch/x86/ia32/audit.c
index 91b7b5922dfa..5d7b381da692 100644
--- a/arch/x86/ia32/audit.c
+++ b/arch/x86/ia32/audit.c
@@ -27,7 +27,7 @@ unsigned ia32_signal_class[] = {
27 27
28int ia32_classify_syscall(unsigned syscall) 28int ia32_classify_syscall(unsigned syscall)
29{ 29{
30 switch(syscall) { 30 switch (syscall) {
31 case __NR_open: 31 case __NR_open:
32 return 2; 32 return 2;
33 case __NR_openat: 33 case __NR_openat:
diff --git a/arch/x86/ia32/fpu32.c b/arch/x86/ia32/fpu32.c
deleted file mode 100644
index 2c8209a3605a..000000000000
--- a/arch/x86/ia32/fpu32.c
+++ /dev/null
@@ -1,183 +0,0 @@
1/*
2 * Copyright 2002 Andi Kleen, SuSE Labs.
3 * FXSAVE<->i387 conversion support. Based on code by Gareth Hughes.
4 * This is used for ptrace, signals and coredumps in 32bit emulation.
5 */
6
7#include <linux/sched.h>
8#include <asm/sigcontext32.h>
9#include <asm/processor.h>
10#include <asm/uaccess.h>
11#include <asm/i387.h>
12
13static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
14{
15 unsigned int tmp; /* to avoid 16 bit prefixes in the code */
16
17 /* Transform each pair of bits into 01 (valid) or 00 (empty) */
18 tmp = ~twd;
19 tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
20 /* and move the valid bits to the lower byte. */
21 tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
22 tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
23 tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
24 return tmp;
25}
26
27static inline unsigned long twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave)
28{
29 struct _fpxreg *st = NULL;
30 unsigned long tos = (fxsave->swd >> 11) & 7;
31 unsigned long twd = (unsigned long) fxsave->twd;
32 unsigned long tag;
33 unsigned long ret = 0xffff0000;
34 int i;
35
36#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16);
37
38 for (i = 0 ; i < 8 ; i++) {
39 if (twd & 0x1) {
40 st = FPREG_ADDR( fxsave, (i - tos) & 7 );
41
42 switch (st->exponent & 0x7fff) {
43 case 0x7fff:
44 tag = 2; /* Special */
45 break;
46 case 0x0000:
47 if ( !st->significand[0] &&
48 !st->significand[1] &&
49 !st->significand[2] &&
50 !st->significand[3] ) {
51 tag = 1; /* Zero */
52 } else {
53 tag = 2; /* Special */
54 }
55 break;
56 default:
57 if (st->significand[3] & 0x8000) {
58 tag = 0; /* Valid */
59 } else {
60 tag = 2; /* Special */
61 }
62 break;
63 }
64 } else {
65 tag = 3; /* Empty */
66 }
67 ret |= (tag << (2 * i));
68 twd = twd >> 1;
69 }
70 return ret;
71}
72
73
74static inline int convert_fxsr_from_user(struct i387_fxsave_struct *fxsave,
75 struct _fpstate_ia32 __user *buf)
76{
77 struct _fpxreg *to;
78 struct _fpreg __user *from;
79 int i;
80 u32 v;
81 int err = 0;
82
83#define G(num,val) err |= __get_user(val, num + (u32 __user *)buf)
84 G(0, fxsave->cwd);
85 G(1, fxsave->swd);
86 G(2, fxsave->twd);
87 fxsave->twd = twd_i387_to_fxsr(fxsave->twd);
88 G(3, fxsave->rip);
89 G(4, v);
90 fxsave->fop = v>>16; /* cs ignored */
91 G(5, fxsave->rdp);
92 /* 6: ds ignored */
93#undef G
94 if (err)
95 return -1;
96
97 to = (struct _fpxreg *)&fxsave->st_space[0];
98 from = &buf->_st[0];
99 for (i = 0 ; i < 8 ; i++, to++, from++) {
100 if (__copy_from_user(to, from, sizeof(*from)))
101 return -1;
102 }
103 return 0;
104}
105
106
107static inline int convert_fxsr_to_user(struct _fpstate_ia32 __user *buf,
108 struct i387_fxsave_struct *fxsave,
109 struct pt_regs *regs,
110 struct task_struct *tsk)
111{
112 struct _fpreg __user *to;
113 struct _fpxreg *from;
114 int i;
115 u16 cs,ds;
116 int err = 0;
117
118 if (tsk == current) {
119 /* should be actually ds/cs at fpu exception time,
120 but that information is not available in 64bit mode. */
121 asm("movw %%ds,%0 " : "=r" (ds));
122 asm("movw %%cs,%0 " : "=r" (cs));
123 } else { /* ptrace. task has stopped. */
124 ds = tsk->thread.ds;
125 cs = regs->cs;
126 }
127
128#define P(num,val) err |= __put_user(val, num + (u32 __user *)buf)
129 P(0, (u32)fxsave->cwd | 0xffff0000);
130 P(1, (u32)fxsave->swd | 0xffff0000);
131 P(2, twd_fxsr_to_i387(fxsave));
132 P(3, (u32)fxsave->rip);
133 P(4, cs | ((u32)fxsave->fop) << 16);
134 P(5, fxsave->rdp);
135 P(6, 0xffff0000 | ds);
136#undef P
137
138 if (err)
139 return -1;
140
141 to = &buf->_st[0];
142 from = (struct _fpxreg *) &fxsave->st_space[0];
143 for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
144 if (__copy_to_user(to, from, sizeof(*to)))
145 return -1;
146 }
147 return 0;
148}
149
150int restore_i387_ia32(struct task_struct *tsk, struct _fpstate_ia32 __user *buf, int fsave)
151{
152 clear_fpu(tsk);
153 if (!fsave) {
154 if (__copy_from_user(&tsk->thread.i387.fxsave,
155 &buf->_fxsr_env[0],
156 sizeof(struct i387_fxsave_struct)))
157 return -1;
158 tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
159 set_stopped_child_used_math(tsk);
160 }
161 return convert_fxsr_from_user(&tsk->thread.i387.fxsave, buf);
162}
163
164int save_i387_ia32(struct task_struct *tsk,
165 struct _fpstate_ia32 __user *buf,
166 struct pt_regs *regs,
167 int fsave)
168{
169 int err = 0;
170
171 init_fpu(tsk);
172 if (convert_fxsr_to_user(buf, &tsk->thread.i387.fxsave, regs, tsk))
173 return -1;
174 if (fsave)
175 return 0;
176 err |= __put_user(tsk->thread.i387.fxsave.swd, &buf->status);
177 if (fsave)
178 return err ? -1 : 1;
179 err |= __put_user(X86_FXSR_MAGIC, &buf->magic);
180 err |= __copy_to_user(&buf->_fxsr_env[0], &tsk->thread.i387.fxsave,
181 sizeof(struct i387_fxsave_struct));
182 return err ? -1 : 1;
183}
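
fpu32.c goes away with this patch; its core trick was compressing the i387 tag word (two bits per register, 11 = empty) into the FXSAVE tag byte (one bit per register, 1 = in use). A user-space sketch of that helper, using the same bit steps as the deleted code; the input value 0xfffc (only st(0) occupied) is just a worked example:

#include <stdio.h>

static unsigned short twd_i387_to_fxsr(unsigned short twd)
{
	unsigned int tmp;

	tmp = ~twd;				/* empty (11) -> 00, anything else keeps a set bit */
	tmp = (tmp | (tmp >> 1)) & 0x5555;	/* 0V0V0V0V0V0V0V0V */
	tmp = (tmp | (tmp >> 1)) & 0x3333;	/* 00VV00VV00VV00VV */
	tmp = (tmp | (tmp >> 2)) & 0x0f0f;	/* 0000VVVV0000VVVV */
	tmp = (tmp | (tmp >> 4)) & 0x00ff;	/* 00000000VVVVVVVV */
	return tmp;
}

int main(void)
{
	printf("%#x\n", twd_i387_to_fxsr(0xfffc));	/* prints 0x1: only st(0) in use */
	return 0;
}
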
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index f82e1a94fcb7..e4c12079171b 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -25,6 +25,7 @@
25#include <linux/binfmts.h> 25#include <linux/binfmts.h>
26#include <linux/personality.h> 26#include <linux/personality.h>
27#include <linux/init.h> 27#include <linux/init.h>
28#include <linux/jiffies.h>
28 29
29#include <asm/system.h> 30#include <asm/system.h>
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
@@ -36,61 +37,67 @@
36#undef WARN_OLD 37#undef WARN_OLD
37#undef CORE_DUMP /* probably broken */ 38#undef CORE_DUMP /* probably broken */
38 39
39static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs); 40static int load_aout_binary(struct linux_binprm *, struct pt_regs *regs);
40static int load_aout_library(struct file*); 41static int load_aout_library(struct file *);
41 42
42#ifdef CORE_DUMP 43#ifdef CORE_DUMP
43static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit); 44static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file,
45 unsigned long limit);
44 46
45/* 47/*
46 * fill in the user structure for a core dump.. 48 * fill in the user structure for a core dump..
47 */ 49 */
48static void dump_thread32(struct pt_regs * regs, struct user32 * dump) 50static void dump_thread32(struct pt_regs *regs, struct user32 *dump)
49{ 51{
50 u32 fs,gs; 52 u32 fs, gs;
51 53
52/* changed the size calculations - should hopefully work better. lbt */ 54/* changed the size calculations - should hopefully work better. lbt */
53 dump->magic = CMAGIC; 55 dump->magic = CMAGIC;
54 dump->start_code = 0; 56 dump->start_code = 0;
55 dump->start_stack = regs->rsp & ~(PAGE_SIZE - 1); 57 dump->start_stack = regs->sp & ~(PAGE_SIZE - 1);
56 dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; 58 dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
57 dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; 59 dump->u_dsize = ((unsigned long)
60 (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
58 dump->u_dsize -= dump->u_tsize; 61 dump->u_dsize -= dump->u_tsize;
59 dump->u_ssize = 0; 62 dump->u_ssize = 0;
60 dump->u_debugreg[0] = current->thread.debugreg0; 63 dump->u_debugreg[0] = current->thread.debugreg0;
61 dump->u_debugreg[1] = current->thread.debugreg1; 64 dump->u_debugreg[1] = current->thread.debugreg1;
62 dump->u_debugreg[2] = current->thread.debugreg2; 65 dump->u_debugreg[2] = current->thread.debugreg2;
63 dump->u_debugreg[3] = current->thread.debugreg3; 66 dump->u_debugreg[3] = current->thread.debugreg3;
64 dump->u_debugreg[4] = 0; 67 dump->u_debugreg[4] = 0;
65 dump->u_debugreg[5] = 0; 68 dump->u_debugreg[5] = 0;
66 dump->u_debugreg[6] = current->thread.debugreg6; 69 dump->u_debugreg[6] = current->thread.debugreg6;
67 dump->u_debugreg[7] = current->thread.debugreg7; 70 dump->u_debugreg[7] = current->thread.debugreg7;
68 71
69 if (dump->start_stack < 0xc0000000) 72 if (dump->start_stack < 0xc0000000) {
70 dump->u_ssize = ((unsigned long) (0xc0000000 - dump->start_stack)) >> PAGE_SHIFT; 73 unsigned long tmp;
71 74
72 dump->regs.ebx = regs->rbx; 75 tmp = (unsigned long) (0xc0000000 - dump->start_stack);
73 dump->regs.ecx = regs->rcx; 76 dump->u_ssize = tmp >> PAGE_SHIFT;
74 dump->regs.edx = regs->rdx; 77 }
75 dump->regs.esi = regs->rsi; 78
76 dump->regs.edi = regs->rdi; 79 dump->regs.bx = regs->bx;
77 dump->regs.ebp = regs->rbp; 80 dump->regs.cx = regs->cx;
78 dump->regs.eax = regs->rax; 81 dump->regs.dx = regs->dx;
82 dump->regs.si = regs->si;
83 dump->regs.di = regs->di;
84 dump->regs.bp = regs->bp;
85 dump->regs.ax = regs->ax;
79 dump->regs.ds = current->thread.ds; 86 dump->regs.ds = current->thread.ds;
80 dump->regs.es = current->thread.es; 87 dump->regs.es = current->thread.es;
81 asm("movl %%fs,%0" : "=r" (fs)); dump->regs.fs = fs; 88 asm("movl %%fs,%0" : "=r" (fs)); dump->regs.fs = fs;
82 asm("movl %%gs,%0" : "=r" (gs)); dump->regs.gs = gs; 89 asm("movl %%gs,%0" : "=r" (gs)); dump->regs.gs = gs;
83 dump->regs.orig_eax = regs->orig_rax; 90 dump->regs.orig_ax = regs->orig_ax;
84 dump->regs.eip = regs->rip; 91 dump->regs.ip = regs->ip;
85 dump->regs.cs = regs->cs; 92 dump->regs.cs = regs->cs;
86 dump->regs.eflags = regs->eflags; 93 dump->regs.flags = regs->flags;
87 dump->regs.esp = regs->rsp; 94 dump->regs.sp = regs->sp;
88 dump->regs.ss = regs->ss; 95 dump->regs.ss = regs->ss;
89 96
90#if 1 /* FIXME */ 97#if 1 /* FIXME */
91 dump->u_fpvalid = 0; 98 dump->u_fpvalid = 0;
92#else 99#else
93 dump->u_fpvalid = dump_fpu (regs, &dump->i387); 100 dump->u_fpvalid = dump_fpu(regs, &dump->i387);
94#endif 101#endif
95} 102}
96 103
@@ -128,15 +135,19 @@ static int dump_write(struct file *file, const void *addr, int nr)
128 return file->f_op->write(file, addr, nr, &file->f_pos) == nr; 135 return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
129} 136}
130 137
131#define DUMP_WRITE(addr, nr) \ 138#define DUMP_WRITE(addr, nr) \
132 if (!dump_write(file, (void *)(addr), (nr))) \ 139 if (!dump_write(file, (void *)(addr), (nr))) \
133 goto end_coredump; 140 goto end_coredump;
134 141
135#define DUMP_SEEK(offset) \ 142#define DUMP_SEEK(offset) \
136if (file->f_op->llseek) { \ 143 if (file->f_op->llseek) { \
137 if (file->f_op->llseek(file,(offset),0) != (offset)) \ 144 if (file->f_op->llseek(file, (offset), 0) != (offset)) \
138 goto end_coredump; \ 145 goto end_coredump; \
139} else file->f_pos = (offset) 146 } else \
147 file->f_pos = (offset)
148
149#define START_DATA() (u.u_tsize << PAGE_SHIFT)
150#define START_STACK(u) (u.start_stack)
140 151
141/* 152/*
142 * Routine writes a core dump image in the current directory. 153 * Routine writes a core dump image in the current directory.
@@ -148,62 +159,70 @@ if (file->f_op->llseek) { \
148 * dumping of the process results in another error.. 159 * dumping of the process results in another error..
149 */ 160 */
150 161
151static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit) 162static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file,
163 unsigned long limit)
152{ 164{
153 mm_segment_t fs; 165 mm_segment_t fs;
154 int has_dumped = 0; 166 int has_dumped = 0;
155 unsigned long dump_start, dump_size; 167 unsigned long dump_start, dump_size;
156 struct user32 dump; 168 struct user32 dump;
157# define START_DATA(u) (u.u_tsize << PAGE_SHIFT)
158# define START_STACK(u) (u.start_stack)
159 169
160 fs = get_fs(); 170 fs = get_fs();
161 set_fs(KERNEL_DS); 171 set_fs(KERNEL_DS);
162 has_dumped = 1; 172 has_dumped = 1;
163 current->flags |= PF_DUMPCORE; 173 current->flags |= PF_DUMPCORE;
164 strncpy(dump.u_comm, current->comm, sizeof(current->comm)); 174 strncpy(dump.u_comm, current->comm, sizeof(current->comm));
165 dump.u_ar0 = (u32)(((unsigned long)(&dump.regs)) - ((unsigned long)(&dump))); 175 dump.u_ar0 = (u32)(((unsigned long)(&dump.regs)) -
176 ((unsigned long)(&dump)));
166 dump.signal = signr; 177 dump.signal = signr;
167 dump_thread32(regs, &dump); 178 dump_thread32(regs, &dump);
168 179
169/* If the size of the dump file exceeds the rlimit, then see what would happen 180 /*
170 if we wrote the stack, but not the data area. */ 181 * If the size of the dump file exceeds the rlimit, then see
182 * what would happen if we wrote the stack, but not the data
183 * area.
184 */
171 if ((dump.u_dsize + dump.u_ssize + 1) * PAGE_SIZE > limit) 185 if ((dump.u_dsize + dump.u_ssize + 1) * PAGE_SIZE > limit)
172 dump.u_dsize = 0; 186 dump.u_dsize = 0;
173 187
174/* Make sure we have enough room to write the stack and data areas. */ 188 /* Make sure we have enough room to write the stack and data areas. */
175 if ((dump.u_ssize + 1) * PAGE_SIZE > limit) 189 if ((dump.u_ssize + 1) * PAGE_SIZE > limit)
176 dump.u_ssize = 0; 190 dump.u_ssize = 0;
177 191
178/* make sure we actually have a data and stack area to dump */ 192 /* make sure we actually have a data and stack area to dump */
179 set_fs(USER_DS); 193 set_fs(USER_DS);
180 if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_DATA(dump), dump.u_dsize << PAGE_SHIFT)) 194 if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_DATA(dump),
195 dump.u_dsize << PAGE_SHIFT))
181 dump.u_dsize = 0; 196 dump.u_dsize = 0;
182 if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_STACK(dump), dump.u_ssize << PAGE_SHIFT)) 197 if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_STACK(dump),
198 dump.u_ssize << PAGE_SHIFT))
183 dump.u_ssize = 0; 199 dump.u_ssize = 0;
184 200
185 set_fs(KERNEL_DS); 201 set_fs(KERNEL_DS);
186/* struct user */ 202 /* struct user */
187 DUMP_WRITE(&dump,sizeof(dump)); 203 DUMP_WRITE(&dump, sizeof(dump));
188/* Now dump all of the user data. Include malloced stuff as well */ 204 /* Now dump all of the user data. Include malloced stuff as well */
189 DUMP_SEEK(PAGE_SIZE); 205 DUMP_SEEK(PAGE_SIZE);
190/* now we start writing out the user space info */ 206 /* now we start writing out the user space info */
191 set_fs(USER_DS); 207 set_fs(USER_DS);
192/* Dump the data area */ 208 /* Dump the data area */
193 if (dump.u_dsize != 0) { 209 if (dump.u_dsize != 0) {
194 dump_start = START_DATA(dump); 210 dump_start = START_DATA(dump);
195 dump_size = dump.u_dsize << PAGE_SHIFT; 211 dump_size = dump.u_dsize << PAGE_SHIFT;
196 DUMP_WRITE(dump_start,dump_size); 212 DUMP_WRITE(dump_start, dump_size);
197 } 213 }
198/* Now prepare to dump the stack area */ 214 /* Now prepare to dump the stack area */
199 if (dump.u_ssize != 0) { 215 if (dump.u_ssize != 0) {
200 dump_start = START_STACK(dump); 216 dump_start = START_STACK(dump);
201 dump_size = dump.u_ssize << PAGE_SHIFT; 217 dump_size = dump.u_ssize << PAGE_SHIFT;
202 DUMP_WRITE(dump_start,dump_size); 218 DUMP_WRITE(dump_start, dump_size);
203 } 219 }
204/* Finally dump the task struct. Not be used by gdb, but could be useful */ 220 /*
221 * Finally dump the task struct. Not be used by gdb, but
222 * could be useful
223 */
205 set_fs(KERNEL_DS); 224 set_fs(KERNEL_DS);
206 DUMP_WRITE(current,sizeof(*current)); 225 DUMP_WRITE(current, sizeof(*current));
207end_coredump: 226end_coredump:
208 set_fs(fs); 227 set_fs(fs);
209 return has_dumped; 228 return has_dumped;
@@ -217,35 +236,34 @@ end_coredump:
217 */ 236 */
218static u32 __user *create_aout_tables(char __user *p, struct linux_binprm *bprm) 237static u32 __user *create_aout_tables(char __user *p, struct linux_binprm *bprm)
219{ 238{
220 u32 __user *argv; 239 u32 __user *argv, *envp, *sp;
221 u32 __user *envp; 240 int argc = bprm->argc, envc = bprm->envc;
222 u32 __user *sp;
223 int argc = bprm->argc;
224 int envc = bprm->envc;
225 241
226 sp = (u32 __user *) ((-(unsigned long)sizeof(u32)) & (unsigned long) p); 242 sp = (u32 __user *) ((-(unsigned long)sizeof(u32)) & (unsigned long) p);
227 sp -= envc+1; 243 sp -= envc+1;
228 envp = sp; 244 envp = sp;
229 sp -= argc+1; 245 sp -= argc+1;
230 argv = sp; 246 argv = sp;
231 put_user((unsigned long) envp,--sp); 247 put_user((unsigned long) envp, --sp);
232 put_user((unsigned long) argv,--sp); 248 put_user((unsigned long) argv, --sp);
233 put_user(argc,--sp); 249 put_user(argc, --sp);
234 current->mm->arg_start = (unsigned long) p; 250 current->mm->arg_start = (unsigned long) p;
235 while (argc-->0) { 251 while (argc-- > 0) {
236 char c; 252 char c;
237 put_user((u32)(unsigned long)p,argv++); 253
254 put_user((u32)(unsigned long)p, argv++);
238 do { 255 do {
239 get_user(c,p++); 256 get_user(c, p++);
240 } while (c); 257 } while (c);
241 } 258 }
242 put_user(0, argv); 259 put_user(0, argv);
243 current->mm->arg_end = current->mm->env_start = (unsigned long) p; 260 current->mm->arg_end = current->mm->env_start = (unsigned long) p;
244 while (envc-->0) { 261 while (envc-- > 0) {
245 char c; 262 char c;
246 put_user((u32)(unsigned long)p,envp++); 263
264 put_user((u32)(unsigned long)p, envp++);
247 do { 265 do {
248 get_user(c,p++); 266 get_user(c, p++);
249 } while (c); 267 } while (c);
250 } 268 }
251 put_user(0, envp); 269 put_user(0, envp);
@@ -257,20 +275,18 @@ static u32 __user *create_aout_tables(char __user *p, struct linux_binprm *bprm)
257 * These are the functions used to load a.out style executables and shared 275 * These are the functions used to load a.out style executables and shared
258 * libraries. There is no binary dependent code anywhere else. 276 * libraries. There is no binary dependent code anywhere else.
259 */ 277 */
260 278static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs)
261static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
262{ 279{
280 unsigned long error, fd_offset, rlim;
263 struct exec ex; 281 struct exec ex;
264 unsigned long error;
265 unsigned long fd_offset;
266 unsigned long rlim;
267 int retval; 282 int retval;
268 283
269 ex = *((struct exec *) bprm->buf); /* exec-header */ 284 ex = *((struct exec *) bprm->buf); /* exec-header */
270 if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC && 285 if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC &&
271 N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) || 286 N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) ||
272 N_TRSIZE(ex) || N_DRSIZE(ex) || 287 N_TRSIZE(ex) || N_DRSIZE(ex) ||
273 i_size_read(bprm->file->f_path.dentry->d_inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { 288 i_size_read(bprm->file->f_path.dentry->d_inode) <
289 ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
274 return -ENOEXEC; 290 return -ENOEXEC;
275 } 291 }
276 292
@@ -291,13 +307,13 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
291 if (retval) 307 if (retval)
292 return retval; 308 return retval;
293 309
294 regs->cs = __USER32_CS; 310 regs->cs = __USER32_CS;
295 regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = 311 regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
296 regs->r13 = regs->r14 = regs->r15 = 0; 312 regs->r13 = regs->r14 = regs->r15 = 0;
297 313
298 /* OK, This is the point of no return */ 314 /* OK, This is the point of no return */
299 set_personality(PER_LINUX); 315 set_personality(PER_LINUX);
300 set_thread_flag(TIF_IA32); 316 set_thread_flag(TIF_IA32);
301 clear_thread_flag(TIF_ABI_PENDING); 317 clear_thread_flag(TIF_ABI_PENDING);
302 318
303 current->mm->end_code = ex.a_text + 319 current->mm->end_code = ex.a_text +
@@ -311,7 +327,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
311 327
312 current->mm->mmap = NULL; 328 current->mm->mmap = NULL;
313 compute_creds(bprm); 329 compute_creds(bprm);
314 current->flags &= ~PF_FORKNOEXEC; 330 current->flags &= ~PF_FORKNOEXEC;
315 331
316 if (N_MAGIC(ex) == OMAGIC) { 332 if (N_MAGIC(ex) == OMAGIC) {
317 unsigned long text_addr, map_size; 333 unsigned long text_addr, map_size;
@@ -338,30 +354,31 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
338 send_sig(SIGKILL, current, 0); 354 send_sig(SIGKILL, current, 0);
339 return error; 355 return error;
340 } 356 }
341 357
342 flush_icache_range(text_addr, text_addr+ex.a_text+ex.a_data); 358 flush_icache_range(text_addr, text_addr+ex.a_text+ex.a_data);
343 } else { 359 } else {
344#ifdef WARN_OLD 360#ifdef WARN_OLD
345 static unsigned long error_time, error_time2; 361 static unsigned long error_time, error_time2;
346 if ((ex.a_text & 0xfff || ex.a_data & 0xfff) && 362 if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
347 (N_MAGIC(ex) != NMAGIC) && (jiffies-error_time2) > 5*HZ) 363 (N_MAGIC(ex) != NMAGIC) &&
348 { 364 time_after(jiffies, error_time2 + 5*HZ)) {
349 printk(KERN_NOTICE "executable not page aligned\n"); 365 printk(KERN_NOTICE "executable not page aligned\n");
350 error_time2 = jiffies; 366 error_time2 = jiffies;
351 } 367 }
352 368
353 if ((fd_offset & ~PAGE_MASK) != 0 && 369 if ((fd_offset & ~PAGE_MASK) != 0 &&
354 (jiffies-error_time) > 5*HZ) 370 time_after(jiffies, error_time + 5*HZ)) {
355 { 371 printk(KERN_WARNING
356 printk(KERN_WARNING 372 "fd_offset is not page aligned. Please convert "
357 "fd_offset is not page aligned. Please convert program: %s\n", 373 "program: %s\n",
358 bprm->file->f_path.dentry->d_name.name); 374 bprm->file->f_path.dentry->d_name.name);
359 error_time = jiffies; 375 error_time = jiffies;
360 } 376 }
361#endif 377#endif
362 378
363 if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) { 379 if (!bprm->file->f_op->mmap || (fd_offset & ~PAGE_MASK) != 0) {
364 loff_t pos = fd_offset; 380 loff_t pos = fd_offset;
381
365 down_write(&current->mm->mmap_sem); 382 down_write(&current->mm->mmap_sem);
366 do_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); 383 do_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
367 up_write(&current->mm->mmap_sem); 384 up_write(&current->mm->mmap_sem);
@@ -376,9 +393,10 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
376 393
377 down_write(&current->mm->mmap_sem); 394 down_write(&current->mm->mmap_sem);
378 error = do_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, 395 error = do_mmap(bprm->file, N_TXTADDR(ex), ex.a_text,
379 PROT_READ | PROT_EXEC, 396 PROT_READ | PROT_EXEC,
380 MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE | MAP_32BIT, 397 MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE |
381 fd_offset); 398 MAP_EXECUTABLE | MAP_32BIT,
399 fd_offset);
382 up_write(&current->mm->mmap_sem); 400 up_write(&current->mm->mmap_sem);
383 401
384 if (error != N_TXTADDR(ex)) { 402 if (error != N_TXTADDR(ex)) {
@@ -387,9 +405,10 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
387 } 405 }
388 406
389 down_write(&current->mm->mmap_sem); 407 down_write(&current->mm->mmap_sem);
390 error = do_mmap(bprm->file, N_DATADDR(ex), ex.a_data, 408 error = do_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
391 PROT_READ | PROT_WRITE | PROT_EXEC, 409 PROT_READ | PROT_WRITE | PROT_EXEC,
392 MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE | MAP_32BIT, 410 MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE |
411 MAP_EXECUTABLE | MAP_32BIT,
393 fd_offset + ex.a_text); 412 fd_offset + ex.a_text);
394 up_write(&current->mm->mmap_sem); 413 up_write(&current->mm->mmap_sem);
395 if (error != N_DATADDR(ex)) { 414 if (error != N_DATADDR(ex)) {
@@ -403,9 +422,9 @@ beyond_if:
403 set_brk(current->mm->start_brk, current->mm->brk); 422 set_brk(current->mm->start_brk, current->mm->brk);
404 423
405 retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT); 424 retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT);
406 if (retval < 0) { 425 if (retval < 0) {
407 /* Someone check-me: is this error path enough? */ 426 /* Someone check-me: is this error path enough? */
408 send_sig(SIGKILL, current, 0); 427 send_sig(SIGKILL, current, 0);
409 return retval; 428 return retval;
410 } 429 }
411 430
@@ -414,10 +433,10 @@ beyond_if:
414 /* start thread */ 433 /* start thread */
415 asm volatile("movl %0,%%fs" :: "r" (0)); \ 434 asm volatile("movl %0,%%fs" :: "r" (0)); \
416 asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS)); 435 asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS));
417 load_gs_index(0); 436 load_gs_index(0);
418 (regs)->rip = ex.a_entry; 437 (regs)->ip = ex.a_entry;
419 (regs)->rsp = current->mm->start_stack; 438 (regs)->sp = current->mm->start_stack;
420 (regs)->eflags = 0x200; 439 (regs)->flags = 0x200;
421 (regs)->cs = __USER32_CS; 440 (regs)->cs = __USER32_CS;
422 (regs)->ss = __USER32_DS; 441 (regs)->ss = __USER32_DS;
423 regs->r8 = regs->r9 = regs->r10 = regs->r11 = 442 regs->r8 = regs->r9 = regs->r10 = regs->r11 =
@@ -425,7 +444,7 @@ beyond_if:
425 set_fs(USER_DS); 444 set_fs(USER_DS);
426 if (unlikely(current->ptrace & PT_PTRACED)) { 445 if (unlikely(current->ptrace & PT_PTRACED)) {
427 if (current->ptrace & PT_TRACE_EXEC) 446 if (current->ptrace & PT_TRACE_EXEC)
428 ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP); 447 ptrace_notify((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
429 else 448 else
430 send_sig(SIGTRAP, current, 0); 449 send_sig(SIGTRAP, current, 0);
431 } 450 }
@@ -434,9 +453,8 @@ beyond_if:
434 453
435static int load_aout_library(struct file *file) 454static int load_aout_library(struct file *file)
436{ 455{
437 struct inode * inode; 456 struct inode *inode;
438 unsigned long bss, start_addr, len; 457 unsigned long bss, start_addr, len, error;
439 unsigned long error;
440 int retval; 458 int retval;
441 struct exec ex; 459 struct exec ex;
442 460
@@ -450,7 +468,8 @@ static int load_aout_library(struct file *file)
450 /* We come in here for the regular a.out style of shared libraries */ 468 /* We come in here for the regular a.out style of shared libraries */
451 if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) || 469 if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) ||
452 N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) || 470 N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) ||
453 i_size_read(inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { 471 i_size_read(inode) <
472 ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
454 goto out; 473 goto out;
455 } 474 }
456 475
@@ -467,10 +486,10 @@ static int load_aout_library(struct file *file)
467 486
468#ifdef WARN_OLD 487#ifdef WARN_OLD
469 static unsigned long error_time; 488 static unsigned long error_time;
470 if ((jiffies-error_time) > 5*HZ) 489 if (time_after(jiffies, error_time + 5*HZ)) {
471 { 490 printk(KERN_WARNING
472 printk(KERN_WARNING 491 "N_TXTOFF is not page aligned. Please convert "
473 "N_TXTOFF is not page aligned. Please convert library: %s\n", 492 "library: %s\n",
474 file->f_path.dentry->d_name.name); 493 file->f_path.dentry->d_name.name);
475 error_time = jiffies; 494 error_time = jiffies;
476 } 495 }
@@ -478,11 +497,12 @@ static int load_aout_library(struct file *file)
478 down_write(&current->mm->mmap_sem); 497 down_write(&current->mm->mmap_sem);
479 do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); 498 do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
480 up_write(&current->mm->mmap_sem); 499 up_write(&current->mm->mmap_sem);
481 500
482 file->f_op->read(file, (char __user *)start_addr, 501 file->f_op->read(file, (char __user *)start_addr,
483 ex.a_text + ex.a_data, &pos); 502 ex.a_text + ex.a_data, &pos);
484 flush_icache_range((unsigned long) start_addr, 503 flush_icache_range((unsigned long) start_addr,
485 (unsigned long) start_addr + ex.a_text + ex.a_data); 504 (unsigned long) start_addr + ex.a_text +
505 ex.a_data);
486 506
487 retval = 0; 507 retval = 0;
488 goto out; 508 goto out;
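
The open-coded "(jiffies - error_time) > 5*HZ" rate limits are rewritten with time_after(). The idiom's signed-difference trick keeps giving the right answer across the 32-bit wrap, which a raw comparison against a precomputed deadline would not. A sketch; the macro body mirrors the one in <linux/jiffies.h> and the counter values are contrived to sit around the wrap point:

#include <stdio.h>
#include <stdint.h>

#define time_after(a, b) ((int32_t)((b) - (a)) < 0)	/* mirrors <linux/jiffies.h> */

int main(void)
{
	const uint32_t HZ = 250;			/* illustrative tick rate */
	uint32_t error_time = 0xfffff000u;		/* last warning, shortly before the wrap */
	uint32_t jiffies    = 0x00000200u;		/* now: the counter has wrapped, ~18s later */

	/* Comparing raw values against the deadline breaks once jiffies wraps... */
	printf("raw compare: %d\n", jiffies > error_time + 5 * HZ);		/* 0 (wrong) */
	/* ...the signed-difference form still sees that the deadline has passed. */
	printf("time_after : %d\n", time_after(jiffies, error_time + 5 * HZ));	/* 1 */
	return 0;
}
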
diff --git a/arch/x86/ia32/ia32_binfmt.c b/arch/x86/ia32/ia32_binfmt.c
deleted file mode 100644
index 55822d2cf053..000000000000
--- a/arch/x86/ia32/ia32_binfmt.c
+++ /dev/null
@@ -1,285 +0,0 @@
1/*
2 * Written 2000,2002 by Andi Kleen.
3 *
4 * Loosely based on the sparc64 and IA64 32bit emulation loaders.
5 * This tricks binfmt_elf.c into loading 32bit binaries using lots
6 * of ugly preprocessor tricks. Talk about very very poor man's inheritance.
7 */
8
9#include <linux/types.h>
10#include <linux/stddef.h>
11#include <linux/rwsem.h>
12#include <linux/sched.h>
13#include <linux/compat.h>
14#include <linux/string.h>
15#include <linux/binfmts.h>
16#include <linux/mm.h>
17#include <linux/security.h>
18#include <linux/elfcore-compat.h>
19
20#include <asm/segment.h>
21#include <asm/ptrace.h>
22#include <asm/processor.h>
23#include <asm/user32.h>
24#include <asm/sigcontext32.h>
25#include <asm/fpu32.h>
26#include <asm/i387.h>
27#include <asm/uaccess.h>
28#include <asm/ia32.h>
29#include <asm/vsyscall32.h>
30
31#undef ELF_ARCH
32#undef ELF_CLASS
33#define ELF_CLASS ELFCLASS32
34#define ELF_ARCH EM_386
35
36#undef elfhdr
37#undef elf_phdr
38#undef elf_note
39#undef elf_addr_t
40#define elfhdr elf32_hdr
41#define elf_phdr elf32_phdr
42#define elf_note elf32_note
43#define elf_addr_t Elf32_Off
44
45#define ELF_NAME "elf/i386"
46
47#define AT_SYSINFO 32
48#define AT_SYSINFO_EHDR 33
49
50int sysctl_vsyscall32 = 1;
51
52#undef ARCH_DLINFO
53#define ARCH_DLINFO do { \
54 if (sysctl_vsyscall32) { \
55 current->mm->context.vdso = (void *)VSYSCALL32_BASE; \
56 NEW_AUX_ENT(AT_SYSINFO, (u32)(u64)VSYSCALL32_VSYSCALL); \
57 NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL32_BASE); \
58 } \
59} while(0)
60
61struct file;
62
63#define IA32_EMULATOR 1
64
65#undef ELF_ET_DYN_BASE
66
67#define ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000)
68
69#define jiffies_to_timeval(a,b) do { (b)->tv_usec = 0; (b)->tv_sec = (a)/HZ; }while(0)
70
71#define _GET_SEG(x) \
72 ({ __u32 seg; asm("movl %%" __stringify(x) ",%0" : "=r"(seg)); seg; })
73
74/* Assumes current==process to be dumped */
75#undef ELF_CORE_COPY_REGS
76#define ELF_CORE_COPY_REGS(pr_reg, regs) \
77 pr_reg[0] = regs->rbx; \
78 pr_reg[1] = regs->rcx; \
79 pr_reg[2] = regs->rdx; \
80 pr_reg[3] = regs->rsi; \
81 pr_reg[4] = regs->rdi; \
82 pr_reg[5] = regs->rbp; \
83 pr_reg[6] = regs->rax; \
84 pr_reg[7] = _GET_SEG(ds); \
85 pr_reg[8] = _GET_SEG(es); \
86 pr_reg[9] = _GET_SEG(fs); \
87 pr_reg[10] = _GET_SEG(gs); \
88 pr_reg[11] = regs->orig_rax; \
89 pr_reg[12] = regs->rip; \
90 pr_reg[13] = regs->cs; \
91 pr_reg[14] = regs->eflags; \
92 pr_reg[15] = regs->rsp; \
93 pr_reg[16] = regs->ss;
94
95
96#define elf_prstatus compat_elf_prstatus
97#define elf_prpsinfo compat_elf_prpsinfo
98#define elf_fpregset_t struct user_i387_ia32_struct
99#define elf_fpxregset_t struct user32_fxsr_struct
100#define user user32
101
102#undef elf_read_implies_exec
103#define elf_read_implies_exec(ex, executable_stack) (executable_stack != EXSTACK_DISABLE_X)
104
105#define elf_core_copy_regs elf32_core_copy_regs
106static inline void elf32_core_copy_regs(compat_elf_gregset_t *elfregs,
107 struct pt_regs *regs)
108{
109 ELF_CORE_COPY_REGS((&elfregs->ebx), regs)
110}
111
112#define elf_core_copy_task_regs elf32_core_copy_task_regs
113static inline int elf32_core_copy_task_regs(struct task_struct *t,
114 compat_elf_gregset_t* elfregs)
115{
116 struct pt_regs *pp = task_pt_regs(t);
117 ELF_CORE_COPY_REGS((&elfregs->ebx), pp);
118 /* fix wrong segments */
119 elfregs->ds = t->thread.ds;
120 elfregs->fs = t->thread.fsindex;
121 elfregs->gs = t->thread.gsindex;
122 elfregs->es = t->thread.es;
123 return 1;
124}
125
126#define elf_core_copy_task_fpregs elf32_core_copy_task_fpregs
127static inline int
128elf32_core_copy_task_fpregs(struct task_struct *tsk, struct pt_regs *regs,
129 elf_fpregset_t *fpu)
130{
131 struct _fpstate_ia32 *fpstate = (void*)fpu;
132 mm_segment_t oldfs = get_fs();
133
134 if (!tsk_used_math(tsk))
135 return 0;
136 if (!regs)
137 regs = task_pt_regs(tsk);
138 if (tsk == current)
139 unlazy_fpu(tsk);
140 set_fs(KERNEL_DS);
141 save_i387_ia32(tsk, fpstate, regs, 1);
142 /* Correct for i386 bug. It puts the fop into the upper 16bits of
143 the tag word (like FXSAVE), not into the fcs*/
144 fpstate->cssel |= fpstate->tag & 0xffff0000;
145 set_fs(oldfs);
146 return 1;
147}
148
149#define ELF_CORE_COPY_XFPREGS 1
150#define ELF_CORE_XFPREG_TYPE NT_PRXFPREG
151#define elf_core_copy_task_xfpregs elf32_core_copy_task_xfpregs
152static inline int
153elf32_core_copy_task_xfpregs(struct task_struct *t, elf_fpxregset_t *xfpu)
154{
155 struct pt_regs *regs = task_pt_regs(t);
156 if (!tsk_used_math(t))
157 return 0;
158 if (t == current)
159 unlazy_fpu(t);
160 memcpy(xfpu, &t->thread.i387.fxsave, sizeof(elf_fpxregset_t));
161 xfpu->fcs = regs->cs;
162 xfpu->fos = t->thread.ds; /* right? */
163 return 1;
164}
165
166#undef elf_check_arch
167#define elf_check_arch(x) \
168 ((x)->e_machine == EM_386)
169
170extern int force_personality32;
171
172#undef ELF_EXEC_PAGESIZE
173#undef ELF_HWCAP
174#undef ELF_PLATFORM
175#undef SET_PERSONALITY
176#define ELF_EXEC_PAGESIZE PAGE_SIZE
177#define ELF_HWCAP (boot_cpu_data.x86_capability[0])
178#define ELF_PLATFORM ("i686")
179#define SET_PERSONALITY(ex, ibcs2) \
180do { \
181 unsigned long new_flags = 0; \
182 if ((ex).e_ident[EI_CLASS] == ELFCLASS32) \
183 new_flags = _TIF_IA32; \
184 if ((current_thread_info()->flags & _TIF_IA32) \
185 != new_flags) \
186 set_thread_flag(TIF_ABI_PENDING); \
187 else \
188 clear_thread_flag(TIF_ABI_PENDING); \
189 /* XXX This overwrites the user set personality */ \
190 current->personality |= force_personality32; \
191} while (0)
192
193/* Override some function names */
194#define elf_format elf32_format
195
196#define init_elf_binfmt init_elf32_binfmt
197#define exit_elf_binfmt exit_elf32_binfmt
198
199#define load_elf_binary load_elf32_binary
200
201#undef ELF_PLAT_INIT
202#define ELF_PLAT_INIT(r, load_addr) elf32_init(r)
203
204#undef start_thread
205#define start_thread(regs,new_rip,new_rsp) do { \
206 asm volatile("movl %0,%%fs" :: "r" (0)); \
207 asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS)); \
208 load_gs_index(0); \
209 (regs)->rip = (new_rip); \
210 (regs)->rsp = (new_rsp); \
211 (regs)->eflags = 0x200; \
212 (regs)->cs = __USER32_CS; \
213 (regs)->ss = __USER32_DS; \
214 set_fs(USER_DS); \
215} while(0)
216
217
218#include <linux/module.h>
219
220MODULE_DESCRIPTION("Binary format loader for compatibility with IA32 ELF binaries.");
221MODULE_AUTHOR("Eric Youngdale, Andi Kleen");
222
223#undef MODULE_DESCRIPTION
224#undef MODULE_AUTHOR
225
226static void elf32_init(struct pt_regs *);
227
228#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
229#define arch_setup_additional_pages syscall32_setup_pages
230extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
231
232#include "../../../fs/binfmt_elf.c"
233
234static void elf32_init(struct pt_regs *regs)
235{
236 struct task_struct *me = current;
237 regs->rdi = 0;
238 regs->rsi = 0;
239 regs->rdx = 0;
240 regs->rcx = 0;
241 regs->rax = 0;
242 regs->rbx = 0;
243 regs->rbp = 0;
244 regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
245 regs->r13 = regs->r14 = regs->r15 = 0;
246 me->thread.fs = 0;
247 me->thread.gs = 0;
248 me->thread.fsindex = 0;
249 me->thread.gsindex = 0;
250 me->thread.ds = __USER_DS;
251 me->thread.es = __USER_DS;
252}
253
254#ifdef CONFIG_SYSCTL
255/* Register vsyscall32 into the ABI table */
256#include <linux/sysctl.h>
257
258static ctl_table abi_table2[] = {
259 {
260 .procname = "vsyscall32",
261 .data = &sysctl_vsyscall32,
262 .maxlen = sizeof(int),
263 .mode = 0644,
264 .proc_handler = proc_dointvec
265 },
266 {}
267};
268
269static ctl_table abi_root_table2[] = {
270 {
271 .ctl_name = CTL_ABI,
272 .procname = "abi",
273 .mode = 0555,
274 .child = abi_table2
275 },
276 {}
277};
278
279static __init int ia32_binfmt_init(void)
280{
281 register_sysctl_table(abi_root_table2);
282 return 0;
283}
284__initcall(ia32_binfmt_init);
285#endif
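
The deleted loader's SET_PERSONALITY decides whether the exec'ing task changes ABI: a 32-bit ELF wants _TIF_IA32 set, and any mismatch with the task's current flags is recorded as TIF_ABI_PENDING so the switch happens later. A compressed sketch of just that decision; the helper function and the flag's bit position are illustrative, only the comparison mirrors the macro:

#include <stdbool.h>
#include <stdio.h>

enum { ELFCLASS32 = 1, ELFCLASS64 = 2 };	/* standard ELF class values */
#define _TIF_IA32 (1u << 17)			/* illustrative bit, not the real position */

/* Returns true when TIF_ABI_PENDING would be set, i.e. the new binary's
 * ABI differs from the ABI the task is currently running. */
static bool abi_switch_pending(unsigned int cur_flags, int elf_class)
{
	unsigned int new_flags = (elf_class == ELFCLASS32) ? _TIF_IA32 : 0;

	return (cur_flags & _TIF_IA32) != new_flags;
}

int main(void)
{
	printf("%d\n", abi_switch_pending(0, ELFCLASS32));		/* 64-bit task execs 32-bit ELF: 1 */
	printf("%d\n", abi_switch_pending(_TIF_IA32, ELFCLASS32));	/* already 32-bit: 0 */
	return 0;
}
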
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 6ea19c25f90d..1c0503bdfb1a 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -29,9 +29,8 @@
29#include <asm/ia32_unistd.h> 29#include <asm/ia32_unistd.h>
30#include <asm/user32.h> 30#include <asm/user32.h>
31#include <asm/sigcontext32.h> 31#include <asm/sigcontext32.h>
32#include <asm/fpu32.h>
33#include <asm/proto.h> 32#include <asm/proto.h>
34#include <asm/vsyscall32.h> 33#include <asm/vdso.h>
35 34
36#define DEBUG_SIG 0 35#define DEBUG_SIG 0
37 36
@@ -43,7 +42,8 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
43int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) 42int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
44{ 43{
45 int err; 44 int err;
46 if (!access_ok (VERIFY_WRITE, to, sizeof(compat_siginfo_t))) 45
46 if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
47 return -EFAULT; 47 return -EFAULT;
48 48
49 /* If you change siginfo_t structure, please make sure that 49 /* If you change siginfo_t structure, please make sure that
@@ -53,16 +53,19 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
53 3 ints plus the relevant union member. */ 53 3 ints plus the relevant union member. */
54 err = __put_user(from->si_signo, &to->si_signo); 54 err = __put_user(from->si_signo, &to->si_signo);
55 err |= __put_user(from->si_errno, &to->si_errno); 55 err |= __put_user(from->si_errno, &to->si_errno);
56 err |= __put_user((short)from->si_code, &to->si_code); 56 err |= __put_user((short)from->si_code, &to->si_code);
57 57
58 if (from->si_code < 0) { 58 if (from->si_code < 0) {
59 err |= __put_user(from->si_pid, &to->si_pid); 59 err |= __put_user(from->si_pid, &to->si_pid);
60 err |= __put_user(from->si_uid, &to->si_uid); 60 err |= __put_user(from->si_uid, &to->si_uid);
61 err |= __put_user(ptr_to_compat(from->si_ptr), &to->si_ptr); 61 err |= __put_user(ptr_to_compat(from->si_ptr), &to->si_ptr);
62 } else { 62 } else {
63 /* First 32bits of unions are always present: 63 /*
64 * si_pid === si_band === si_tid === si_addr(LS half) */ 64 * First 32bits of unions are always present:
65 err |= __put_user(from->_sifields._pad[0], &to->_sifields._pad[0]); 65 * si_pid === si_band === si_tid === si_addr(LS half)
66 */
67 err |= __put_user(from->_sifields._pad[0],
68 &to->_sifields._pad[0]);
66 switch (from->si_code >> 16) { 69 switch (from->si_code >> 16) {
67 case __SI_FAULT >> 16: 70 case __SI_FAULT >> 16:
68 break; 71 break;
@@ -76,14 +79,15 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
76 err |= __put_user(from->si_uid, &to->si_uid); 79 err |= __put_user(from->si_uid, &to->si_uid);
77 break; 80 break;
78 case __SI_POLL >> 16: 81 case __SI_POLL >> 16:
79 err |= __put_user(from->si_fd, &to->si_fd); 82 err |= __put_user(from->si_fd, &to->si_fd);
80 break; 83 break;
81 case __SI_TIMER >> 16: 84 case __SI_TIMER >> 16:
82 err |= __put_user(from->si_overrun, &to->si_overrun); 85 err |= __put_user(from->si_overrun, &to->si_overrun);
83 err |= __put_user(ptr_to_compat(from->si_ptr), 86 err |= __put_user(ptr_to_compat(from->si_ptr),
84 &to->si_ptr); 87 &to->si_ptr);
85 break; 88 break;
86 case __SI_RT >> 16: /* This is not generated by the kernel as of now. */ 89 /* This is not generated by the kernel as of now. */
90 case __SI_RT >> 16:
87 case __SI_MESGQ >> 16: 91 case __SI_MESGQ >> 16:
88 err |= __put_user(from->si_uid, &to->si_uid); 92 err |= __put_user(from->si_uid, &to->si_uid);
89 err |= __put_user(from->si_int, &to->si_int); 93 err |= __put_user(from->si_int, &to->si_int);
@@ -97,7 +101,8 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
97{ 101{
98 int err; 102 int err;
99 u32 ptr32; 103 u32 ptr32;
100 if (!access_ok (VERIFY_READ, from, sizeof(compat_siginfo_t))) 104
105 if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t)))
101 return -EFAULT; 106 return -EFAULT;
102 107
103 err = __get_user(to->si_signo, &from->si_signo); 108 err = __get_user(to->si_signo, &from->si_signo);
@@ -112,8 +117,7 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
112 return err; 117 return err;
113} 118}
114 119
115asmlinkage long 120asmlinkage long sys32_sigsuspend(int history0, int history1, old_sigset_t mask)
116sys32_sigsuspend(int history0, int history1, old_sigset_t mask)
117{ 121{
118 mask &= _BLOCKABLE; 122 mask &= _BLOCKABLE;
119 spin_lock_irq(&current->sighand->siglock); 123 spin_lock_irq(&current->sighand->siglock);
@@ -128,36 +132,37 @@ sys32_sigsuspend(int history0, int history1, old_sigset_t mask)
128 return -ERESTARTNOHAND; 132 return -ERESTARTNOHAND;
129} 133}
130 134
131asmlinkage long 135asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *uss_ptr,
132sys32_sigaltstack(const stack_ia32_t __user *uss_ptr, 136 stack_ia32_t __user *uoss_ptr,
133 stack_ia32_t __user *uoss_ptr, 137 struct pt_regs *regs)
134 struct pt_regs *regs)
135{ 138{
136 stack_t uss,uoss; 139 stack_t uss, uoss;
137 int ret; 140 int ret;
138 mm_segment_t seg; 141 mm_segment_t seg;
139 if (uss_ptr) { 142
143 if (uss_ptr) {
140 u32 ptr; 144 u32 ptr;
141 memset(&uss,0,sizeof(stack_t)); 145
142 if (!access_ok(VERIFY_READ,uss_ptr,sizeof(stack_ia32_t)) || 146 memset(&uss, 0, sizeof(stack_t));
147 if (!access_ok(VERIFY_READ, uss_ptr, sizeof(stack_ia32_t)) ||
143 __get_user(ptr, &uss_ptr->ss_sp) || 148 __get_user(ptr, &uss_ptr->ss_sp) ||
144 __get_user(uss.ss_flags, &uss_ptr->ss_flags) || 149 __get_user(uss.ss_flags, &uss_ptr->ss_flags) ||
145 __get_user(uss.ss_size, &uss_ptr->ss_size)) 150 __get_user(uss.ss_size, &uss_ptr->ss_size))
146 return -EFAULT; 151 return -EFAULT;
147 uss.ss_sp = compat_ptr(ptr); 152 uss.ss_sp = compat_ptr(ptr);
148 } 153 }
149 seg = get_fs(); 154 seg = get_fs();
150 set_fs(KERNEL_DS); 155 set_fs(KERNEL_DS);
151 ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, regs->rsp); 156 ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, regs->sp);
152 set_fs(seg); 157 set_fs(seg);
153 if (ret >= 0 && uoss_ptr) { 158 if (ret >= 0 && uoss_ptr) {
154 if (!access_ok(VERIFY_WRITE,uoss_ptr,sizeof(stack_ia32_t)) || 159 if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(stack_ia32_t)) ||
155 __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) || 160 __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) ||
156 __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) || 161 __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) ||
157 __put_user(uoss.ss_size, &uoss_ptr->ss_size)) 162 __put_user(uoss.ss_size, &uoss_ptr->ss_size))
158 ret = -EFAULT; 163 ret = -EFAULT;
159 } 164 }
160 return ret; 165 return ret;
161} 166}
162 167
163/* 168/*
@@ -186,87 +191,85 @@ struct rt_sigframe
186 char retcode[8]; 191 char retcode[8];
187}; 192};
188 193
189static int 194#define COPY(x) { \
190ia32_restore_sigcontext(struct pt_regs *regs, struct sigcontext_ia32 __user *sc, unsigned int *peax) 195 unsigned int reg; \
196 err |= __get_user(reg, &sc->x); \
197 regs->x = reg; \
198}
199
200#define RELOAD_SEG(seg,mask) \
201 { unsigned int cur; \
202 unsigned short pre; \
203 err |= __get_user(pre, &sc->seg); \
204 asm volatile("movl %%" #seg ",%0" : "=r" (cur)); \
205 pre |= mask; \
206 if (pre != cur) loadsegment(seg, pre); }
207
208static int ia32_restore_sigcontext(struct pt_regs *regs,
209 struct sigcontext_ia32 __user *sc,
210 unsigned int *peax)
191{ 211{
192 unsigned int err = 0; 212 unsigned int tmpflags, gs, oldgs, err = 0;
193 213 struct _fpstate_ia32 __user *buf;
214 u32 tmp;
215
194 /* Always make any pending restarted system calls return -EINTR */ 216 /* Always make any pending restarted system calls return -EINTR */
195 current_thread_info()->restart_block.fn = do_no_restart_syscall; 217 current_thread_info()->restart_block.fn = do_no_restart_syscall;
196 218
197#if DEBUG_SIG 219#if DEBUG_SIG
198 printk("SIG restore_sigcontext: sc=%p err(%x) eip(%x) cs(%x) flg(%x)\n", 220 printk(KERN_DEBUG "SIG restore_sigcontext: "
199 sc, sc->err, sc->eip, sc->cs, sc->eflags); 221 "sc=%p err(%x) eip(%x) cs(%x) flg(%x)\n",
222 sc, sc->err, sc->ip, sc->cs, sc->flags);
200#endif 223#endif
201#define COPY(x) { \
202 unsigned int reg; \
203 err |= __get_user(reg, &sc->e ##x); \
204 regs->r ## x = reg; \
205}
206 224
207#define RELOAD_SEG(seg,mask) \ 225 /*
208 { unsigned int cur; \ 226 * Reload fs and gs if they have changed in the signal
209 unsigned short pre; \ 227 * handler. This does not handle long fs/gs base changes in
210 err |= __get_user(pre, &sc->seg); \ 228 * the handler, but does not clobber them at least in the
211 asm volatile("movl %%" #seg ",%0" : "=r" (cur)); \ 229 * normal case.
212 pre |= mask; \ 230 */
213 if (pre != cur) loadsegment(seg,pre); } 231 err |= __get_user(gs, &sc->gs);
214 232 gs |= 3;
215 /* Reload fs and gs if they have changed in the signal handler. 233 asm("movl %%gs,%0" : "=r" (oldgs));
216 This does not handle long fs/gs base changes in the handler, but 234 if (gs != oldgs)
217 does not clobber them at least in the normal case. */ 235 load_gs_index(gs);
218 236
219 { 237 RELOAD_SEG(fs, 3);
220 unsigned gs, oldgs; 238 RELOAD_SEG(ds, 3);
221 err |= __get_user(gs, &sc->gs); 239 RELOAD_SEG(es, 3);
222 gs |= 3;
223 asm("movl %%gs,%0" : "=r" (oldgs));
224 if (gs != oldgs)
225 load_gs_index(gs);
226 }
227 RELOAD_SEG(fs,3);
228 RELOAD_SEG(ds,3);
229 RELOAD_SEG(es,3);
230 240
231 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); 241 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
232 COPY(dx); COPY(cx); COPY(ip); 242 COPY(dx); COPY(cx); COPY(ip);
233 /* Don't touch extended registers */ 243 /* Don't touch extended registers */
234 244
235 err |= __get_user(regs->cs, &sc->cs); 245 err |= __get_user(regs->cs, &sc->cs);
236 regs->cs |= 3; 246 regs->cs |= 3;
237 err |= __get_user(regs->ss, &sc->ss); 247 err |= __get_user(regs->ss, &sc->ss);
238 regs->ss |= 3; 248 regs->ss |= 3;
239 249
240 { 250 err |= __get_user(tmpflags, &sc->flags);
241 unsigned int tmpflags; 251 regs->flags = (regs->flags & ~0x40DD5) | (tmpflags & 0x40DD5);
242 err |= __get_user(tmpflags, &sc->eflags); 252 /* disable syscall checks */
243 regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5); 253 regs->orig_ax = -1;
244 regs->orig_rax = -1; /* disable syscall checks */ 254
245 } 255 err |= __get_user(tmp, &sc->fpstate);
256 buf = compat_ptr(tmp);
257 if (buf) {
258 if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
259 goto badframe;
260 err |= restore_i387_ia32(buf);
261 } else {
262 struct task_struct *me = current;
246 263
247 { 264 if (used_math()) {
248 u32 tmp; 265 clear_fpu(me);
249 struct _fpstate_ia32 __user * buf; 266 clear_used_math();
250 err |= __get_user(tmp, &sc->fpstate);
251 buf = compat_ptr(tmp);
252 if (buf) {
253 if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
254 goto badframe;
255 err |= restore_i387_ia32(current, buf, 0);
256 } else {
257 struct task_struct *me = current;
258 if (used_math()) {
259 clear_fpu(me);
260 clear_used_math();
261 }
262 } 267 }
263 } 268 }
264 269
265 { 270 err |= __get_user(tmp, &sc->ax);
266 u32 tmp; 271 *peax = tmp;
267 err |= __get_user(tmp, &sc->eax); 272
268 *peax = tmp;
269 }
270 return err; 273 return err;
271 274
272badframe: 275badframe:
@@ -275,15 +278,16 @@ badframe:
275 278
276asmlinkage long sys32_sigreturn(struct pt_regs *regs) 279asmlinkage long sys32_sigreturn(struct pt_regs *regs)
277{ 280{
278 struct sigframe __user *frame = (struct sigframe __user *)(regs->rsp-8); 281 struct sigframe __user *frame = (struct sigframe __user *)(regs->sp-8);
279 sigset_t set; 282 sigset_t set;
280 unsigned int eax; 283 unsigned int ax;
281 284
282 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) 285 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
283 goto badframe; 286 goto badframe;
284 if (__get_user(set.sig[0], &frame->sc.oldmask) 287 if (__get_user(set.sig[0], &frame->sc.oldmask)
285 || (_COMPAT_NSIG_WORDS > 1 288 || (_COMPAT_NSIG_WORDS > 1
286 && __copy_from_user((((char *) &set.sig) + 4), &frame->extramask, 289 && __copy_from_user((((char *) &set.sig) + 4),
290 &frame->extramask,
287 sizeof(frame->extramask)))) 291 sizeof(frame->extramask))))
288 goto badframe; 292 goto badframe;
289 293
@@ -292,24 +296,24 @@ asmlinkage long sys32_sigreturn(struct pt_regs *regs)
292 current->blocked = set; 296 current->blocked = set;
293 recalc_sigpending(); 297 recalc_sigpending();
294 spin_unlock_irq(&current->sighand->siglock); 298 spin_unlock_irq(&current->sighand->siglock);
295 299
296 if (ia32_restore_sigcontext(regs, &frame->sc, &eax)) 300 if (ia32_restore_sigcontext(regs, &frame->sc, &ax))
297 goto badframe; 301 goto badframe;
298 return eax; 302 return ax;
299 303
300badframe: 304badframe:
301 signal_fault(regs, frame, "32bit sigreturn"); 305 signal_fault(regs, frame, "32bit sigreturn");
302 return 0; 306 return 0;
303} 307}
304 308
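A note on the mask handling just above: the classic frame keeps the low 32 bits of the blocked mask in sc.oldmask and any remaining words in extramask[], so rebuilding the native sigset_t amounts to the following (sketch without the __user accessors, to make the 4-byte offset visible):

	sigset_t set;

	set.sig[0] = frame->sc.oldmask;			/* low 32 bits            */
	if (_COMPAT_NSIG_WORDS > 1)			/* remaining 32-bit words */
		memcpy((char *)&set.sig + 4,		/* land 4 bytes into sig  */
		       frame->extramask,
		       sizeof(frame->extramask));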
305asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs) 309asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs)
306{ 310{
307 struct rt_sigframe __user *frame; 311 struct rt_sigframe __user *frame;
308 sigset_t set; 312 sigset_t set;
309 unsigned int eax; 313 unsigned int ax;
310 struct pt_regs tregs; 314 struct pt_regs tregs;
311 315
312 frame = (struct rt_sigframe __user *)(regs->rsp - 4); 316 frame = (struct rt_sigframe __user *)(regs->sp - 4);
313 317
314 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) 318 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
315 goto badframe; 319 goto badframe;
@@ -321,28 +325,28 @@ asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs)
321 current->blocked = set; 325 current->blocked = set;
322 recalc_sigpending(); 326 recalc_sigpending();
323 spin_unlock_irq(&current->sighand->siglock); 327 spin_unlock_irq(&current->sighand->siglock);
324 328
325 if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) 329 if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
326 goto badframe; 330 goto badframe;
327 331
328 tregs = *regs; 332 tregs = *regs;
329 if (sys32_sigaltstack(&frame->uc.uc_stack, NULL, &tregs) == -EFAULT) 333 if (sys32_sigaltstack(&frame->uc.uc_stack, NULL, &tregs) == -EFAULT)
330 goto badframe; 334 goto badframe;
331 335
332 return eax; 336 return ax;
333 337
334badframe: 338badframe:
335 signal_fault(regs,frame,"32bit rt sigreturn"); 339 signal_fault(regs, frame, "32bit rt sigreturn");
336 return 0; 340 return 0;
337} 341}
338 342
339/* 343/*
340 * Set up a signal frame. 344 * Set up a signal frame.
341 */ 345 */
342 346
343static int 347static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc,
344ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, struct _fpstate_ia32 __user *fpstate, 348 struct _fpstate_ia32 __user *fpstate,
345 struct pt_regs *regs, unsigned int mask) 349 struct pt_regs *regs, unsigned int mask)
346{ 350{
347 int tmp, err = 0; 351 int tmp, err = 0;
348 352
@@ -356,26 +360,26 @@ ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, struct _fpstate_ia32 __
356 __asm__("movl %%es,%0" : "=r"(tmp): "0"(tmp)); 360 __asm__("movl %%es,%0" : "=r"(tmp): "0"(tmp));
357 err |= __put_user(tmp, (unsigned int __user *)&sc->es); 361 err |= __put_user(tmp, (unsigned int __user *)&sc->es);
358 362
359 err |= __put_user((u32)regs->rdi, &sc->edi); 363 err |= __put_user((u32)regs->di, &sc->di);
360 err |= __put_user((u32)regs->rsi, &sc->esi); 364 err |= __put_user((u32)regs->si, &sc->si);
361 err |= __put_user((u32)regs->rbp, &sc->ebp); 365 err |= __put_user((u32)regs->bp, &sc->bp);
362 err |= __put_user((u32)regs->rsp, &sc->esp); 366 err |= __put_user((u32)regs->sp, &sc->sp);
363 err |= __put_user((u32)regs->rbx, &sc->ebx); 367 err |= __put_user((u32)regs->bx, &sc->bx);
364 err |= __put_user((u32)regs->rdx, &sc->edx); 368 err |= __put_user((u32)regs->dx, &sc->dx);
365 err |= __put_user((u32)regs->rcx, &sc->ecx); 369 err |= __put_user((u32)regs->cx, &sc->cx);
366 err |= __put_user((u32)regs->rax, &sc->eax); 370 err |= __put_user((u32)regs->ax, &sc->ax);
367 err |= __put_user((u32)regs->cs, &sc->cs); 371 err |= __put_user((u32)regs->cs, &sc->cs);
368 err |= __put_user((u32)regs->ss, &sc->ss); 372 err |= __put_user((u32)regs->ss, &sc->ss);
369 err |= __put_user(current->thread.trap_no, &sc->trapno); 373 err |= __put_user(current->thread.trap_no, &sc->trapno);
370 err |= __put_user(current->thread.error_code, &sc->err); 374 err |= __put_user(current->thread.error_code, &sc->err);
371 err |= __put_user((u32)regs->rip, &sc->eip); 375 err |= __put_user((u32)regs->ip, &sc->ip);
372 err |= __put_user((u32)regs->eflags, &sc->eflags); 376 err |= __put_user((u32)regs->flags, &sc->flags);
373 err |= __put_user((u32)regs->rsp, &sc->esp_at_signal); 377 err |= __put_user((u32)regs->sp, &sc->sp_at_signal);
374 378
375 tmp = save_i387_ia32(current, fpstate, regs, 0); 379 tmp = save_i387_ia32(fpstate);
376 if (tmp < 0) 380 if (tmp < 0)
377 err = -EFAULT; 381 err = -EFAULT;
378 else { 382 else {
379 clear_used_math(); 383 clear_used_math();
380 stts(); 384 stts();
381 err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL), 385 err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL),
@@ -392,40 +396,53 @@ ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, struct _fpstate_ia32 __
392/* 396/*
393 * Determine which stack to use.. 397 * Determine which stack to use..
394 */ 398 */
395static void __user * 399static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
396get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) 400 size_t frame_size)
397{ 401{
398 unsigned long rsp; 402 unsigned long sp;
399 403
400 /* Default to using normal stack */ 404 /* Default to using normal stack */
401 rsp = regs->rsp; 405 sp = regs->sp;
402 406
403 /* This is the X/Open sanctioned signal stack switching. */ 407 /* This is the X/Open sanctioned signal stack switching. */
404 if (ka->sa.sa_flags & SA_ONSTACK) { 408 if (ka->sa.sa_flags & SA_ONSTACK) {
405 if (sas_ss_flags(rsp) == 0) 409 if (sas_ss_flags(sp) == 0)
406 rsp = current->sas_ss_sp + current->sas_ss_size; 410 sp = current->sas_ss_sp + current->sas_ss_size;
407 } 411 }
408 412
409 /* This is the legacy signal stack switching. */ 413 /* This is the legacy signal stack switching. */
410 else if ((regs->ss & 0xffff) != __USER_DS && 414 else if ((regs->ss & 0xffff) != __USER_DS &&
411 !(ka->sa.sa_flags & SA_RESTORER) && 415 !(ka->sa.sa_flags & SA_RESTORER) &&
412 ka->sa.sa_restorer) { 416 ka->sa.sa_restorer)
413 rsp = (unsigned long) ka->sa.sa_restorer; 417 sp = (unsigned long) ka->sa.sa_restorer;
414 }
415 418
416 rsp -= frame_size; 419 sp -= frame_size;
417 /* Align the stack pointer according to the i386 ABI, 420 /* Align the stack pointer according to the i386 ABI,
418 * i.e. so that on function entry ((sp + 4) & 15) == 0. */ 421 * i.e. so that on function entry ((sp + 4) & 15) == 0. */
419 rsp = ((rsp + 4) & -16ul) - 4; 422 sp = ((sp + 4) & -16ul) - 4;
420 return (void __user *) rsp; 423 return (void __user *) sp;
421} 424}
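The alignment expression deserves a worked example: the i386 ABI wants (sp + 4) to be 16-byte aligned at function entry, the 4 accounting for the return address the caller pushes. With assumed values:

	unsigned long sp = 0xffffd9b8;		/* hypothetical user stack pointer */
	sp -= 0x2e0;				/* assumed sizeof(*frame)          */
	sp = ((sp + 4) & -16ul) - 4;		/* sp is now 0xffffd6cc            */
	/* (0xffffd6cc + 4) == 0xffffd6d0, which is a multiple of 16 */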
422 425
423int ia32_setup_frame(int sig, struct k_sigaction *ka, 426int ia32_setup_frame(int sig, struct k_sigaction *ka,
424 compat_sigset_t *set, struct pt_regs * regs) 427 compat_sigset_t *set, struct pt_regs *regs)
425{ 428{
426 struct sigframe __user *frame; 429 struct sigframe __user *frame;
430 void __user *restorer;
427 int err = 0; 431 int err = 0;
428 432
433 /* copy_to_user optimizes that into a single 8 byte store */
434 static const struct {
435 u16 poplmovl;
436 u32 val;
437 u16 int80;
438 u16 pad;
439 } __attribute__((packed)) code = {
440 0xb858, /* popl %eax ; movl $...,%eax */
441 __NR_ia32_sigreturn,
442 0x80cd, /* int $0x80 */
443 0,
444 };
445
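The packed struct above is a byte-exact image of the legacy return stub that later gets copied into frame->retcode; spelled out as raw bytes (sketch; the imm32 is supplied by the 'val' member):

	static const unsigned char retcode_bytes[8] = {
		0x58,			/* popl %eax                            */
		0xb8, 0, 0, 0, 0,	/* movl $__NR_ia32_sigreturn,%eax       */
					/* (imm32 placeholder, filled by 'val') */
		0xcd, 0x80,		/* int  $0x80                           */
	};

0xb858 and 0x80cd appear byte-swapped relative to this listing because the u16 fields are stored little-endian.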
429 frame = get_sigframe(ka, regs, sizeof(*frame)); 446 frame = get_sigframe(ka, regs, sizeof(*frame));
430 447
431 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 448 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
@@ -443,64 +460,53 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
443 if (_COMPAT_NSIG_WORDS > 1) { 460 if (_COMPAT_NSIG_WORDS > 1) {
444 err |= __copy_to_user(frame->extramask, &set->sig[1], 461 err |= __copy_to_user(frame->extramask, &set->sig[1],
445 sizeof(frame->extramask)); 462 sizeof(frame->extramask));
463 if (err)
464 goto give_sigsegv;
446 } 465 }
447 if (err)
448 goto give_sigsegv;
449 466
450 /* Return stub is in 32bit vsyscall page */ 467 if (ka->sa.sa_flags & SA_RESTORER) {
451 { 468 restorer = ka->sa.sa_restorer;
452 void __user *restorer; 469 } else {
470 /* Return stub is in 32bit vsyscall page */
453 if (current->binfmt->hasvdso) 471 if (current->binfmt->hasvdso)
454 restorer = VSYSCALL32_SIGRETURN; 472 restorer = VDSO32_SYMBOL(current->mm->context.vdso,
473 sigreturn);
455 else 474 else
456 restorer = (void *)&frame->retcode; 475 restorer = &frame->retcode;
457 if (ka->sa.sa_flags & SA_RESTORER)
458 restorer = ka->sa.sa_restorer;
459 err |= __put_user(ptr_to_compat(restorer), &frame->pretcode);
460 }
461 /* These are actually not used anymore, but left because some
462 gdb versions depend on them as a marker. */
463 {
464 /* copy_to_user optimizes that into a single 8 byte store */
465 static const struct {
466 u16 poplmovl;
467 u32 val;
468 u16 int80;
469 u16 pad;
470 } __attribute__((packed)) code = {
471 0xb858, /* popl %eax ; movl $...,%eax */
472 __NR_ia32_sigreturn,
473 0x80cd, /* int $0x80 */
474 0,
475 };
476 err |= __copy_to_user(frame->retcode, &code, 8);
477 } 476 }
477 err |= __put_user(ptr_to_compat(restorer), &frame->pretcode);
478
479 /*
480 * These are actually not used anymore, but left because some
481 * gdb versions depend on them as a marker.
482 */
483 err |= __copy_to_user(frame->retcode, &code, 8);
478 if (err) 484 if (err)
479 goto give_sigsegv; 485 goto give_sigsegv;
480 486
481 /* Set up registers for signal handler */ 487 /* Set up registers for signal handler */
482 regs->rsp = (unsigned long) frame; 488 regs->sp = (unsigned long) frame;
483 regs->rip = (unsigned long) ka->sa.sa_handler; 489 regs->ip = (unsigned long) ka->sa.sa_handler;
484 490
485 /* Make -mregparm=3 work */ 491 /* Make -mregparm=3 work */
486 regs->rax = sig; 492 regs->ax = sig;
487 regs->rdx = 0; 493 regs->dx = 0;
488 regs->rcx = 0; 494 regs->cx = 0;
489 495
490 asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); 496 asm volatile("movl %0,%%ds" :: "r" (__USER32_DS));
491 asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); 497 asm volatile("movl %0,%%es" :: "r" (__USER32_DS));
492 498
493 regs->cs = __USER32_CS; 499 regs->cs = __USER32_CS;
494 regs->ss = __USER32_DS; 500 regs->ss = __USER32_DS;
495 501
496 set_fs(USER_DS); 502 set_fs(USER_DS);
497 regs->eflags &= ~TF_MASK; 503 regs->flags &= ~X86_EFLAGS_TF;
498 if (test_thread_flag(TIF_SINGLESTEP)) 504 if (test_thread_flag(TIF_SINGLESTEP))
499 ptrace_notify(SIGTRAP); 505 ptrace_notify(SIGTRAP);
500 506
501#if DEBUG_SIG 507#if DEBUG_SIG
502 printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n", 508 printk(KERN_DEBUG "SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n",
503 current->comm, current->pid, frame, regs->rip, frame->pretcode); 509 current->comm, current->pid, frame, regs->ip, frame->pretcode);
504#endif 510#endif
505 511
506 return 0; 512 return 0;
@@ -511,25 +517,34 @@ give_sigsegv:
511} 517}
512 518
513int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 519int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
514 compat_sigset_t *set, struct pt_regs * regs) 520 compat_sigset_t *set, struct pt_regs *regs)
515{ 521{
516 struct rt_sigframe __user *frame; 522 struct rt_sigframe __user *frame;
523 struct exec_domain *ed = current_thread_info()->exec_domain;
524 void __user *restorer;
517 int err = 0; 525 int err = 0;
518 526
527 /* __copy_to_user optimizes that into a single 8 byte store */
528 static const struct {
529 u8 movl;
530 u32 val;
531 u16 int80;
532 u16 pad;
533 u8 pad2;
534 } __attribute__((packed)) code = {
535 0xb8,
536 __NR_ia32_rt_sigreturn,
537 0x80cd,
538 0,
539 };
540
519 frame = get_sigframe(ka, regs, sizeof(*frame)); 541 frame = get_sigframe(ka, regs, sizeof(*frame));
520 542
521 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 543 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
522 goto give_sigsegv; 544 goto give_sigsegv;
523 545
524 { 546 err |= __put_user((ed && ed->signal_invmap && sig < 32
525 struct exec_domain *ed = current_thread_info()->exec_domain; 547 ? ed->signal_invmap[sig] : sig), &frame->sig);
526 err |= __put_user((ed
527 && ed->signal_invmap
528 && sig < 32
529 ? ed->signal_invmap[sig]
530 : sig),
531 &frame->sig);
532 }
533 err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo); 548 err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo);
534 err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc); 549 err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc);
535 err |= copy_siginfo_to_user32(&frame->info, info); 550 err |= copy_siginfo_to_user32(&frame->info, info);
@@ -540,73 +555,58 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
540 err |= __put_user(0, &frame->uc.uc_flags); 555 err |= __put_user(0, &frame->uc.uc_flags);
541 err |= __put_user(0, &frame->uc.uc_link); 556 err |= __put_user(0, &frame->uc.uc_link);
542 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); 557 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
543 err |= __put_user(sas_ss_flags(regs->rsp), 558 err |= __put_user(sas_ss_flags(regs->sp),
544 &frame->uc.uc_stack.ss_flags); 559 &frame->uc.uc_stack.ss_flags);
545 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); 560 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
546 err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, 561 err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate,
547 regs, set->sig[0]); 562 regs, set->sig[0]);
548 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); 563 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
549 if (err) 564 if (err)
550 goto give_sigsegv; 565 goto give_sigsegv;
551 566
552 567 if (ka->sa.sa_flags & SA_RESTORER)
553 { 568 restorer = ka->sa.sa_restorer;
554 void __user *restorer = VSYSCALL32_RTSIGRETURN; 569 else
555 if (ka->sa.sa_flags & SA_RESTORER) 570 restorer = VDSO32_SYMBOL(current->mm->context.vdso,
556 restorer = ka->sa.sa_restorer; 571 rt_sigreturn);
557 err |= __put_user(ptr_to_compat(restorer), &frame->pretcode); 572 err |= __put_user(ptr_to_compat(restorer), &frame->pretcode);
558 } 573
559 574 /*
560 /* This is movl $,%eax ; int $0x80 */ 575 * Not actually used anymore, but left because some gdb
561 /* Not actually used anymore, but left because some gdb versions 576 * versions need it.
562 need it. */ 577 */
563 { 578 err |= __copy_to_user(frame->retcode, &code, 8);
564 /* __copy_to_user optimizes that into a single 8 byte store */
565 static const struct {
566 u8 movl;
567 u32 val;
568 u16 int80;
569 u16 pad;
570 u8 pad2;
571 } __attribute__((packed)) code = {
572 0xb8,
573 __NR_ia32_rt_sigreturn,
574 0x80cd,
575 0,
576 };
577 err |= __copy_to_user(frame->retcode, &code, 8);
578 }
579 if (err) 579 if (err)
580 goto give_sigsegv; 580 goto give_sigsegv;
581 581
582 /* Set up registers for signal handler */ 582 /* Set up registers for signal handler */
583 regs->rsp = (unsigned long) frame; 583 regs->sp = (unsigned long) frame;
584 regs->rip = (unsigned long) ka->sa.sa_handler; 584 regs->ip = (unsigned long) ka->sa.sa_handler;
585 585
586 /* Make -mregparm=3 work */ 586 /* Make -mregparm=3 work */
587 regs->rax = sig; 587 regs->ax = sig;
588 regs->rdx = (unsigned long) &frame->info; 588 regs->dx = (unsigned long) &frame->info;
589 regs->rcx = (unsigned long) &frame->uc; 589 regs->cx = (unsigned long) &frame->uc;
590 590
591 /* Make -mregparm=3 work */ 591 /* Make -mregparm=3 work */
592 regs->rax = sig; 592 regs->ax = sig;
593 regs->rdx = (unsigned long) &frame->info; 593 regs->dx = (unsigned long) &frame->info;
594 regs->rcx = (unsigned long) &frame->uc; 594 regs->cx = (unsigned long) &frame->uc;
595
596 asm volatile("movl %0,%%ds" :: "r" (__USER32_DS));
597 asm volatile("movl %0,%%es" :: "r" (__USER32_DS));
595 598
596 asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); 599 regs->cs = __USER32_CS;
597 asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); 600 regs->ss = __USER32_DS;
598
599 regs->cs = __USER32_CS;
600 regs->ss = __USER32_DS;
601 601
602 set_fs(USER_DS); 602 set_fs(USER_DS);
603 regs->eflags &= ~TF_MASK; 603 regs->flags &= ~X86_EFLAGS_TF;
604 if (test_thread_flag(TIF_SINGLESTEP)) 604 if (test_thread_flag(TIF_SINGLESTEP))
605 ptrace_notify(SIGTRAP); 605 ptrace_notify(SIGTRAP);
606 606
607#if DEBUG_SIG 607#if DEBUG_SIG
608 printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n", 608 printk(KERN_DEBUG "SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n",
609 current->comm, current->pid, frame, regs->rip, frame->pretcode); 609 current->comm, current->pid, frame, regs->ip, frame->pretcode);
610#endif 610#endif
611 611
612 return 0; 612 return 0;
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index df588f0f76e1..0db0a6291bbd 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -12,7 +12,6 @@
12#include <asm/ia32_unistd.h> 12#include <asm/ia32_unistd.h>
13#include <asm/thread_info.h> 13#include <asm/thread_info.h>
14#include <asm/segment.h> 14#include <asm/segment.h>
15#include <asm/vsyscall32.h>
16#include <asm/irqflags.h> 15#include <asm/irqflags.h>
17#include <linux/linkage.h> 16#include <linux/linkage.h>
18 17
@@ -104,7 +103,7 @@ ENTRY(ia32_sysenter_target)
104 pushfq 103 pushfq
105 CFI_ADJUST_CFA_OFFSET 8 104 CFI_ADJUST_CFA_OFFSET 8
106 /*CFI_REL_OFFSET rflags,0*/ 105 /*CFI_REL_OFFSET rflags,0*/
107 movl $VSYSCALL32_SYSEXIT, %r10d 106 movl 8*3-THREAD_SIZE+threadinfo_sysenter_return(%rsp), %r10d
108 CFI_REGISTER rip,r10 107 CFI_REGISTER rip,r10
109 pushq $__USER32_CS 108 pushq $__USER32_CS
110 CFI_ADJUST_CFA_OFFSET 8 109 CFI_ADJUST_CFA_OFFSET 8
@@ -142,6 +141,8 @@ sysenter_do_call:
142 andl $~TS_COMPAT,threadinfo_status(%r10) 141 andl $~TS_COMPAT,threadinfo_status(%r10)
143 /* clear IF, that popfq doesn't enable interrupts early */ 142 /* clear IF, that popfq doesn't enable interrupts early */
144 andl $~0x200,EFLAGS-R11(%rsp) 143 andl $~0x200,EFLAGS-R11(%rsp)
144 movl RIP-R11(%rsp),%edx /* User %eip */
145 CFI_REGISTER rip,rdx
145 RESTORE_ARGS 1,24,1,1,1,1 146 RESTORE_ARGS 1,24,1,1,1,1
146 popfq 147 popfq
147 CFI_ADJUST_CFA_OFFSET -8 148 CFI_ADJUST_CFA_OFFSET -8
@@ -149,8 +150,6 @@ sysenter_do_call:
149 popq %rcx /* User %esp */ 150 popq %rcx /* User %esp */
150 CFI_ADJUST_CFA_OFFSET -8 151 CFI_ADJUST_CFA_OFFSET -8
151 CFI_REGISTER rsp,rcx 152 CFI_REGISTER rsp,rcx
152 movl $VSYSCALL32_SYSEXIT,%edx /* User %eip */
153 CFI_REGISTER rip,rdx
154 TRACE_IRQS_ON 153 TRACE_IRQS_ON
155 swapgs 154 swapgs
156 sti /* sti only takes effect after the next instruction */ 155 sti /* sti only takes effect after the next instruction */
@@ -644,8 +643,8 @@ ia32_sys_call_table:
644 .quad compat_sys_futex /* 240 */ 643 .quad compat_sys_futex /* 240 */
645 .quad compat_sys_sched_setaffinity 644 .quad compat_sys_sched_setaffinity
646 .quad compat_sys_sched_getaffinity 645 .quad compat_sys_sched_getaffinity
647 .quad sys32_set_thread_area 646 .quad sys_set_thread_area
648 .quad sys32_get_thread_area 647 .quad sys_get_thread_area
649 .quad compat_sys_io_setup /* 245 */ 648 .quad compat_sys_io_setup /* 245 */
650 .quad sys_io_destroy 649 .quad sys_io_destroy
651 .quad compat_sys_io_getevents 650 .quad compat_sys_io_getevents
diff --git a/arch/x86/ia32/ipc32.c b/arch/x86/ia32/ipc32.c
index 7b3342e5aab5..d21991ce606c 100644
--- a/arch/x86/ia32/ipc32.c
+++ b/arch/x86/ia32/ipc32.c
@@ -9,9 +9,8 @@
9#include <linux/ipc.h> 9#include <linux/ipc.h>
10#include <linux/compat.h> 10#include <linux/compat.h>
11 11
12asmlinkage long 12asmlinkage long sys32_ipc(u32 call, int first, int second, int third,
13sys32_ipc(u32 call, int first, int second, int third, 13 compat_uptr_t ptr, u32 fifth)
14 compat_uptr_t ptr, u32 fifth)
15{ 14{
16 int version; 15 int version;
17 16
@@ -19,36 +18,35 @@ sys32_ipc(u32 call, int first, int second, int third,
19 call &= 0xffff; 18 call &= 0xffff;
20 19
21 switch (call) { 20 switch (call) {
22 case SEMOP: 21 case SEMOP:
23 /* struct sembuf is the same on 32 and 64bit :)) */ 22 /* struct sembuf is the same on 32 and 64bit :)) */
24 return sys_semtimedop(first, compat_ptr(ptr), second, NULL); 23 return sys_semtimedop(first, compat_ptr(ptr), second, NULL);
25 case SEMTIMEDOP: 24 case SEMTIMEDOP:
26 return compat_sys_semtimedop(first, compat_ptr(ptr), second, 25 return compat_sys_semtimedop(first, compat_ptr(ptr), second,
27 compat_ptr(fifth)); 26 compat_ptr(fifth));
28 case SEMGET: 27 case SEMGET:
29 return sys_semget(first, second, third); 28 return sys_semget(first, second, third);
30 case SEMCTL: 29 case SEMCTL:
31 return compat_sys_semctl(first, second, third, compat_ptr(ptr)); 30 return compat_sys_semctl(first, second, third, compat_ptr(ptr));
32 31
33 case MSGSND: 32 case MSGSND:
34 return compat_sys_msgsnd(first, second, third, compat_ptr(ptr)); 33 return compat_sys_msgsnd(first, second, third, compat_ptr(ptr));
35 case MSGRCV: 34 case MSGRCV:
36 return compat_sys_msgrcv(first, second, fifth, third, 35 return compat_sys_msgrcv(first, second, fifth, third,
37 version, compat_ptr(ptr)); 36 version, compat_ptr(ptr));
38 case MSGGET: 37 case MSGGET:
39 return sys_msgget((key_t) first, second); 38 return sys_msgget((key_t) first, second);
40 case MSGCTL: 39 case MSGCTL:
41 return compat_sys_msgctl(first, second, compat_ptr(ptr)); 40 return compat_sys_msgctl(first, second, compat_ptr(ptr));
42 41
43 case SHMAT: 42 case SHMAT:
44 return compat_sys_shmat(first, second, third, version, 43 return compat_sys_shmat(first, second, third, version,
45 compat_ptr(ptr)); 44 compat_ptr(ptr));
46 break; 45 case SHMDT:
47 case SHMDT:
48 return sys_shmdt(compat_ptr(ptr)); 46 return sys_shmdt(compat_ptr(ptr));
49 case SHMGET: 47 case SHMGET:
50 return sys_shmget(first, (unsigned)second, third); 48 return sys_shmget(first, (unsigned)second, third);
51 case SHMCTL: 49 case SHMCTL:
52 return compat_sys_shmctl(first, second, compat_ptr(ptr)); 50 return compat_sys_shmctl(first, second, compat_ptr(ptr));
53 } 51 }
54 return -ENOSYS; 52 return -ENOSYS;
diff --git a/arch/x86/ia32/mmap32.c b/arch/x86/ia32/mmap32.c
deleted file mode 100644
index e4b84b4a417a..000000000000
--- a/arch/x86/ia32/mmap32.c
+++ /dev/null
@@ -1,79 +0,0 @@
1/*
2 * linux/arch/x86_64/ia32/mm/mmap.c
3 *
4 * flexible mmap layout support
5 *
6 * Based on the i386 version which was
7 *
8 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
9 * All Rights Reserved.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 *
25 *
26 * Started by Ingo Molnar <mingo@elte.hu>
27 */
28
29#include <linux/personality.h>
30#include <linux/mm.h>
31#include <linux/random.h>
32#include <linux/sched.h>
33
34/*
35 * Top of mmap area (just below the process stack).
36 *
37 * Leave an at least ~128 MB hole.
38 */
39#define MIN_GAP (128*1024*1024)
40#define MAX_GAP (TASK_SIZE/6*5)
41
42static inline unsigned long mmap_base(struct mm_struct *mm)
43{
44 unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
45 unsigned long random_factor = 0;
46
47 if (current->flags & PF_RANDOMIZE)
48 random_factor = get_random_int() % (1024*1024);
49
50 if (gap < MIN_GAP)
51 gap = MIN_GAP;
52 else if (gap > MAX_GAP)
53 gap = MAX_GAP;
54
55 return PAGE_ALIGN(TASK_SIZE - gap - random_factor);
56}
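For the record, the base computation being removed here clamps the stack rlimit into [MIN_GAP, MAX_GAP] and subtracts up to 1 MB of randomization. A worked example with assumed inputs:

	unsigned long gap = 8UL * 1024 * 1024;	/* RLIMIT_STACK soft limit: 8 MB      */
	unsigned long random_factor = 0x7f000;	/* assumed, always below 1 MB         */

	if (gap < MIN_GAP)			/* 8 MB < 128 MB, so the gap is       */
		gap = MIN_GAP;			/* clamped up to the 128 MB minimum   */

	/* result: PAGE_ALIGN(TASK_SIZE - gap - random_factor), i.e. a page-aligned
	 * base roughly 128 MB (plus the random slack) below the top of user space */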
57
58/*
59 * This function, called very early during the creation of a new
60 * process VM image, sets up which VM layout function to use:
61 */
62void ia32_pick_mmap_layout(struct mm_struct *mm)
63{
64 /*
65 * Fall back to the standard layout if the personality
66 * bit is set, or if the expected stack growth is unlimited:
67 */
68 if (sysctl_legacy_va_layout ||
69 (current->personality & ADDR_COMPAT_LAYOUT) ||
70 current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) {
71 mm->mmap_base = TASK_UNMAPPED_BASE;
72 mm->get_unmapped_area = arch_get_unmapped_area;
73 mm->unmap_area = arch_unmap_area;
74 } else {
75 mm->mmap_base = mmap_base(mm);
76 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
77 mm->unmap_area = arch_unmap_area_topdown;
78 }
79}
diff --git a/arch/x86/ia32/ptrace32.c b/arch/x86/ia32/ptrace32.c
deleted file mode 100644
index 4a233ad6269c..000000000000
--- a/arch/x86/ia32/ptrace32.c
+++ /dev/null
@@ -1,404 +0,0 @@
1/*
2 * 32bit ptrace for x86-64.
3 *
4 * Copyright 2001,2002 Andi Kleen, SuSE Labs.
5 * Some parts copied from arch/i386/kernel/ptrace.c. See that file for earlier
6 * copyright.
7 *
8 * This allows to access 64bit processes too; but there is no way to see the extended
9 * register contents.
10 */
11
12#include <linux/kernel.h>
13#include <linux/stddef.h>
14#include <linux/sched.h>
15#include <linux/syscalls.h>
16#include <linux/unistd.h>
17#include <linux/mm.h>
18#include <linux/err.h>
19#include <linux/ptrace.h>
20#include <asm/ptrace.h>
21#include <asm/compat.h>
22#include <asm/uaccess.h>
23#include <asm/user32.h>
24#include <asm/user.h>
25#include <asm/errno.h>
26#include <asm/debugreg.h>
27#include <asm/i387.h>
28#include <asm/fpu32.h>
29#include <asm/ia32.h>
30
31/*
32 * Determines which flags the user has access to [1 = access, 0 = no access].
33 * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), IOPL(12-13), IF(9).
34 * Also masks reserved bits (31-22, 15, 5, 3, 1).
35 */
36#define FLAG_MASK 0x54dd5UL
37
38#define R32(l,q) \
39 case offsetof(struct user32, regs.l): stack[offsetof(struct pt_regs, q)/8] = val; break
40
41static int putreg32(struct task_struct *child, unsigned regno, u32 val)
42{
43 int i;
44 __u64 *stack = (__u64 *)task_pt_regs(child);
45
46 switch (regno) {
47 case offsetof(struct user32, regs.fs):
48 if (val && (val & 3) != 3) return -EIO;
49 child->thread.fsindex = val & 0xffff;
50 break;
51 case offsetof(struct user32, regs.gs):
52 if (val && (val & 3) != 3) return -EIO;
53 child->thread.gsindex = val & 0xffff;
54 break;
55 case offsetof(struct user32, regs.ds):
56 if (val && (val & 3) != 3) return -EIO;
57 child->thread.ds = val & 0xffff;
58 break;
59 case offsetof(struct user32, regs.es):
60 child->thread.es = val & 0xffff;
61 break;
62 case offsetof(struct user32, regs.ss):
63 if ((val & 3) != 3) return -EIO;
64 stack[offsetof(struct pt_regs, ss)/8] = val & 0xffff;
65 break;
66 case offsetof(struct user32, regs.cs):
67 if ((val & 3) != 3) return -EIO;
68 stack[offsetof(struct pt_regs, cs)/8] = val & 0xffff;
69 break;
70
71 R32(ebx, rbx);
72 R32(ecx, rcx);
73 R32(edx, rdx);
74 R32(edi, rdi);
75 R32(esi, rsi);
76 R32(ebp, rbp);
77 R32(eax, rax);
78 R32(orig_eax, orig_rax);
79 R32(eip, rip);
80 R32(esp, rsp);
81
82 case offsetof(struct user32, regs.eflags): {
83 __u64 *flags = &stack[offsetof(struct pt_regs, eflags)/8];
84 val &= FLAG_MASK;
85 *flags = val | (*flags & ~FLAG_MASK);
86 break;
87 }
88
89 case offsetof(struct user32, u_debugreg[4]):
90 case offsetof(struct user32, u_debugreg[5]):
91 return -EIO;
92
93 case offsetof(struct user32, u_debugreg[0]):
94 child->thread.debugreg0 = val;
95 break;
96
97 case offsetof(struct user32, u_debugreg[1]):
98 child->thread.debugreg1 = val;
99 break;
100
101 case offsetof(struct user32, u_debugreg[2]):
102 child->thread.debugreg2 = val;
103 break;
104
105 case offsetof(struct user32, u_debugreg[3]):
106 child->thread.debugreg3 = val;
107 break;
108
109 case offsetof(struct user32, u_debugreg[6]):
110 child->thread.debugreg6 = val;
111 break;
112
113 case offsetof(struct user32, u_debugreg[7]):
114 val &= ~DR_CONTROL_RESERVED;
115 /* See arch/i386/kernel/ptrace.c for an explanation of
116 * this awkward check.*/
117 for(i=0; i<4; i++)
118 if ((0x5454 >> ((val >> (16 + 4*i)) & 0xf)) & 1)
119 return -EIO;
120 child->thread.debugreg7 = val;
121 if (val)
122 set_tsk_thread_flag(child, TIF_DEBUG);
123 else
124 clear_tsk_thread_flag(child, TIF_DEBUG);
125 break;
126
127 default:
128 if (regno > sizeof(struct user32) || (regno & 3))
129 return -EIO;
130
131 /* Other dummy fields in the virtual user structure are ignored */
132 break;
133 }
134 return 0;
135}
136
137#undef R32
138
139#define R32(l,q) \
140 case offsetof(struct user32, regs.l): *val = stack[offsetof(struct pt_regs, q)/8]; break
141
142static int getreg32(struct task_struct *child, unsigned regno, u32 *val)
143{
144 __u64 *stack = (__u64 *)task_pt_regs(child);
145
146 switch (regno) {
147 case offsetof(struct user32, regs.fs):
148 *val = child->thread.fsindex;
149 break;
150 case offsetof(struct user32, regs.gs):
151 *val = child->thread.gsindex;
152 break;
153 case offsetof(struct user32, regs.ds):
154 *val = child->thread.ds;
155 break;
156 case offsetof(struct user32, regs.es):
157 *val = child->thread.es;
158 break;
159
160 R32(cs, cs);
161 R32(ss, ss);
162 R32(ebx, rbx);
163 R32(ecx, rcx);
164 R32(edx, rdx);
165 R32(edi, rdi);
166 R32(esi, rsi);
167 R32(ebp, rbp);
168 R32(eax, rax);
169 R32(orig_eax, orig_rax);
170 R32(eip, rip);
171 R32(eflags, eflags);
172 R32(esp, rsp);
173
174 case offsetof(struct user32, u_debugreg[0]):
175 *val = child->thread.debugreg0;
176 break;
177 case offsetof(struct user32, u_debugreg[1]):
178 *val = child->thread.debugreg1;
179 break;
180 case offsetof(struct user32, u_debugreg[2]):
181 *val = child->thread.debugreg2;
182 break;
183 case offsetof(struct user32, u_debugreg[3]):
184 *val = child->thread.debugreg3;
185 break;
186 case offsetof(struct user32, u_debugreg[6]):
187 *val = child->thread.debugreg6;
188 break;
189 case offsetof(struct user32, u_debugreg[7]):
190 *val = child->thread.debugreg7;
191 break;
192
193 default:
194 if (regno > sizeof(struct user32) || (regno & 3))
195 return -EIO;
196
197 /* Other dummy fields in the virtual user structure are ignored */
198 *val = 0;
199 break;
200 }
201 return 0;
202}
203
204#undef R32
205
206static long ptrace32_siginfo(unsigned request, u32 pid, u32 addr, u32 data)
207{
208 int ret;
209 compat_siginfo_t __user *si32 = compat_ptr(data);
210 siginfo_t ssi;
211 siginfo_t __user *si = compat_alloc_user_space(sizeof(siginfo_t));
212 if (request == PTRACE_SETSIGINFO) {
213 memset(&ssi, 0, sizeof(siginfo_t));
214 ret = copy_siginfo_from_user32(&ssi, si32);
215 if (ret)
216 return ret;
217 if (copy_to_user(si, &ssi, sizeof(siginfo_t)))
218 return -EFAULT;
219 }
220 ret = sys_ptrace(request, pid, addr, (unsigned long)si);
221 if (ret)
222 return ret;
223 if (request == PTRACE_GETSIGINFO) {
224 if (copy_from_user(&ssi, si, sizeof(siginfo_t)))
225 return -EFAULT;
226 ret = copy_siginfo_to_user32(si32, &ssi);
227 }
228 return ret;
229}
230
231asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
232{
233 struct task_struct *child;
234 struct pt_regs *childregs;
235 void __user *datap = compat_ptr(data);
236 int ret;
237 __u32 val;
238
239 switch (request) {
240 case PTRACE_TRACEME:
241 case PTRACE_ATTACH:
242 case PTRACE_KILL:
243 case PTRACE_CONT:
244 case PTRACE_SINGLESTEP:
245 case PTRACE_DETACH:
246 case PTRACE_SYSCALL:
247 case PTRACE_OLDSETOPTIONS:
248 case PTRACE_SETOPTIONS:
249 case PTRACE_SET_THREAD_AREA:
250 case PTRACE_GET_THREAD_AREA:
251 return sys_ptrace(request, pid, addr, data);
252
253 default:
254 return -EINVAL;
255
256 case PTRACE_PEEKTEXT:
257 case PTRACE_PEEKDATA:
258 case PTRACE_POKEDATA:
259 case PTRACE_POKETEXT:
260 case PTRACE_POKEUSR:
261 case PTRACE_PEEKUSR:
262 case PTRACE_GETREGS:
263 case PTRACE_SETREGS:
264 case PTRACE_SETFPREGS:
265 case PTRACE_GETFPREGS:
266 case PTRACE_SETFPXREGS:
267 case PTRACE_GETFPXREGS:
268 case PTRACE_GETEVENTMSG:
269 break;
270
271 case PTRACE_SETSIGINFO:
272 case PTRACE_GETSIGINFO:
273 return ptrace32_siginfo(request, pid, addr, data);
274 }
275
276 child = ptrace_get_task_struct(pid);
277 if (IS_ERR(child))
278 return PTR_ERR(child);
279
280 ret = ptrace_check_attach(child, request == PTRACE_KILL);
281 if (ret < 0)
282 goto out;
283
284 childregs = task_pt_regs(child);
285
286 switch (request) {
287 case PTRACE_PEEKDATA:
288 case PTRACE_PEEKTEXT:
289 ret = 0;
290 if (access_process_vm(child, addr, &val, sizeof(u32), 0)!=sizeof(u32))
291 ret = -EIO;
292 else
293 ret = put_user(val, (unsigned int __user *)datap);
294 break;
295
296 case PTRACE_POKEDATA:
297 case PTRACE_POKETEXT:
298 ret = 0;
299 if (access_process_vm(child, addr, &data, sizeof(u32), 1)!=sizeof(u32))
300 ret = -EIO;
301 break;
302
303 case PTRACE_PEEKUSR:
304 ret = getreg32(child, addr, &val);
305 if (ret == 0)
306 ret = put_user(val, (__u32 __user *)datap);
307 break;
308
309 case PTRACE_POKEUSR:
310 ret = putreg32(child, addr, data);
311 break;
312
313 case PTRACE_GETREGS: { /* Get all gp regs from the child. */
314 int i;
315 if (!access_ok(VERIFY_WRITE, datap, 16*4)) {
316 ret = -EIO;
317 break;
318 }
319 ret = 0;
320 for ( i = 0; i <= 16*4 ; i += sizeof(__u32) ) {
321 getreg32(child, i, &val);
322 ret |= __put_user(val,(u32 __user *)datap);
323 datap += sizeof(u32);
324 }
325 break;
326 }
327
328 case PTRACE_SETREGS: { /* Set all gp regs in the child. */
329 unsigned long tmp;
330 int i;
331 if (!access_ok(VERIFY_READ, datap, 16*4)) {
332 ret = -EIO;
333 break;
334 }
335 ret = 0;
336 for ( i = 0; i <= 16*4; i += sizeof(u32) ) {
337 ret |= __get_user(tmp, (u32 __user *)datap);
338 putreg32(child, i, tmp);
339 datap += sizeof(u32);
340 }
341 break;
342 }
343
344 case PTRACE_GETFPREGS:
345 ret = -EIO;
346 if (!access_ok(VERIFY_READ, compat_ptr(data),
347 sizeof(struct user_i387_struct)))
348 break;
349 save_i387_ia32(child, datap, childregs, 1);
350 ret = 0;
351 break;
352
353 case PTRACE_SETFPREGS:
354 ret = -EIO;
355 if (!access_ok(VERIFY_WRITE, datap,
356 sizeof(struct user_i387_struct)))
357 break;
358 ret = 0;
359 /* don't check EFAULT to be bug-to-bug compatible to i386 */
360 restore_i387_ia32(child, datap, 1);
361 break;
362
363 case PTRACE_GETFPXREGS: {
364 struct user32_fxsr_struct __user *u = datap;
365 init_fpu(child);
366 ret = -EIO;
367 if (!access_ok(VERIFY_WRITE, u, sizeof(*u)))
368 break;
369 ret = -EFAULT;
370 if (__copy_to_user(u, &child->thread.i387.fxsave, sizeof(*u)))
371 break;
372 ret = __put_user(childregs->cs, &u->fcs);
373 ret |= __put_user(child->thread.ds, &u->fos);
374 break;
375 }
376 case PTRACE_SETFPXREGS: {
377 struct user32_fxsr_struct __user *u = datap;
378 unlazy_fpu(child);
379 ret = -EIO;
380 if (!access_ok(VERIFY_READ, u, sizeof(*u)))
381 break;
382 /* no checking to be bug-to-bug compatible with i386. */
383 /* but silence warning */
384 if (__copy_from_user(&child->thread.i387.fxsave, u, sizeof(*u)))
385 ;
386 set_stopped_child_used_math(child);
387 child->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
388 ret = 0;
389 break;
390 }
391
392 case PTRACE_GETEVENTMSG:
393 ret = put_user(child->ptrace_message,(unsigned int __user *)compat_ptr(data));
394 break;
395
396 default:
397 BUG();
398 }
399
400 out:
401 put_task_struct(child);
402 return ret;
403}
404
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index bee96d614432..abf71d26fc2a 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -1,29 +1,29 @@
1/* 1/*
2 * sys_ia32.c: Conversion between 32bit and 64bit native syscalls. Based on 2 * sys_ia32.c: Conversion between 32bit and 64bit native syscalls. Based on
3 * sys_sparc32 3 * sys_sparc32
4 * 4 *
5 * Copyright (C) 2000 VA Linux Co 5 * Copyright (C) 2000 VA Linux Co
6 * Copyright (C) 2000 Don Dugger <n0ano@valinux.com> 6 * Copyright (C) 2000 Don Dugger <n0ano@valinux.com>
7 * Copyright (C) 1999 Arun Sharma <arun.sharma@intel.com> 7 * Copyright (C) 1999 Arun Sharma <arun.sharma@intel.com>
8 * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz) 8 * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
9 * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) 9 * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
10 * Copyright (C) 2000 Hewlett-Packard Co. 10 * Copyright (C) 2000 Hewlett-Packard Co.
11 * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com> 11 * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
12 * Copyright (C) 2000,2001,2002 Andi Kleen, SuSE Labs (x86-64 port) 12 * Copyright (C) 2000,2001,2002 Andi Kleen, SuSE Labs (x86-64 port)
13 * 13 *
14 * These routines maintain argument size conversion between 32bit and 64bit 14 * These routines maintain argument size conversion between 32bit and 64bit
15 * environment. In 2.5 most of this should be moved to a generic directory. 15 * environment. In 2.5 most of this should be moved to a generic directory.
16 * 16 *
17 * This file assumes that there is a hole at the end of user address space. 17 * This file assumes that there is a hole at the end of user address space.
18 * 18 *
19 * Some of the functions are LE specific currently. These are hopefully all marked. 19 * Some of the functions are LE specific currently. These are
20 * This should be fixed. 20 * hopefully all marked. This should be fixed.
21 */ 21 */
22 22
23#include <linux/kernel.h> 23#include <linux/kernel.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/file.h> 26#include <linux/file.h>
27#include <linux/signal.h> 27#include <linux/signal.h>
28#include <linux/syscalls.h> 28#include <linux/syscalls.h>
29#include <linux/resource.h> 29#include <linux/resource.h>
@@ -90,43 +90,44 @@ int cp_compat_stat(struct kstat *kbuf, struct compat_stat __user *ubuf)
90 if (sizeof(ino) < sizeof(kbuf->ino) && ino != kbuf->ino) 90 if (sizeof(ino) < sizeof(kbuf->ino) && ino != kbuf->ino)
91 return -EOVERFLOW; 91 return -EOVERFLOW;
92 if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct compat_stat)) || 92 if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct compat_stat)) ||
93 __put_user (old_encode_dev(kbuf->dev), &ubuf->st_dev) || 93 __put_user(old_encode_dev(kbuf->dev), &ubuf->st_dev) ||
94 __put_user (ino, &ubuf->st_ino) || 94 __put_user(ino, &ubuf->st_ino) ||
95 __put_user (kbuf->mode, &ubuf->st_mode) || 95 __put_user(kbuf->mode, &ubuf->st_mode) ||
96 __put_user (kbuf->nlink, &ubuf->st_nlink) || 96 __put_user(kbuf->nlink, &ubuf->st_nlink) ||
97 __put_user (uid, &ubuf->st_uid) || 97 __put_user(uid, &ubuf->st_uid) ||
98 __put_user (gid, &ubuf->st_gid) || 98 __put_user(gid, &ubuf->st_gid) ||
99 __put_user (old_encode_dev(kbuf->rdev), &ubuf->st_rdev) || 99 __put_user(old_encode_dev(kbuf->rdev), &ubuf->st_rdev) ||
100 __put_user (kbuf->size, &ubuf->st_size) || 100 __put_user(kbuf->size, &ubuf->st_size) ||
101 __put_user (kbuf->atime.tv_sec, &ubuf->st_atime) || 101 __put_user(kbuf->atime.tv_sec, &ubuf->st_atime) ||
102 __put_user (kbuf->atime.tv_nsec, &ubuf->st_atime_nsec) || 102 __put_user(kbuf->atime.tv_nsec, &ubuf->st_atime_nsec) ||
103 __put_user (kbuf->mtime.tv_sec, &ubuf->st_mtime) || 103 __put_user(kbuf->mtime.tv_sec, &ubuf->st_mtime) ||
104 __put_user (kbuf->mtime.tv_nsec, &ubuf->st_mtime_nsec) || 104 __put_user(kbuf->mtime.tv_nsec, &ubuf->st_mtime_nsec) ||
105 __put_user (kbuf->ctime.tv_sec, &ubuf->st_ctime) || 105 __put_user(kbuf->ctime.tv_sec, &ubuf->st_ctime) ||
106 __put_user (kbuf->ctime.tv_nsec, &ubuf->st_ctime_nsec) || 106 __put_user(kbuf->ctime.tv_nsec, &ubuf->st_ctime_nsec) ||
107 __put_user (kbuf->blksize, &ubuf->st_blksize) || 107 __put_user(kbuf->blksize, &ubuf->st_blksize) ||
108 __put_user (kbuf->blocks, &ubuf->st_blocks)) 108 __put_user(kbuf->blocks, &ubuf->st_blocks))
109 return -EFAULT; 109 return -EFAULT;
110 return 0; 110 return 0;
111} 111}
112 112
113asmlinkage long 113asmlinkage long sys32_truncate64(char __user *filename,
114sys32_truncate64(char __user * filename, unsigned long offset_low, unsigned long offset_high) 114 unsigned long offset_low,
115 unsigned long offset_high)
115{ 116{
116 return sys_truncate(filename, ((loff_t) offset_high << 32) | offset_low); 117 return sys_truncate(filename, ((loff_t) offset_high << 32) | offset_low);
117} 118}
118 119
119asmlinkage long 120asmlinkage long sys32_ftruncate64(unsigned int fd, unsigned long offset_low,
120sys32_ftruncate64(unsigned int fd, unsigned long offset_low, unsigned long offset_high) 121 unsigned long offset_high)
121{ 122{
122 return sys_ftruncate(fd, ((loff_t) offset_high << 32) | offset_low); 123 return sys_ftruncate(fd, ((loff_t) offset_high << 32) | offset_low);
123} 124}
124 125
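Both truncate thunks above rebuild the 64-bit offset from the two 32-bit halves that the i386 ABI passes separately. A worked example with assumed inputs:

	unsigned long offset_low  = 0x00001000;		/* low half  */
	unsigned long offset_high = 0x00000002;		/* high half */
	loff_t off = ((loff_t) offset_high << 32) | offset_low;
	/* off == 0x200001000, i.e. 8 GiB + 4 KiB */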
125/* Another set for IA32/LFS -- x86_64 struct stat is different due to 126/*
126 support for 64bit inode numbers. */ 127 * Another set for IA32/LFS -- x86_64 struct stat is different due to
127 128 * support for 64bit inode numbers.
128static int 129 */
129cp_stat64(struct stat64 __user *ubuf, struct kstat *stat) 130static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat)
130{ 131{
131 typeof(ubuf->st_uid) uid = 0; 132 typeof(ubuf->st_uid) uid = 0;
132 typeof(ubuf->st_gid) gid = 0; 133 typeof(ubuf->st_gid) gid = 0;
@@ -134,38 +135,39 @@ cp_stat64(struct stat64 __user *ubuf, struct kstat *stat)
134 SET_GID(gid, stat->gid); 135 SET_GID(gid, stat->gid);
135 if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct stat64)) || 136 if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct stat64)) ||
136 __put_user(huge_encode_dev(stat->dev), &ubuf->st_dev) || 137 __put_user(huge_encode_dev(stat->dev), &ubuf->st_dev) ||
137 __put_user (stat->ino, &ubuf->__st_ino) || 138 __put_user(stat->ino, &ubuf->__st_ino) ||
138 __put_user (stat->ino, &ubuf->st_ino) || 139 __put_user(stat->ino, &ubuf->st_ino) ||
139 __put_user (stat->mode, &ubuf->st_mode) || 140 __put_user(stat->mode, &ubuf->st_mode) ||
140 __put_user (stat->nlink, &ubuf->st_nlink) || 141 __put_user(stat->nlink, &ubuf->st_nlink) ||
141 __put_user (uid, &ubuf->st_uid) || 142 __put_user(uid, &ubuf->st_uid) ||
142 __put_user (gid, &ubuf->st_gid) || 143 __put_user(gid, &ubuf->st_gid) ||
143 __put_user (huge_encode_dev(stat->rdev), &ubuf->st_rdev) || 144 __put_user(huge_encode_dev(stat->rdev), &ubuf->st_rdev) ||
144 __put_user (stat->size, &ubuf->st_size) || 145 __put_user(stat->size, &ubuf->st_size) ||
145 __put_user (stat->atime.tv_sec, &ubuf->st_atime) || 146 __put_user(stat->atime.tv_sec, &ubuf->st_atime) ||
146 __put_user (stat->atime.tv_nsec, &ubuf->st_atime_nsec) || 147 __put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec) ||
147 __put_user (stat->mtime.tv_sec, &ubuf->st_mtime) || 148 __put_user(stat->mtime.tv_sec, &ubuf->st_mtime) ||
148 __put_user (stat->mtime.tv_nsec, &ubuf->st_mtime_nsec) || 149 __put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec) ||
149 __put_user (stat->ctime.tv_sec, &ubuf->st_ctime) || 150 __put_user(stat->ctime.tv_sec, &ubuf->st_ctime) ||
150 __put_user (stat->ctime.tv_nsec, &ubuf->st_ctime_nsec) || 151 __put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec) ||
151 __put_user (stat->blksize, &ubuf->st_blksize) || 152 __put_user(stat->blksize, &ubuf->st_blksize) ||
152 __put_user (stat->blocks, &ubuf->st_blocks)) 153 __put_user(stat->blocks, &ubuf->st_blocks))
153 return -EFAULT; 154 return -EFAULT;
154 return 0; 155 return 0;
155} 156}
156 157
157asmlinkage long 158asmlinkage long sys32_stat64(char __user *filename,
158sys32_stat64(char __user * filename, struct stat64 __user *statbuf) 159 struct stat64 __user *statbuf)
159{ 160{
160 struct kstat stat; 161 struct kstat stat;
161 int ret = vfs_stat(filename, &stat); 162 int ret = vfs_stat(filename, &stat);
163
162 if (!ret) 164 if (!ret)
163 ret = cp_stat64(statbuf, &stat); 165 ret = cp_stat64(statbuf, &stat);
164 return ret; 166 return ret;
165} 167}
166 168
167asmlinkage long 169asmlinkage long sys32_lstat64(char __user *filename,
168sys32_lstat64(char __user * filename, struct stat64 __user *statbuf) 170 struct stat64 __user *statbuf)
169{ 171{
170 struct kstat stat; 172 struct kstat stat;
171 int ret = vfs_lstat(filename, &stat); 173 int ret = vfs_lstat(filename, &stat);
@@ -174,8 +176,7 @@ sys32_lstat64(char __user * filename, struct stat64 __user *statbuf)
174 return ret; 176 return ret;
175} 177}
176 178
177asmlinkage long 179asmlinkage long sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf)
178sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf)
179{ 180{
180 struct kstat stat; 181 struct kstat stat;
181 int ret = vfs_fstat(fd, &stat); 182 int ret = vfs_fstat(fd, &stat);
@@ -184,9 +185,8 @@ sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf)
184 return ret; 185 return ret;
185} 186}
186 187
187asmlinkage long 188asmlinkage long sys32_fstatat(unsigned int dfd, char __user *filename,
188sys32_fstatat(unsigned int dfd, char __user *filename, 189 struct stat64 __user *statbuf, int flag)
189 struct stat64 __user* statbuf, int flag)
190{ 190{
191 struct kstat stat; 191 struct kstat stat;
192 int error = -EINVAL; 192 int error = -EINVAL;
@@ -221,8 +221,7 @@ struct mmap_arg_struct {
221 unsigned int offset; 221 unsigned int offset;
222}; 222};
223 223
224asmlinkage long 224asmlinkage long sys32_mmap(struct mmap_arg_struct __user *arg)
225sys32_mmap(struct mmap_arg_struct __user *arg)
226{ 225{
227 struct mmap_arg_struct a; 226 struct mmap_arg_struct a;
228 struct file *file = NULL; 227 struct file *file = NULL;
@@ -233,33 +232,33 @@ sys32_mmap(struct mmap_arg_struct __user *arg)
233 return -EFAULT; 232 return -EFAULT;
234 233
235 if (a.offset & ~PAGE_MASK) 234 if (a.offset & ~PAGE_MASK)
236 return -EINVAL; 235 return -EINVAL;
237 236
238 if (!(a.flags & MAP_ANONYMOUS)) { 237 if (!(a.flags & MAP_ANONYMOUS)) {
239 file = fget(a.fd); 238 file = fget(a.fd);
240 if (!file) 239 if (!file)
241 return -EBADF; 240 return -EBADF;
242 } 241 }
243 242
244 mm = current->mm; 243 mm = current->mm;
245 down_write(&mm->mmap_sem); 244 down_write(&mm->mmap_sem);
246 retval = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags, a.offset>>PAGE_SHIFT); 245 retval = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags,
246 a.offset>>PAGE_SHIFT);
247 if (file) 247 if (file)
248 fput(file); 248 fput(file);
249 249
250 up_write(&mm->mmap_sem); 250 up_write(&mm->mmap_sem);
251 251
252 return retval; 252 return retval;
253} 253}
254 254
255asmlinkage long 255asmlinkage long sys32_mprotect(unsigned long start, size_t len,
256sys32_mprotect(unsigned long start, size_t len, unsigned long prot) 256 unsigned long prot)
257{ 257{
258 return sys_mprotect(start,len,prot); 258 return sys_mprotect(start, len, prot);
259} 259}
260 260
261asmlinkage long 261asmlinkage long sys32_pipe(int __user *fd)
262sys32_pipe(int __user *fd)
263{ 262{
264 int retval; 263 int retval;
265 int fds[2]; 264 int fds[2];
@@ -269,13 +268,13 @@ sys32_pipe(int __user *fd)
269 goto out; 268 goto out;
270 if (copy_to_user(fd, fds, sizeof(fds))) 269 if (copy_to_user(fd, fds, sizeof(fds)))
271 retval = -EFAULT; 270 retval = -EFAULT;
272 out: 271out:
273 return retval; 272 return retval;
274} 273}
275 274
276asmlinkage long 275asmlinkage long sys32_rt_sigaction(int sig, struct sigaction32 __user *act,
277sys32_rt_sigaction(int sig, struct sigaction32 __user *act, 276 struct sigaction32 __user *oact,
278 struct sigaction32 __user *oact, unsigned int sigsetsize) 277 unsigned int sigsetsize)
279{ 278{
280 struct k_sigaction new_ka, old_ka; 279 struct k_sigaction new_ka, old_ka;
281 int ret; 280 int ret;
@@ -291,12 +290,17 @@ sys32_rt_sigaction(int sig, struct sigaction32 __user *act,
291 if (!access_ok(VERIFY_READ, act, sizeof(*act)) || 290 if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
292 __get_user(handler, &act->sa_handler) || 291 __get_user(handler, &act->sa_handler) ||
293 __get_user(new_ka.sa.sa_flags, &act->sa_flags) || 292 __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
294 __get_user(restorer, &act->sa_restorer)|| 293 __get_user(restorer, &act->sa_restorer) ||
295 __copy_from_user(&set32, &act->sa_mask, sizeof(compat_sigset_t))) 294 __copy_from_user(&set32, &act->sa_mask,
295 sizeof(compat_sigset_t)))
296 return -EFAULT; 296 return -EFAULT;
297 new_ka.sa.sa_handler = compat_ptr(handler); 297 new_ka.sa.sa_handler = compat_ptr(handler);
298 new_ka.sa.sa_restorer = compat_ptr(restorer); 298 new_ka.sa.sa_restorer = compat_ptr(restorer);
299 /* FIXME: here we rely on _COMPAT_NSIG_WORS to be >= than _NSIG_WORDS << 1 */ 299
300 /*
 301 * FIXME: here we rely on _COMPAT_NSIG_WORDS to be >=
 302 * _NSIG_WORDS << 1
303 */
300 switch (_NSIG_WORDS) { 304 switch (_NSIG_WORDS) {
301 case 4: new_ka.sa.sa_mask.sig[3] = set32.sig[6] 305 case 4: new_ka.sa.sa_mask.sig[3] = set32.sig[6]
302 | (((long)set32.sig[7]) << 32); 306 | (((long)set32.sig[7]) << 32);
@@ -312,7 +316,10 @@ sys32_rt_sigaction(int sig, struct sigaction32 __user *act,
312 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); 316 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
313 317
314 if (!ret && oact) { 318 if (!ret && oact) {
315 /* FIXME: here we rely on _COMPAT_NSIG_WORS to be >= than _NSIG_WORDS << 1 */ 319 /*
 320 * FIXME: here we rely on _COMPAT_NSIG_WORDS to be >=
 321 * _NSIG_WORDS << 1
322 */
316 switch (_NSIG_WORDS) { 323 switch (_NSIG_WORDS) {
317 case 4: 324 case 4:
318 set32.sig[7] = (old_ka.sa.sa_mask.sig[3] >> 32); 325 set32.sig[7] = (old_ka.sa.sa_mask.sig[3] >> 32);
@@ -328,23 +335,26 @@ sys32_rt_sigaction(int sig, struct sigaction32 __user *act,
328 set32.sig[0] = old_ka.sa.sa_mask.sig[0]; 335 set32.sig[0] = old_ka.sa.sa_mask.sig[0];
329 } 336 }
330 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || 337 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
331 __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) || 338 __put_user(ptr_to_compat(old_ka.sa.sa_handler),
332 __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer) || 339 &oact->sa_handler) ||
340 __put_user(ptr_to_compat(old_ka.sa.sa_restorer),
341 &oact->sa_restorer) ||
333 __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || 342 __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
334 __copy_to_user(&oact->sa_mask, &set32, sizeof(compat_sigset_t))) 343 __copy_to_user(&oact->sa_mask, &set32,
344 sizeof(compat_sigset_t)))
335 return -EFAULT; 345 return -EFAULT;
336 } 346 }
337 347
338 return ret; 348 return ret;
339} 349}
340 350
341asmlinkage long 351asmlinkage long sys32_sigaction(int sig, struct old_sigaction32 __user *act,
342sys32_sigaction (int sig, struct old_sigaction32 __user *act, struct old_sigaction32 __user *oact) 352 struct old_sigaction32 __user *oact)
343{ 353{
344 struct k_sigaction new_ka, old_ka; 354 struct k_sigaction new_ka, old_ka;
345 int ret; 355 int ret;
346 356
347 if (act) { 357 if (act) {
348 compat_old_sigset_t mask; 358 compat_old_sigset_t mask;
349 compat_uptr_t handler, restorer; 359 compat_uptr_t handler, restorer;
350 360
@@ -359,33 +369,35 @@ sys32_sigaction (int sig, struct old_sigaction32 __user *act, struct old_sigacti
359 new_ka.sa.sa_restorer = compat_ptr(restorer); 369 new_ka.sa.sa_restorer = compat_ptr(restorer);
360 370
361 siginitset(&new_ka.sa.sa_mask, mask); 371 siginitset(&new_ka.sa.sa_mask, mask);
362 } 372 }
363 373
364 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); 374 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
365 375
366 if (!ret && oact) { 376 if (!ret && oact) {
367 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || 377 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
368 __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) || 378 __put_user(ptr_to_compat(old_ka.sa.sa_handler),
369 __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer) || 379 &oact->sa_handler) ||
380 __put_user(ptr_to_compat(old_ka.sa.sa_restorer),
381 &oact->sa_restorer) ||
370 __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || 382 __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
371 __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) 383 __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
372 return -EFAULT; 384 return -EFAULT;
373 } 385 }
374 386
375 return ret; 387 return ret;
376} 388}
377 389
378asmlinkage long 390asmlinkage long sys32_rt_sigprocmask(int how, compat_sigset_t __user *set,
379sys32_rt_sigprocmask(int how, compat_sigset_t __user *set, 391 compat_sigset_t __user *oset,
380 compat_sigset_t __user *oset, unsigned int sigsetsize) 392 unsigned int sigsetsize)
381{ 393{
382 sigset_t s; 394 sigset_t s;
383 compat_sigset_t s32; 395 compat_sigset_t s32;
384 int ret; 396 int ret;
385 mm_segment_t old_fs = get_fs(); 397 mm_segment_t old_fs = get_fs();
386 398
387 if (set) { 399 if (set) {
388 if (copy_from_user (&s32, set, sizeof(compat_sigset_t))) 400 if (copy_from_user(&s32, set, sizeof(compat_sigset_t)))
389 return -EFAULT; 401 return -EFAULT;
390 switch (_NSIG_WORDS) { 402 switch (_NSIG_WORDS) {
391 case 4: s.sig[3] = s32.sig[6] | (((long)s32.sig[7]) << 32); 403 case 4: s.sig[3] = s32.sig[6] | (((long)s32.sig[7]) << 32);
@@ -394,13 +406,14 @@ sys32_rt_sigprocmask(int how, compat_sigset_t __user *set,
394 case 1: s.sig[0] = s32.sig[0] | (((long)s32.sig[1]) << 32); 406 case 1: s.sig[0] = s32.sig[0] | (((long)s32.sig[1]) << 32);
395 } 407 }
396 } 408 }
397 set_fs (KERNEL_DS); 409 set_fs(KERNEL_DS);
398 ret = sys_rt_sigprocmask(how, 410 ret = sys_rt_sigprocmask(how,
399 set ? (sigset_t __user *)&s : NULL, 411 set ? (sigset_t __user *)&s : NULL,
400 oset ? (sigset_t __user *)&s : NULL, 412 oset ? (sigset_t __user *)&s : NULL,
401 sigsetsize); 413 sigsetsize);
402 set_fs (old_fs); 414 set_fs(old_fs);
403 if (ret) return ret; 415 if (ret)
416 return ret;
404 if (oset) { 417 if (oset) {
405 switch (_NSIG_WORDS) { 418 switch (_NSIG_WORDS) {
406 case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3]; 419 case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3];
@@ -408,52 +421,49 @@ sys32_rt_sigprocmask(int how, compat_sigset_t __user *set,
408 case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1]; 421 case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1];
409 case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0]; 422 case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0];
410 } 423 }
411 if (copy_to_user (oset, &s32, sizeof(compat_sigset_t))) 424 if (copy_to_user(oset, &s32, sizeof(compat_sigset_t)))
412 return -EFAULT; 425 return -EFAULT;
413 } 426 }
414 return 0; 427 return 0;
415} 428}
416 429
417static inline long 430static inline long get_tv32(struct timeval *o, struct compat_timeval __user *i)
418get_tv32(struct timeval *o, struct compat_timeval __user *i)
419{ 431{
420 int err = -EFAULT; 432 int err = -EFAULT;
421 if (access_ok(VERIFY_READ, i, sizeof(*i))) { 433
434 if (access_ok(VERIFY_READ, i, sizeof(*i))) {
422 err = __get_user(o->tv_sec, &i->tv_sec); 435 err = __get_user(o->tv_sec, &i->tv_sec);
423 err |= __get_user(o->tv_usec, &i->tv_usec); 436 err |= __get_user(o->tv_usec, &i->tv_usec);
424 } 437 }
425 return err; 438 return err;
426} 439}
427 440
428static inline long 441static inline long put_tv32(struct compat_timeval __user *o, struct timeval *i)
429put_tv32(struct compat_timeval __user *o, struct timeval *i)
430{ 442{
431 int err = -EFAULT; 443 int err = -EFAULT;
432 if (access_ok(VERIFY_WRITE, o, sizeof(*o))) { 444
445 if (access_ok(VERIFY_WRITE, o, sizeof(*o))) {
433 err = __put_user(i->tv_sec, &o->tv_sec); 446 err = __put_user(i->tv_sec, &o->tv_sec);
434 err |= __put_user(i->tv_usec, &o->tv_usec); 447 err |= __put_user(i->tv_usec, &o->tv_usec);
435 } 448 }
436 return err; 449 return err;
437} 450}
438 451
439extern unsigned int alarm_setitimer(unsigned int seconds); 452asmlinkage long sys32_alarm(unsigned int seconds)
440
441asmlinkage long
442sys32_alarm(unsigned int seconds)
443{ 453{
444 return alarm_setitimer(seconds); 454 return alarm_setitimer(seconds);
445} 455}
446 456
447/* Translations due to time_t size differences. Which affects all 457/*
448 sorts of things, like timeval and itimerval. */ 458 * Translations due to time_t size differences. Which affects all
449 459 * sorts of things, like timeval and itimerval.
450extern struct timezone sys_tz; 460 */
451 461asmlinkage long sys32_gettimeofday(struct compat_timeval __user *tv,
452asmlinkage long 462 struct timezone __user *tz)
453sys32_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz)
454{ 463{
455 if (tv) { 464 if (tv) {
456 struct timeval ktv; 465 struct timeval ktv;
466
457 do_gettimeofday(&ktv); 467 do_gettimeofday(&ktv);
458 if (put_tv32(tv, &ktv)) 468 if (put_tv32(tv, &ktv))
459 return -EFAULT; 469 return -EFAULT;
@@ -465,14 +475,14 @@ sys32_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz)
465 return 0; 475 return 0;
466} 476}
467 477
468asmlinkage long 478asmlinkage long sys32_settimeofday(struct compat_timeval __user *tv,
469sys32_settimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) 479 struct timezone __user *tz)
470{ 480{
471 struct timeval ktv; 481 struct timeval ktv;
472 struct timespec kts; 482 struct timespec kts;
473 struct timezone ktz; 483 struct timezone ktz;
474 484
475 if (tv) { 485 if (tv) {
476 if (get_tv32(&ktv, tv)) 486 if (get_tv32(&ktv, tv))
477 return -EFAULT; 487 return -EFAULT;
478 kts.tv_sec = ktv.tv_sec; 488 kts.tv_sec = ktv.tv_sec;
@@ -494,8 +504,7 @@ struct sel_arg_struct {
494 unsigned int tvp; 504 unsigned int tvp;
495}; 505};
496 506
497asmlinkage long 507asmlinkage long sys32_old_select(struct sel_arg_struct __user *arg)
498sys32_old_select(struct sel_arg_struct __user *arg)
499{ 508{
500 struct sel_arg_struct a; 509 struct sel_arg_struct a;
501 510
@@ -505,50 +514,45 @@ sys32_old_select(struct sel_arg_struct __user *arg)
505 compat_ptr(a.exp), compat_ptr(a.tvp)); 514 compat_ptr(a.exp), compat_ptr(a.tvp));
506} 515}
507 516
508extern asmlinkage long 517asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr,
509compat_sys_wait4(compat_pid_t pid, compat_uint_t * stat_addr, int options, 518 int options)
510 struct compat_rusage *ru);
511
512asmlinkage long
513sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr, int options)
514{ 519{
515 return compat_sys_wait4(pid, stat_addr, options, NULL); 520 return compat_sys_wait4(pid, stat_addr, options, NULL);
516} 521}
517 522
518/* 32-bit timeval and related flotsam. */ 523/* 32-bit timeval and related flotsam. */
519 524
520asmlinkage long 525asmlinkage long sys32_sysfs(int option, u32 arg1, u32 arg2)
521sys32_sysfs(int option, u32 arg1, u32 arg2)
522{ 526{
523 return sys_sysfs(option, arg1, arg2); 527 return sys_sysfs(option, arg1, arg2);
524} 528}
525 529
526asmlinkage long 530asmlinkage long sys32_sched_rr_get_interval(compat_pid_t pid,
527sys32_sched_rr_get_interval(compat_pid_t pid, struct compat_timespec __user *interval) 531 struct compat_timespec __user *interval)
528{ 532{
529 struct timespec t; 533 struct timespec t;
530 int ret; 534 int ret;
531 mm_segment_t old_fs = get_fs (); 535 mm_segment_t old_fs = get_fs();
532 536
533 set_fs (KERNEL_DS); 537 set_fs(KERNEL_DS);
534 ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t); 538 ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t);
535 set_fs (old_fs); 539 set_fs(old_fs);
536 if (put_compat_timespec(&t, interval)) 540 if (put_compat_timespec(&t, interval))
537 return -EFAULT; 541 return -EFAULT;
538 return ret; 542 return ret;
539} 543}
540 544
541asmlinkage long 545asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *set,
542sys32_rt_sigpending(compat_sigset_t __user *set, compat_size_t sigsetsize) 546 compat_size_t sigsetsize)
543{ 547{
544 sigset_t s; 548 sigset_t s;
545 compat_sigset_t s32; 549 compat_sigset_t s32;
546 int ret; 550 int ret;
547 mm_segment_t old_fs = get_fs(); 551 mm_segment_t old_fs = get_fs();
548 552
549 set_fs (KERNEL_DS); 553 set_fs(KERNEL_DS);
550 ret = sys_rt_sigpending((sigset_t __user *)&s, sigsetsize); 554 ret = sys_rt_sigpending((sigset_t __user *)&s, sigsetsize);
551 set_fs (old_fs); 555 set_fs(old_fs);
552 if (!ret) { 556 if (!ret) {
553 switch (_NSIG_WORDS) { 557 switch (_NSIG_WORDS) {
554 case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3]; 558 case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3];
@@ -556,30 +560,29 @@ sys32_rt_sigpending(compat_sigset_t __user *set, compat_size_t sigsetsize)
556 case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1]; 560 case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1];
557 case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0]; 561 case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0];
558 } 562 }
559 if (copy_to_user (set, &s32, sizeof(compat_sigset_t))) 563 if (copy_to_user(set, &s32, sizeof(compat_sigset_t)))
560 return -EFAULT; 564 return -EFAULT;
561 } 565 }
562 return ret; 566 return ret;
563} 567}
564 568
565asmlinkage long 569asmlinkage long sys32_rt_sigqueueinfo(int pid, int sig,
566sys32_rt_sigqueueinfo(int pid, int sig, compat_siginfo_t __user *uinfo) 570 compat_siginfo_t __user *uinfo)
567{ 571{
568 siginfo_t info; 572 siginfo_t info;
569 int ret; 573 int ret;
570 mm_segment_t old_fs = get_fs(); 574 mm_segment_t old_fs = get_fs();
571 575
572 if (copy_siginfo_from_user32(&info, uinfo)) 576 if (copy_siginfo_from_user32(&info, uinfo))
573 return -EFAULT; 577 return -EFAULT;
574 set_fs (KERNEL_DS); 578 set_fs(KERNEL_DS);
575 ret = sys_rt_sigqueueinfo(pid, sig, (siginfo_t __user *)&info); 579 ret = sys_rt_sigqueueinfo(pid, sig, (siginfo_t __user *)&info);
576 set_fs (old_fs); 580 set_fs(old_fs);
577 return ret; 581 return ret;
578} 582}
579 583
580/* These are here just in case some old ia32 binary calls it. */ 584/* These are here just in case some old ia32 binary calls it. */
581asmlinkage long 585asmlinkage long sys32_pause(void)
582sys32_pause(void)
583{ 586{
584 current->state = TASK_INTERRUPTIBLE; 587 current->state = TASK_INTERRUPTIBLE;
585 schedule(); 588 schedule();
@@ -599,25 +602,25 @@ struct sysctl_ia32 {
599}; 602};
600 603
601 604
602asmlinkage long 605asmlinkage long sys32_sysctl(struct sysctl_ia32 __user *args32)
603sys32_sysctl(struct sysctl_ia32 __user *args32)
604{ 606{
605 struct sysctl_ia32 a32; 607 struct sysctl_ia32 a32;
606 mm_segment_t old_fs = get_fs (); 608 mm_segment_t old_fs = get_fs();
607 void __user *oldvalp, *newvalp; 609 void __user *oldvalp, *newvalp;
608 size_t oldlen; 610 size_t oldlen;
609 int __user *namep; 611 int __user *namep;
610 long ret; 612 long ret;
611 613
612 if (copy_from_user(&a32, args32, sizeof (a32))) 614 if (copy_from_user(&a32, args32, sizeof(a32)))
613 return -EFAULT; 615 return -EFAULT;
614 616
615 /* 617 /*
616 * We need to pre-validate these because we have to disable address checking 618 * We need to pre-validate these because we have to disable
617 * before calling do_sysctl() because of OLDLEN but we can't run the risk of the 619 * address checking before calling do_sysctl() because of
618 * user specifying bad addresses here. Well, since we're dealing with 32 bit 620 * OLDLEN but we can't run the risk of the user specifying bad
619 * addresses, we KNOW that access_ok() will always succeed, so this is an 621 * addresses here. Well, since we're dealing with 32 bit
620 * expensive NOP, but so what... 622 * addresses, we KNOW that access_ok() will always succeed, so
623 * this is an expensive NOP, but so what...
621 */ 624 */
622 namep = compat_ptr(a32.name); 625 namep = compat_ptr(a32.name);
623 oldvalp = compat_ptr(a32.oldval); 626 oldvalp = compat_ptr(a32.oldval);
@@ -636,34 +639,34 @@ sys32_sysctl(struct sysctl_ia32 __user *args32)
636 unlock_kernel(); 639 unlock_kernel();
637 set_fs(old_fs); 640 set_fs(old_fs);
638 641
639 if (oldvalp && put_user (oldlen, (int __user *)compat_ptr(a32.oldlenp))) 642 if (oldvalp && put_user(oldlen, (int __user *)compat_ptr(a32.oldlenp)))
640 return -EFAULT; 643 return -EFAULT;
641 644
642 return ret; 645 return ret;
643} 646}
644#endif 647#endif
645 648
646/* warning: next two assume little endian */ 649/* warning: next two assume little endian */
647asmlinkage long 650asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count,
648sys32_pread(unsigned int fd, char __user *ubuf, u32 count, u32 poslo, u32 poshi) 651 u32 poslo, u32 poshi)
649{ 652{
650 return sys_pread64(fd, ubuf, count, 653 return sys_pread64(fd, ubuf, count,
651 ((loff_t)AA(poshi) << 32) | AA(poslo)); 654 ((loff_t)AA(poshi) << 32) | AA(poslo));
652} 655}
653 656
654asmlinkage long 657asmlinkage long sys32_pwrite(unsigned int fd, char __user *ubuf, u32 count,
655sys32_pwrite(unsigned int fd, char __user *ubuf, u32 count, u32 poslo, u32 poshi) 658 u32 poslo, u32 poshi)
656{ 659{
657 return sys_pwrite64(fd, ubuf, count, 660 return sys_pwrite64(fd, ubuf, count,
658 ((loff_t)AA(poshi) << 32) | AA(poslo)); 661 ((loff_t)AA(poshi) << 32) | AA(poslo));
659} 662}
660 663
661 664
662asmlinkage long 665asmlinkage long sys32_personality(unsigned long personality)
663sys32_personality(unsigned long personality)
664{ 666{
665 int ret; 667 int ret;
666 if (personality(current->personality) == PER_LINUX32 && 668
669 if (personality(current->personality) == PER_LINUX32 &&
667 personality == PER_LINUX) 670 personality == PER_LINUX)
668 personality = PER_LINUX32; 671 personality = PER_LINUX32;
669 ret = sys_personality(personality); 672 ret = sys_personality(personality);
@@ -672,34 +675,33 @@ sys32_personality(unsigned long personality)
672 return ret; 675 return ret;
673} 676}
674 677
675asmlinkage long 678asmlinkage long sys32_sendfile(int out_fd, int in_fd,
676sys32_sendfile(int out_fd, int in_fd, compat_off_t __user *offset, s32 count) 679 compat_off_t __user *offset, s32 count)
677{ 680{
678 mm_segment_t old_fs = get_fs(); 681 mm_segment_t old_fs = get_fs();
679 int ret; 682 int ret;
680 off_t of; 683 off_t of;
681 684
682 if (offset && get_user(of, offset)) 685 if (offset && get_user(of, offset))
683 return -EFAULT; 686 return -EFAULT;
684 687
685 set_fs(KERNEL_DS); 688 set_fs(KERNEL_DS);
686 ret = sys_sendfile(out_fd, in_fd, offset ? (off_t __user *)&of : NULL, 689 ret = sys_sendfile(out_fd, in_fd, offset ? (off_t __user *)&of : NULL,
687 count); 690 count);
688 set_fs(old_fs); 691 set_fs(old_fs);
689 692
690 if (offset && put_user(of, offset)) 693 if (offset && put_user(of, offset))
691 return -EFAULT; 694 return -EFAULT;
692
693 return ret; 695 return ret;
694} 696}
695 697
696asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len, 698asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len,
697 unsigned long prot, unsigned long flags, 699 unsigned long prot, unsigned long flags,
698 unsigned long fd, unsigned long pgoff) 700 unsigned long fd, unsigned long pgoff)
699{ 701{
700 struct mm_struct *mm = current->mm; 702 struct mm_struct *mm = current->mm;
701 unsigned long error; 703 unsigned long error;
702 struct file * file = NULL; 704 struct file *file = NULL;
703 705
704 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); 706 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
705 if (!(flags & MAP_ANONYMOUS)) { 707 if (!(flags & MAP_ANONYMOUS)) {
@@ -717,36 +719,35 @@ asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len,
717 return error; 719 return error;
718} 720}
719 721
720asmlinkage long sys32_olduname(struct oldold_utsname __user * name) 722asmlinkage long sys32_olduname(struct oldold_utsname __user *name)
721{ 723{
724 char *arch = "x86_64";
722 int err; 725 int err;
723 726
724 if (!name) 727 if (!name)
725 return -EFAULT; 728 return -EFAULT;
726 if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) 729 if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
727 return -EFAULT; 730 return -EFAULT;
728 731
729 down_read(&uts_sem); 732 down_read(&uts_sem);
730 733
731 err = __copy_to_user(&name->sysname,&utsname()->sysname, 734 err = __copy_to_user(&name->sysname, &utsname()->sysname,
732 __OLD_UTS_LEN); 735 __OLD_UTS_LEN);
733 err |= __put_user(0,name->sysname+__OLD_UTS_LEN); 736 err |= __put_user(0, name->sysname+__OLD_UTS_LEN);
734 err |= __copy_to_user(&name->nodename,&utsname()->nodename, 737 err |= __copy_to_user(&name->nodename, &utsname()->nodename,
735 __OLD_UTS_LEN); 738 __OLD_UTS_LEN);
736 err |= __put_user(0,name->nodename+__OLD_UTS_LEN); 739 err |= __put_user(0, name->nodename+__OLD_UTS_LEN);
737 err |= __copy_to_user(&name->release,&utsname()->release, 740 err |= __copy_to_user(&name->release, &utsname()->release,
738 __OLD_UTS_LEN); 741 __OLD_UTS_LEN);
739 err |= __put_user(0,name->release+__OLD_UTS_LEN); 742 err |= __put_user(0, name->release+__OLD_UTS_LEN);
740 err |= __copy_to_user(&name->version,&utsname()->version, 743 err |= __copy_to_user(&name->version, &utsname()->version,
741 __OLD_UTS_LEN); 744 __OLD_UTS_LEN);
742 err |= __put_user(0,name->version+__OLD_UTS_LEN); 745 err |= __put_user(0, name->version+__OLD_UTS_LEN);
743 { 746
744 char *arch = "x86_64"; 747 if (personality(current->personality) == PER_LINUX32)
745 if (personality(current->personality) == PER_LINUX32) 748 arch = "i686";
746 arch = "i686"; 749
747 750 err |= __copy_to_user(&name->machine, arch, strlen(arch) + 1);
748 err |= __copy_to_user(&name->machine, arch, strlen(arch)+1);
749 }
750 751
751 up_read(&uts_sem); 752 up_read(&uts_sem);
752 753
@@ -755,17 +756,19 @@ asmlinkage long sys32_olduname(struct oldold_utsname __user * name)
755 return err; 756 return err;
756} 757}
757 758
758long sys32_uname(struct old_utsname __user * name) 759long sys32_uname(struct old_utsname __user *name)
759{ 760{
760 int err; 761 int err;
762
761 if (!name) 763 if (!name)
762 return -EFAULT; 764 return -EFAULT;
763 down_read(&uts_sem); 765 down_read(&uts_sem);
764 err = copy_to_user(name, utsname(), sizeof (*name)); 766 err = copy_to_user(name, utsname(), sizeof(*name));
765 up_read(&uts_sem); 767 up_read(&uts_sem);
766 if (personality(current->personality) == PER_LINUX32) 768 if (personality(current->personality) == PER_LINUX32)
767 err |= copy_to_user(&name->machine, "i686", 5); 769 err |= copy_to_user(&name->machine, "i686", 5);
768 return err?-EFAULT:0; 770
771 return err ? -EFAULT : 0;
769} 772}
770 773
771long sys32_ustat(unsigned dev, struct ustat32 __user *u32p) 774long sys32_ustat(unsigned dev, struct ustat32 __user *u32p)
@@ -773,27 +776,28 @@ long sys32_ustat(unsigned dev, struct ustat32 __user *u32p)
773 struct ustat u; 776 struct ustat u;
774 mm_segment_t seg; 777 mm_segment_t seg;
775 int ret; 778 int ret;
776 779
777 seg = get_fs(); 780 seg = get_fs();
778 set_fs(KERNEL_DS); 781 set_fs(KERNEL_DS);
779 ret = sys_ustat(dev, (struct ustat __user *)&u); 782 ret = sys_ustat(dev, (struct ustat __user *)&u);
780 set_fs(seg); 783 set_fs(seg);
781 if (ret >= 0) { 784 if (ret < 0)
782 if (!access_ok(VERIFY_WRITE,u32p,sizeof(struct ustat32)) || 785 return ret;
783 __put_user((__u32) u.f_tfree, &u32p->f_tfree) || 786
784 __put_user((__u32) u.f_tinode, &u32p->f_tfree) || 787 if (!access_ok(VERIFY_WRITE, u32p, sizeof(struct ustat32)) ||
785 __copy_to_user(&u32p->f_fname, u.f_fname, sizeof(u.f_fname)) || 788 __put_user((__u32) u.f_tfree, &u32p->f_tfree) ||
786 __copy_to_user(&u32p->f_fpack, u.f_fpack, sizeof(u.f_fpack))) 789 __put_user((__u32) u.f_tinode, &u32p->f_tfree) ||
787 ret = -EFAULT; 790 __copy_to_user(&u32p->f_fname, u.f_fname, sizeof(u.f_fname)) ||
788 } 791 __copy_to_user(&u32p->f_fpack, u.f_fpack, sizeof(u.f_fpack)))
792 ret = -EFAULT;
789 return ret; 793 return ret;
790} 794}
791 795
792asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv, 796asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv,
793 compat_uptr_t __user *envp, struct pt_regs *regs) 797 compat_uptr_t __user *envp, struct pt_regs *regs)
794{ 798{
795 long error; 799 long error;
796 char * filename; 800 char *filename;
797 801
798 filename = getname(name); 802 filename = getname(name);
799 error = PTR_ERR(filename); 803 error = PTR_ERR(filename);
@@ -812,18 +816,19 @@ asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv,
812asmlinkage long sys32_clone(unsigned int clone_flags, unsigned int newsp, 816asmlinkage long sys32_clone(unsigned int clone_flags, unsigned int newsp,
813 struct pt_regs *regs) 817 struct pt_regs *regs)
814{ 818{
815 void __user *parent_tid = (void __user *)regs->rdx; 819 void __user *parent_tid = (void __user *)regs->dx;
816 void __user *child_tid = (void __user *)regs->rdi; 820 void __user *child_tid = (void __user *)regs->di;
821
817 if (!newsp) 822 if (!newsp)
818 newsp = regs->rsp; 823 newsp = regs->sp;
819 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); 824 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
820} 825}
821 826
822/* 827/*
823 * Some system calls that need sign extended arguments. This could be done by a generic wrapper. 828 * Some system calls that need sign extended arguments. This could be
824 */ 829 * done by a generic wrapper.
825 830 */
826long sys32_lseek (unsigned int fd, int offset, unsigned int whence) 831long sys32_lseek(unsigned int fd, int offset, unsigned int whence)
827{ 832{
828 return sys_lseek(fd, offset, whence); 833 return sys_lseek(fd, offset, whence);
829} 834}
@@ -832,49 +837,52 @@ long sys32_kill(int pid, int sig)
832{ 837{
833 return sys_kill(pid, sig); 838 return sys_kill(pid, sig);
834} 839}
835 840
836long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high, 841long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high,
837 __u32 len_low, __u32 len_high, int advice) 842 __u32 len_low, __u32 len_high, int advice)
838{ 843{
839 return sys_fadvise64_64(fd, 844 return sys_fadvise64_64(fd,
840 (((u64)offset_high)<<32) | offset_low, 845 (((u64)offset_high)<<32) | offset_low,
841 (((u64)len_high)<<32) | len_low, 846 (((u64)len_high)<<32) | len_low,
842 advice); 847 advice);
843} 848}
844 849
845long sys32_vm86_warning(void) 850long sys32_vm86_warning(void)
846{ 851{
847 struct task_struct *me = current; 852 struct task_struct *me = current;
848 static char lastcomm[sizeof(me->comm)]; 853 static char lastcomm[sizeof(me->comm)];
854
849 if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) { 855 if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
850 compat_printk(KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n", 856 compat_printk(KERN_INFO
851 me->comm); 857 "%s: vm86 mode not supported on 64 bit kernel\n",
858 me->comm);
852 strncpy(lastcomm, me->comm, sizeof(lastcomm)); 859 strncpy(lastcomm, me->comm, sizeof(lastcomm));
853 } 860 }
854 return -ENOSYS; 861 return -ENOSYS;
855} 862}
856 863
857long sys32_lookup_dcookie(u32 addr_low, u32 addr_high, 864long sys32_lookup_dcookie(u32 addr_low, u32 addr_high,
858 char __user * buf, size_t len) 865 char __user *buf, size_t len)
859{ 866{
860 return sys_lookup_dcookie(((u64)addr_high << 32) | addr_low, buf, len); 867 return sys_lookup_dcookie(((u64)addr_high << 32) | addr_low, buf, len);
861} 868}
862 869
863asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi, size_t count) 870asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi,
871 size_t count)
864{ 872{
865 return sys_readahead(fd, ((u64)off_hi << 32) | off_lo, count); 873 return sys_readahead(fd, ((u64)off_hi << 32) | off_lo, count);
866} 874}
867 875
868asmlinkage long sys32_sync_file_range(int fd, unsigned off_low, unsigned off_hi, 876asmlinkage long sys32_sync_file_range(int fd, unsigned off_low, unsigned off_hi,
869 unsigned n_low, unsigned n_hi, int flags) 877 unsigned n_low, unsigned n_hi, int flags)
870{ 878{
871 return sys_sync_file_range(fd, 879 return sys_sync_file_range(fd,
872 ((u64)off_hi << 32) | off_low, 880 ((u64)off_hi << 32) | off_low,
873 ((u64)n_hi << 32) | n_low, flags); 881 ((u64)n_hi << 32) | n_low, flags);
874} 882}
875 883
876asmlinkage long sys32_fadvise64(int fd, unsigned offset_lo, unsigned offset_hi, size_t len, 884asmlinkage long sys32_fadvise64(int fd, unsigned offset_lo, unsigned offset_hi,
877 int advice) 885 size_t len, int advice)
878{ 886{
879 return sys_fadvise64_64(fd, ((u64)offset_hi << 32) | offset_lo, 887 return sys_fadvise64_64(fd, ((u64)offset_hi << 32) | offset_lo,
880 len, advice); 888 len, advice);
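
Note: the sys32_pread/pwrite/readahead/fadvise wrappers in the hunk above all rebuild a 64-bit file offset from two 32-bit halves with ((u64)hi << 32) | lo. A minimal stand-alone sketch of that reassembly follows; the helper name and the demo values are hypothetical and not part of this patch.

        /*
         * Illustration only: join the lo/hi 32-bit halves a compat caller
         * passes in two registers into one 64-bit value, exactly as
         * sys32_pread() and sys32_fadvise64_64() do above.
         */
        #include <stdint.h>
        #include <stdio.h>

        static uint64_t join_u32(uint32_t lo, uint32_t hi)
        {
                return ((uint64_t)hi << 32) | lo;
        }

        int main(void)
        {
                /* hi = 0x1, lo = 0x200: 4294967296 + 512 = 4294967808 */
                printf("%llu\n", (unsigned long long)join_u32(0x200, 0x1));
                return 0;
        }
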
diff --git a/arch/x86/ia32/syscall32.c b/arch/x86/ia32/syscall32.c
deleted file mode 100644
index 15013bac181c..000000000000
--- a/arch/x86/ia32/syscall32.c
+++ /dev/null
@@ -1,83 +0,0 @@
1/* Copyright 2002,2003 Andi Kleen, SuSE Labs */
2
3/* vsyscall handling for 32bit processes. Map a stub page into it
4 on demand because 32bit cannot reach the kernel's fixmaps */
5
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/kernel.h>
9#include <linux/gfp.h>
10#include <linux/init.h>
11#include <linux/stringify.h>
12#include <linux/security.h>
13#include <asm/proto.h>
14#include <asm/tlbflush.h>
15#include <asm/ia32_unistd.h>
16#include <asm/vsyscall32.h>
17
18extern unsigned char syscall32_syscall[], syscall32_syscall_end[];
19extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[];
20extern int sysctl_vsyscall32;
21
22static struct page *syscall32_pages[1];
23static int use_sysenter = -1;
24
25struct linux_binprm;
26
27/* Setup a VMA at program startup for the vsyscall page */
28int syscall32_setup_pages(struct linux_binprm *bprm, int exstack)
29{
30 struct mm_struct *mm = current->mm;
31 int ret;
32
33 down_write(&mm->mmap_sem);
34 /*
35 * MAYWRITE to allow gdb to COW and set breakpoints
36 *
37 * Make sure the vDSO gets into every core dump.
38 * Dumping its contents makes post-mortem fully interpretable later
39 * without matching up the same kernel and hardware config to see
40 * what PC values meant.
41 */
42 /* Could randomize here */
43 ret = install_special_mapping(mm, VSYSCALL32_BASE, PAGE_SIZE,
44 VM_READ|VM_EXEC|
45 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
46 VM_ALWAYSDUMP,
47 syscall32_pages);
48 up_write(&mm->mmap_sem);
49 return ret;
50}
51
52static int __init init_syscall32(void)
53{
54 char *syscall32_page = (void *)get_zeroed_page(GFP_KERNEL);
55 if (!syscall32_page)
56 panic("Cannot allocate syscall32 page");
57 syscall32_pages[0] = virt_to_page(syscall32_page);
58 if (use_sysenter > 0) {
59 memcpy(syscall32_page, syscall32_sysenter,
60 syscall32_sysenter_end - syscall32_sysenter);
61 } else {
62 memcpy(syscall32_page, syscall32_syscall,
63 syscall32_syscall_end - syscall32_syscall);
64 }
65 return 0;
66}
67
68__initcall(init_syscall32);
69
70/* May not be __init: called during resume */
71void syscall32_cpu_init(void)
72{
73 if (use_sysenter < 0)
74 use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL);
75
76 /* Load these always in case some future AMD CPU supports
77 SYSENTER from compat mode too. */
78 checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
79 checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL);
80 checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
81
82 wrmsrl(MSR_CSTAR, ia32_cstar_target);
83}
diff --git a/arch/x86/ia32/syscall32_syscall.S b/arch/x86/ia32/syscall32_syscall.S
deleted file mode 100644
index 933f0f08b1cf..000000000000
--- a/arch/x86/ia32/syscall32_syscall.S
+++ /dev/null
@@ -1,17 +0,0 @@
1/* 32bit VDSOs mapped into user space. */
2
3 .section ".init.data","aw"
4
5 .globl syscall32_syscall
6 .globl syscall32_syscall_end
7
8syscall32_syscall:
9 .incbin "arch/x86/ia32/vsyscall-syscall.so"
10syscall32_syscall_end:
11
12 .globl syscall32_sysenter
13 .globl syscall32_sysenter_end
14
15syscall32_sysenter:
16 .incbin "arch/x86/ia32/vsyscall-sysenter.so"
17syscall32_sysenter_end:
diff --git a/arch/x86/ia32/tls32.c b/arch/x86/ia32/tls32.c
deleted file mode 100644
index 1cc4340de3ca..000000000000
--- a/arch/x86/ia32/tls32.c
+++ /dev/null
@@ -1,163 +0,0 @@
1#include <linux/kernel.h>
2#include <linux/errno.h>
3#include <linux/sched.h>
4#include <linux/user.h>
5
6#include <asm/uaccess.h>
7#include <asm/desc.h>
8#include <asm/system.h>
9#include <asm/ldt.h>
10#include <asm/processor.h>
11#include <asm/proto.h>
12
13/*
14 * sys_alloc_thread_area: get a yet unused TLS descriptor index.
15 */
16static int get_free_idx(void)
17{
18 struct thread_struct *t = &current->thread;
19 int idx;
20
21 for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
22 if (desc_empty((struct n_desc_struct *)(t->tls_array) + idx))
23 return idx + GDT_ENTRY_TLS_MIN;
24 return -ESRCH;
25}
26
27/*
28 * Set a given TLS descriptor:
29 * When you want addresses > 32bit use arch_prctl()
30 */
31int do_set_thread_area(struct thread_struct *t, struct user_desc __user *u_info)
32{
33 struct user_desc info;
34 struct n_desc_struct *desc;
35 int cpu, idx;
36
37 if (copy_from_user(&info, u_info, sizeof(info)))
38 return -EFAULT;
39
40 idx = info.entry_number;
41
42 /*
43 * index -1 means the kernel should try to find and
44 * allocate an empty descriptor:
45 */
46 if (idx == -1) {
47 idx = get_free_idx();
48 if (idx < 0)
49 return idx;
50 if (put_user(idx, &u_info->entry_number))
51 return -EFAULT;
52 }
53
54 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
55 return -EINVAL;
56
57 desc = ((struct n_desc_struct *)t->tls_array) + idx - GDT_ENTRY_TLS_MIN;
58
59 /*
60 * We must not get preempted while modifying the TLS.
61 */
62 cpu = get_cpu();
63
64 if (LDT_empty(&info)) {
65 desc->a = 0;
66 desc->b = 0;
67 } else {
68 desc->a = LDT_entry_a(&info);
69 desc->b = LDT_entry_b(&info);
70 }
71 if (t == &current->thread)
72 load_TLS(t, cpu);
73
74 put_cpu();
75 return 0;
76}
77
78asmlinkage long sys32_set_thread_area(struct user_desc __user *u_info)
79{
80 return do_set_thread_area(&current->thread, u_info);
81}
82
83
84/*
85 * Get the current Thread-Local Storage area:
86 */
87
88#define GET_BASE(desc) ( \
89 (((desc)->a >> 16) & 0x0000ffff) | \
90 (((desc)->b << 16) & 0x00ff0000) | \
91 ( (desc)->b & 0xff000000) )
92
93#define GET_LIMIT(desc) ( \
94 ((desc)->a & 0x0ffff) | \
95 ((desc)->b & 0xf0000) )
96
97#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
98#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
99#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
100#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
101#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
102#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
103#define GET_LONGMODE(desc) (((desc)->b >> 21) & 1)
104
105int do_get_thread_area(struct thread_struct *t, struct user_desc __user *u_info)
106{
107 struct user_desc info;
108 struct n_desc_struct *desc;
109 int idx;
110
111 if (get_user(idx, &u_info->entry_number))
112 return -EFAULT;
113 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
114 return -EINVAL;
115
116 desc = ((struct n_desc_struct *)t->tls_array) + idx - GDT_ENTRY_TLS_MIN;
117
118 memset(&info, 0, sizeof(struct user_desc));
119 info.entry_number = idx;
120 info.base_addr = GET_BASE(desc);
121 info.limit = GET_LIMIT(desc);
122 info.seg_32bit = GET_32BIT(desc);
123 info.contents = GET_CONTENTS(desc);
124 info.read_exec_only = !GET_WRITABLE(desc);
125 info.limit_in_pages = GET_LIMIT_PAGES(desc);
126 info.seg_not_present = !GET_PRESENT(desc);
127 info.useable = GET_USEABLE(desc);
128 info.lm = GET_LONGMODE(desc);
129
130 if (copy_to_user(u_info, &info, sizeof(info)))
131 return -EFAULT;
132 return 0;
133}
134
135asmlinkage long sys32_get_thread_area(struct user_desc __user *u_info)
136{
137 return do_get_thread_area(&current->thread, u_info);
138}
139
140
141int ia32_child_tls(struct task_struct *p, struct pt_regs *childregs)
142{
143 struct n_desc_struct *desc;
144 struct user_desc info;
145 struct user_desc __user *cp;
146 int idx;
147
148 cp = (void __user *)childregs->rsi;
149 if (copy_from_user(&info, cp, sizeof(info)))
150 return -EFAULT;
151 if (LDT_empty(&info))
152 return -EINVAL;
153
154 idx = info.entry_number;
155 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
156 return -EINVAL;
157
158 desc = (struct n_desc_struct *)(p->thread.tls_array) + idx - GDT_ENTRY_TLS_MIN;
159 desc->a = LDT_entry_a(&info);
160 desc->b = LDT_entry_b(&info);
161
162 return 0;
163}
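
Note: the GET_BASE()/GET_LIMIT() macros in the removed tls32.c above decode the split fields of a legacy GDT descriptor. A stand-alone sketch of the same bit extraction, with hypothetical function names used only for illustration:

        #include <stdint.h>

        /*
         * Mirror of the removed GET_BASE()/GET_LIMIT() macros: a legacy
         * segment descriptor scatters the 32-bit base and 20-bit limit
         * across its two 32-bit words (a = low word, b = high word).
         */
        static uint32_t desc_base(uint32_t a, uint32_t b)
        {
                return ((a >> 16) & 0x0000ffff) |  /* base bits 15..0  */
                       ((b << 16) & 0x00ff0000) |  /* base bits 23..16 */
                       ( b        & 0xff000000);   /* base bits 31..24 */
        }

        static uint32_t desc_limit(uint32_t a, uint32_t b)
        {
                return (a & 0x0ffff) | (b & 0xf0000);  /* limit bits 19..0 */
        }
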
diff --git a/arch/x86/ia32/vsyscall-sigreturn.S b/arch/x86/ia32/vsyscall-sigreturn.S
deleted file mode 100644
index b383be00baec..000000000000
--- a/arch/x86/ia32/vsyscall-sigreturn.S
+++ /dev/null
@@ -1,143 +0,0 @@
1/*
2 * Common code for the sigreturn entry points on the vsyscall page.
3 * This code uses SYSCALL_ENTER_KERNEL (either syscall or int $0x80)
4 * to enter the kernel.
5 * This file is #include'd by vsyscall-*.S to define them after the
6 * vsyscall entry point. The addresses we get for these entry points
7 * by doing ".balign 32" must match in both versions of the page.
8 */
9
10 .code32
11 .section .text.sigreturn,"ax"
12 .balign 32
13 .globl __kernel_sigreturn
14 .type __kernel_sigreturn,@function
15__kernel_sigreturn:
16.LSTART_sigreturn:
17 popl %eax
18 movl $__NR_ia32_sigreturn, %eax
19 SYSCALL_ENTER_KERNEL
20.LEND_sigreturn:
21 .size __kernel_sigreturn,.-.LSTART_sigreturn
22
23 .section .text.rtsigreturn,"ax"
24 .balign 32
25 .globl __kernel_rt_sigreturn
26 .type __kernel_rt_sigreturn,@function
27__kernel_rt_sigreturn:
28.LSTART_rt_sigreturn:
29 movl $__NR_ia32_rt_sigreturn, %eax
30 SYSCALL_ENTER_KERNEL
31.LEND_rt_sigreturn:
32 .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn
33
34 .section .eh_frame,"a",@progbits
35.LSTARTFRAMES:
36 .long .LENDCIES-.LSTARTCIES
37.LSTARTCIES:
38 .long 0 /* CIE ID */
39 .byte 1 /* Version number */
40 .string "zRS" /* NUL-terminated augmentation string */
41 .uleb128 1 /* Code alignment factor */
42 .sleb128 -4 /* Data alignment factor */
43 .byte 8 /* Return address register column */
44 .uleb128 1 /* Augmentation value length */
45 .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
46 .byte 0x0c /* DW_CFA_def_cfa */
47 .uleb128 4
48 .uleb128 4
49 .byte 0x88 /* DW_CFA_offset, column 0x8 */
50 .uleb128 1
51 .align 4
52.LENDCIES:
53
54 .long .LENDFDE2-.LSTARTFDE2 /* Length FDE */
55.LSTARTFDE2:
56 .long .LSTARTFDE2-.LSTARTFRAMES /* CIE pointer */
57 /* HACK: The dwarf2 unwind routines will subtract 1 from the
58 return address to get an address in the middle of the
59 presumed call instruction. Since we didn't get here via
60 a call, we need to include the nop before the real start
61 to make up for it. */
62 .long .LSTART_sigreturn-1-. /* PC-relative start address */
63 .long .LEND_sigreturn-.LSTART_sigreturn+1
64 .uleb128 0 /* Augmentation length */
65 /* What follows are the instructions for the table generation.
66 We record the locations of each register saved. This is
67 complicated by the fact that the "CFA" is always assumed to
68 be the value of the stack pointer in the caller. This means
69 that we must define the CFA of this body of code to be the
70 saved value of the stack pointer in the sigcontext. Which
71 also means that there is no fixed relation to the other
72 saved registers, which means that we must use DW_CFA_expression
73 to compute their addresses. It also means that when we
74 adjust the stack with the popl, we have to do it all over again. */
75
76#define do_cfa_expr(offset) \
77 .byte 0x0f; /* DW_CFA_def_cfa_expression */ \
78 .uleb128 1f-0f; /* length */ \
790: .byte 0x74; /* DW_OP_breg4 */ \
80 .sleb128 offset; /* offset */ \
81 .byte 0x06; /* DW_OP_deref */ \
821:
83
84#define do_expr(regno, offset) \
85 .byte 0x10; /* DW_CFA_expression */ \
86 .uleb128 regno; /* regno */ \
87 .uleb128 1f-0f; /* length */ \
880: .byte 0x74; /* DW_OP_breg4 */ \
89 .sleb128 offset; /* offset */ \
901:
91
92 do_cfa_expr(IA32_SIGCONTEXT_esp+4)
93 do_expr(0, IA32_SIGCONTEXT_eax+4)
94 do_expr(1, IA32_SIGCONTEXT_ecx+4)
95 do_expr(2, IA32_SIGCONTEXT_edx+4)
96 do_expr(3, IA32_SIGCONTEXT_ebx+4)
97 do_expr(5, IA32_SIGCONTEXT_ebp+4)
98 do_expr(6, IA32_SIGCONTEXT_esi+4)
99 do_expr(7, IA32_SIGCONTEXT_edi+4)
100 do_expr(8, IA32_SIGCONTEXT_eip+4)
101
102 .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */
103
104 do_cfa_expr(IA32_SIGCONTEXT_esp)
105 do_expr(0, IA32_SIGCONTEXT_eax)
106 do_expr(1, IA32_SIGCONTEXT_ecx)
107 do_expr(2, IA32_SIGCONTEXT_edx)
108 do_expr(3, IA32_SIGCONTEXT_ebx)
109 do_expr(5, IA32_SIGCONTEXT_ebp)
110 do_expr(6, IA32_SIGCONTEXT_esi)
111 do_expr(7, IA32_SIGCONTEXT_edi)
112 do_expr(8, IA32_SIGCONTEXT_eip)
113
114 .align 4
115.LENDFDE2:
116
117 .long .LENDFDE3-.LSTARTFDE3 /* Length FDE */
118.LSTARTFDE3:
119 .long .LSTARTFDE3-.LSTARTFRAMES /* CIE pointer */
120 /* HACK: See above wrt unwind library assumptions. */
121 .long .LSTART_rt_sigreturn-1-. /* PC-relative start address */
122 .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1
123 .uleb128 0 /* Augmentation */
124 /* What follows are the instructions for the table generation.
125 We record the locations of each register saved. This is
126 slightly less complicated than the above, since we don't
127 modify the stack pointer in the process. */
128
129 do_cfa_expr(IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_esp)
130 do_expr(0, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_eax)
131 do_expr(1, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ecx)
132 do_expr(2, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_edx)
133 do_expr(3, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ebx)
134 do_expr(5, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ebp)
135 do_expr(6, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_esi)
136 do_expr(7, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_edi)
137 do_expr(8, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_eip)
138
139 .align 4
140.LENDFDE3:
141
142#include "../../x86/kernel/vsyscall-note_32.S"
143
diff --git a/arch/x86/ia32/vsyscall-sysenter.S b/arch/x86/ia32/vsyscall-sysenter.S
deleted file mode 100644
index ae056e553d13..000000000000
--- a/arch/x86/ia32/vsyscall-sysenter.S
+++ /dev/null
@@ -1,95 +0,0 @@
1/*
2 * Code for the vsyscall page. This version uses the sysenter instruction.
3 */
4
5#include <asm/ia32_unistd.h>
6#include <asm/asm-offsets.h>
7
8 .code32
9 .text
10 .section .text.vsyscall,"ax"
11 .globl __kernel_vsyscall
12 .type __kernel_vsyscall,@function
13__kernel_vsyscall:
14.LSTART_vsyscall:
15 push %ecx
16.Lpush_ecx:
17 push %edx
18.Lpush_edx:
19 push %ebp
20.Lenter_kernel:
21 movl %esp,%ebp
22 sysenter
23 .space 7,0x90
24 jmp .Lenter_kernel
25 /* 16: System call normal return point is here! */
26 pop %ebp
27.Lpop_ebp:
28 pop %edx
29.Lpop_edx:
30 pop %ecx
31.Lpop_ecx:
32 ret
33.LEND_vsyscall:
34 .size __kernel_vsyscall,.-.LSTART_vsyscall
35
36 .section .eh_frame,"a",@progbits
37.LSTARTFRAME:
38 .long .LENDCIE-.LSTARTCIE
39.LSTARTCIE:
40 .long 0 /* CIE ID */
41 .byte 1 /* Version number */
42 .string "zR" /* NUL-terminated augmentation string */
43 .uleb128 1 /* Code alignment factor */
44 .sleb128 -4 /* Data alignment factor */
45 .byte 8 /* Return address register column */
46 .uleb128 1 /* Augmentation value length */
47 .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
48 .byte 0x0c /* DW_CFA_def_cfa */
49 .uleb128 4
50 .uleb128 4
51 .byte 0x88 /* DW_CFA_offset, column 0x8 */
52 .uleb128 1
53 .align 4
54.LENDCIE:
55
56 .long .LENDFDE1-.LSTARTFDE1 /* Length FDE */
57.LSTARTFDE1:
58 .long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */
59 .long .LSTART_vsyscall-. /* PC-relative start address */
60 .long .LEND_vsyscall-.LSTART_vsyscall
61 .uleb128 0 /* Augmentation length */
62 /* What follows are the instructions for the table generation.
63 We have to record all changes of the stack pointer. */
64 .byte 0x04 /* DW_CFA_advance_loc4 */
65 .long .Lpush_ecx-.LSTART_vsyscall
66 .byte 0x0e /* DW_CFA_def_cfa_offset */
67 .byte 0x08 /* RA at offset 8 now */
68 .byte 0x04 /* DW_CFA_advance_loc4 */
69 .long .Lpush_edx-.Lpush_ecx
70 .byte 0x0e /* DW_CFA_def_cfa_offset */
71 .byte 0x0c /* RA at offset 12 now */
72 .byte 0x04 /* DW_CFA_advance_loc4 */
73 .long .Lenter_kernel-.Lpush_edx
74 .byte 0x0e /* DW_CFA_def_cfa_offset */
75 .byte 0x10 /* RA at offset 16 now */
76 .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */
77 /* Finally the epilogue. */
78 .byte 0x04 /* DW_CFA_advance_loc4 */
79 .long .Lpop_ebp-.Lenter_kernel
80 .byte 0x0e /* DW_CFA_def_cfa_offset */
81 .byte 0x12 /* RA at offset 12 now */
82 .byte 0xc5 /* DW_CFA_restore %ebp */
83 .byte 0x04 /* DW_CFA_advance_loc4 */
84 .long .Lpop_edx-.Lpop_ebp
85 .byte 0x0e /* DW_CFA_def_cfa_offset */
86 .byte 0x08 /* RA at offset 8 now */
87 .byte 0x04 /* DW_CFA_advance_loc4 */
88 .long .Lpop_ecx-.Lpop_edx
89 .byte 0x0e /* DW_CFA_def_cfa_offset */
90 .byte 0x04 /* RA at offset 4 now */
91 .align 4
92.LENDFDE1:
93
94#define SYSCALL_ENTER_KERNEL int $0x80
95#include "vsyscall-sigreturn.S"
diff --git a/arch/x86/ia32/vsyscall.lds b/arch/x86/ia32/vsyscall.lds
deleted file mode 100644
index 1dc86ff5bcb9..000000000000
--- a/arch/x86/ia32/vsyscall.lds
+++ /dev/null
@@ -1,80 +0,0 @@
1/*
2 * Linker script for vsyscall DSO. The vsyscall page is an ELF shared
3 * object prelinked to its virtual address. This script controls its layout.
4 */
5
6/* This must match <asm/fixmap.h>. */
7VSYSCALL_BASE = 0xffffe000;
8
9SECTIONS
10{
11 . = VSYSCALL_BASE + SIZEOF_HEADERS;
12
13 .hash : { *(.hash) } :text
14 .gnu.hash : { *(.gnu.hash) }
15 .dynsym : { *(.dynsym) }
16 .dynstr : { *(.dynstr) }
17 .gnu.version : { *(.gnu.version) }
18 .gnu.version_d : { *(.gnu.version_d) }
19 .gnu.version_r : { *(.gnu.version_r) }
20
21 /* This linker script is used both with -r and with -shared.
22 For the layouts to match, we need to skip more than enough
23 space for the dynamic symbol table et al. If this amount
24 is insufficient, ld -shared will barf. Just increase it here. */
25 . = VSYSCALL_BASE + 0x400;
26
27 .text.vsyscall : { *(.text.vsyscall) } :text =0x90909090
28
29 /* This is an 32bit object and we cannot easily get the offsets
30 into the 64bit kernel. Just hardcode them here. This assumes
31 that all the stubs don't need more than 0x100 bytes. */
32 . = VSYSCALL_BASE + 0x500;
33
34 .text.sigreturn : { *(.text.sigreturn) } :text =0x90909090
35
36 . = VSYSCALL_BASE + 0x600;
37
38 .text.rtsigreturn : { *(.text.rtsigreturn) } :text =0x90909090
39
40 .note : { *(.note.*) } :text :note
41 .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
42 .eh_frame : { KEEP (*(.eh_frame)) } :text
43 .dynamic : { *(.dynamic) } :text :dynamic
44 .useless : {
45 *(.got.plt) *(.got)
46 *(.data .data.* .gnu.linkonce.d.*)
47 *(.dynbss)
48 *(.bss .bss.* .gnu.linkonce.b.*)
49 } :text
50}
51
52/*
53 * We must supply the ELF program headers explicitly to get just one
54 * PT_LOAD segment, and set the flags explicitly to make segments read-only.
55 */
56PHDRS
57{
58 text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
59 dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
60 note PT_NOTE FLAGS(4); /* PF_R */
61 eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */
62}
63
64/*
65 * This controls what symbols we export from the DSO.
66 */
67VERSION
68{
69 LINUX_2.5 {
70 global:
71 __kernel_vsyscall;
72 __kernel_sigreturn;
73 __kernel_rt_sigreturn;
74
75 local: *;
76 };
77}
78
79/* The ELF entry point can be used to set the AT_SYSINFO value. */
80ENTRY(__kernel_vsyscall);
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 38573340b143..6f813009d44b 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -1,9 +1,91 @@
1ifeq ($(CONFIG_X86_32),y) 1#
2include ${srctree}/arch/x86/kernel/Makefile_32 2# Makefile for the linux kernel.
3else 3#
4include ${srctree}/arch/x86/kernel/Makefile_64 4
5extra-y := head_$(BITS).o init_task.o vmlinux.lds
6extra-$(CONFIG_X86_64) += head64.o
7
8CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
9CFLAGS_vsyscall_64.o := $(PROFILING) -g0
10
11obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
12obj-y += traps_$(BITS).o irq_$(BITS).o
13obj-y += time_$(BITS).o ioport.o ldt.o
14obj-y += setup_$(BITS).o i8259_$(BITS).o
15obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
16obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
17obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o
18obj-y += pci-dma_$(BITS).o bootflag.o e820_$(BITS).o
19obj-y += quirks.o i8237.o topology.o kdebugfs.o
20obj-y += alternative.o i8253.o
21obj-$(CONFIG_X86_64) += pci-nommu_64.o bugs_64.o
22obj-y += tsc_$(BITS).o io_delay.o rtc.o
23
24obj-y += i387.o
25obj-y += ptrace.o
26obj-y += ds.o
27obj-$(CONFIG_X86_32) += tls.o
28obj-$(CONFIG_IA32_EMULATION) += tls.o
29obj-y += step.o
30obj-$(CONFIG_STACKTRACE) += stacktrace.o
31obj-y += cpu/
32obj-y += acpi/
33obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o
34obj-$(CONFIG_X86_64) += reboot.o
35obj-$(CONFIG_MCA) += mca_32.o
36obj-$(CONFIG_X86_MSR) += msr.o
37obj-$(CONFIG_X86_CPUID) += cpuid.o
38obj-$(CONFIG_MICROCODE) += microcode.o
39obj-$(CONFIG_PCI) += early-quirks.o
40obj-$(CONFIG_APM) += apm_32.o
41obj-$(CONFIG_X86_SMP) += smp_$(BITS).o smpboot_$(BITS).o tsc_sync.o
42obj-$(CONFIG_X86_32_SMP) += smpcommon_32.o
43obj-$(CONFIG_X86_64_SMP) += smp_64.o smpboot_64.o tsc_sync.o
44obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
45obj-$(CONFIG_X86_MPPARSE) += mpparse_$(BITS).o
46obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi_$(BITS).o
47obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o
48obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
49obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
50obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
51obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
52obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
53obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o
54obj-$(CONFIG_X86_VSMP) += vsmp_64.o
55obj-$(CONFIG_KPROBES) += kprobes.o
56obj-$(CONFIG_MODULES) += module_$(BITS).o
57obj-$(CONFIG_ACPI_SRAT) += srat_32.o
58obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
59obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
60obj-$(CONFIG_VM86) += vm86_32.o
61obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
62
63obj-$(CONFIG_HPET_TIMER) += hpet.o
64
65obj-$(CONFIG_K8_NB) += k8.o
66obj-$(CONFIG_MGEODE_LX) += geode_32.o mfgpt_32.o
67obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
68obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
69
70obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
71obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
72
73ifdef CONFIG_INPUT_PCSPKR
74obj-y += pcspeaker.o
5endif 75endif
6 76
7# Workaround to delete .lds files with make clean 77obj-$(CONFIG_SCx200) += scx200_32.o
8# The problem is that we do not enter Makefile_32 with make clean. 78
9clean-files := vsyscall*.lds vsyscall*.so 79###
80# 64 bit specific files
81ifeq ($(CONFIG_X86_64),y)
82 obj-y += genapic_64.o genapic_flat_64.o
83 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
84 obj-$(CONFIG_AUDIT) += audit_64.o
85 obj-$(CONFIG_PM) += suspend_64.o
86 obj-$(CONFIG_HIBERNATION) += suspend_asm_64.o
87
88 obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
89 obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
90 obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o
91endif
diff --git a/arch/x86/kernel/Makefile_32 b/arch/x86/kernel/Makefile_32
deleted file mode 100644
index a7bc93c27662..000000000000
--- a/arch/x86/kernel/Makefile_32
+++ /dev/null
@@ -1,88 +0,0 @@
1#
2# Makefile for the linux kernel.
3#
4
5extra-y := head_32.o init_task.o vmlinux.lds
6CPPFLAGS_vmlinux.lds += -Ui386
7
8obj-y := process_32.o signal_32.o entry_32.o traps_32.o irq_32.o \
9 ptrace_32.o time_32.o ioport_32.o ldt_32.o setup_32.o i8259_32.o sys_i386_32.o \
10 pci-dma_32.o i386_ksyms_32.o i387_32.o bootflag.o e820_32.o\
11 quirks.o i8237.o topology.o alternative.o i8253.o tsc_32.o
12
13obj-$(CONFIG_STACKTRACE) += stacktrace.o
14obj-y += cpu/
15obj-y += acpi/
16obj-$(CONFIG_X86_BIOS_REBOOT) += reboot_32.o
17obj-$(CONFIG_MCA) += mca_32.o
18obj-$(CONFIG_X86_MSR) += msr.o
19obj-$(CONFIG_X86_CPUID) += cpuid.o
20obj-$(CONFIG_MICROCODE) += microcode.o
21obj-$(CONFIG_PCI) += early-quirks.o
22obj-$(CONFIG_APM) += apm_32.o
23obj-$(CONFIG_X86_SMP) += smp_32.o smpboot_32.o tsc_sync.o
24obj-$(CONFIG_SMP) += smpcommon_32.o
25obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_32.o
26obj-$(CONFIG_X86_MPPARSE) += mpparse_32.o
27obj-$(CONFIG_X86_LOCAL_APIC) += apic_32.o nmi_32.o
28obj-$(CONFIG_X86_IO_APIC) += io_apic_32.o
29obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
30obj-$(CONFIG_KEXEC) += machine_kexec_32.o relocate_kernel_32.o crash.o
31obj-$(CONFIG_CRASH_DUMP) += crash_dump_32.o
32obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
33obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o
34obj-$(CONFIG_KPROBES) += kprobes_32.o
35obj-$(CONFIG_MODULES) += module_32.o
36obj-y += sysenter_32.o vsyscall_32.o
37obj-$(CONFIG_ACPI_SRAT) += srat_32.o
38obj-$(CONFIG_EFI) += efi_32.o efi_stub_32.o
39obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
40obj-$(CONFIG_VM86) += vm86_32.o
41obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
42obj-$(CONFIG_HPET_TIMER) += hpet.o
43obj-$(CONFIG_K8_NB) += k8.o
44obj-$(CONFIG_MGEODE_LX) += geode_32.o mfgpt_32.o
45
46obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
47obj-$(CONFIG_PARAVIRT) += paravirt_32.o
48obj-y += pcspeaker.o
49
50obj-$(CONFIG_SCx200) += scx200_32.o
51
52# vsyscall_32.o contains the vsyscall DSO images as __initdata.
53# We must build both images before we can assemble it.
54# Note: kbuild does not track this dependency due to usage of .incbin
55$(obj)/vsyscall_32.o: $(obj)/vsyscall-int80_32.so $(obj)/vsyscall-sysenter_32.so
56targets += $(foreach F,int80 sysenter,vsyscall-$F_32.o vsyscall-$F_32.so)
57targets += vsyscall-note_32.o vsyscall_32.lds
58
59# The DSO images are built using a special linker script.
60quiet_cmd_syscall = SYSCALL $@
61 cmd_syscall = $(CC) -m elf_i386 -nostdlib $(SYSCFLAGS_$(@F)) \
62 -Wl,-T,$(filter-out FORCE,$^) -o $@
63
64export CPPFLAGS_vsyscall_32.lds += -P -C -Ui386
65
66vsyscall-flags = -shared -s -Wl,-soname=linux-gate.so.1 \
67 $(call ld-option, -Wl$(comma)--hash-style=sysv)
68SYSCFLAGS_vsyscall-sysenter_32.so = $(vsyscall-flags)
69SYSCFLAGS_vsyscall-int80_32.so = $(vsyscall-flags)
70
71$(obj)/vsyscall-int80_32.so $(obj)/vsyscall-sysenter_32.so: \
72$(obj)/vsyscall-%.so: $(src)/vsyscall_32.lds \
73 $(obj)/vsyscall-%.o $(obj)/vsyscall-note_32.o FORCE
74 $(call if_changed,syscall)
75
76# We also create a special relocatable object that should mirror the symbol
77# table and layout of the linked DSO. With ld -R we can then refer to
78# these symbols in the kernel code rather than hand-coded addresses.
79extra-y += vsyscall-syms.o
80$(obj)/built-in.o: $(obj)/vsyscall-syms.o
81$(obj)/built-in.o: ld_flags += -R $(obj)/vsyscall-syms.o
82
83SYSCFLAGS_vsyscall-syms.o = -r
84$(obj)/vsyscall-syms.o: $(src)/vsyscall_32.lds \
85 $(obj)/vsyscall-sysenter_32.o $(obj)/vsyscall-note_32.o FORCE
86 $(call if_changed,syscall)
87
88
diff --git a/arch/x86/kernel/Makefile_64 b/arch/x86/kernel/Makefile_64
deleted file mode 100644
index 5a88890d8ee9..000000000000
--- a/arch/x86/kernel/Makefile_64
+++ /dev/null
@@ -1,45 +0,0 @@
1#
2# Makefile for the linux kernel.
3#
4
5extra-y := head_64.o head64.o init_task.o vmlinux.lds
6CPPFLAGS_vmlinux.lds += -Ux86_64
7EXTRA_AFLAGS := -traditional
8
9obj-y := process_64.o signal_64.o entry_64.o traps_64.o irq_64.o \
10 ptrace_64.o time_64.o ioport_64.o ldt_64.o setup_64.o i8259_64.o sys_x86_64.o \
11 x8664_ksyms_64.o i387_64.o syscall_64.o vsyscall_64.o \
12 setup64.o bootflag.o e820_64.o reboot_64.o quirks.o i8237.o \
13 pci-dma_64.o pci-nommu_64.o alternative.o hpet.o tsc_64.o bugs_64.o \
14 i8253.o
15
16obj-$(CONFIG_STACKTRACE) += stacktrace.o
17obj-y += cpu/
18obj-y += acpi/
19obj-$(CONFIG_X86_MSR) += msr.o
20obj-$(CONFIG_MICROCODE) += microcode.o
21obj-$(CONFIG_X86_CPUID) += cpuid.o
22obj-$(CONFIG_SMP) += smp_64.o smpboot_64.o trampoline_64.o tsc_sync.o
23obj-y += apic_64.o nmi_64.o
24obj-y += io_apic_64.o mpparse_64.o genapic_64.o genapic_flat_64.o
25obj-$(CONFIG_KEXEC) += machine_kexec_64.o relocate_kernel_64.o crash.o
26obj-$(CONFIG_CRASH_DUMP) += crash_dump_64.o
27obj-$(CONFIG_PM) += suspend_64.o
28obj-$(CONFIG_HIBERNATION) += suspend_asm_64.o
29obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
30obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
31obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
32obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o
33obj-$(CONFIG_KPROBES) += kprobes_64.o
34obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
35obj-$(CONFIG_X86_VSMP) += vsmp_64.o
36obj-$(CONFIG_K8_NB) += k8.o
37obj-$(CONFIG_AUDIT) += audit_64.o
38
39obj-$(CONFIG_MODULES) += module_64.o
40obj-$(CONFIG_PCI) += early-quirks.o
41
42obj-y += topology.o
43obj-y += pcspeaker.o
44
45CFLAGS_vsyscall_64.o := $(PROFILING) -g0
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index 1351c3982ee4..19d3d6e9d09b 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -1,5 +1,5 @@
1obj-$(CONFIG_ACPI) += boot.o 1obj-$(CONFIG_ACPI) += boot.o
2obj-$(CONFIG_ACPI_SLEEP) += sleep_$(BITS).o wakeup_$(BITS).o 2obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
3 3
4ifneq ($(CONFIG_ACPI_PROCESSOR),) 4ifneq ($(CONFIG_ACPI_PROCESSOR),)
5obj-y += cstate.o processor.o 5obj-y += cstate.o processor.o
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
new file mode 100644
index 000000000000..6bc815cd8cb3
--- /dev/null
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -0,0 +1,87 @@
1/*
2 * sleep.c - x86-specific ACPI sleep support.
3 *
4 * Copyright (C) 2001-2003 Patrick Mochel
5 * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
6 */
7
8#include <linux/acpi.h>
9#include <linux/bootmem.h>
10#include <linux/dmi.h>
11#include <linux/cpumask.h>
12
13#include <asm/smp.h>
14
15/* address in low memory of the wakeup routine. */
16unsigned long acpi_wakeup_address = 0;
17unsigned long acpi_realmode_flags;
18extern char wakeup_start, wakeup_end;
19
20extern unsigned long acpi_copy_wakeup_routine(unsigned long);
21
22/**
23 * acpi_save_state_mem - save kernel state
24 *
25 * Create an identity mapped page table and copy the wakeup routine to
26 * low memory.
27 */
28int acpi_save_state_mem(void)
29{
30 if (!acpi_wakeup_address) {
31 printk(KERN_ERR "Could not allocate memory during boot, S3 disabled\n");
32 return -ENOMEM;
33 }
34 memcpy((void *)acpi_wakeup_address, &wakeup_start,
35 &wakeup_end - &wakeup_start);
36 acpi_copy_wakeup_routine(acpi_wakeup_address);
37
38 return 0;
39}
40
41/*
42 * acpi_restore_state - undo effects of acpi_save_state_mem
43 */
44void acpi_restore_state_mem(void)
45{
46}
47
48
49/**
50 * acpi_reserve_bootmem - do _very_ early ACPI initialisation
51 *
52 * We allocate a page from the first 1MB of memory for the wakeup
53 * routine for when we come back from a sleep state. The
54 * runtime allocator allows specification of <16MB pages, but not
55 * <1MB pages.
56 */
57void __init acpi_reserve_bootmem(void)
58{
59 if ((&wakeup_end - &wakeup_start) > PAGE_SIZE*2) {
60 printk(KERN_ERR
61 "ACPI: Wakeup code way too big, S3 disabled.\n");
62 return;
63 }
64
65 acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
66 if (!acpi_wakeup_address)
67 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
68}
69
70
71static int __init acpi_sleep_setup(char *str)
72{
73 while ((str != NULL) && (*str != '\0')) {
74 if (strncmp(str, "s3_bios", 7) == 0)
75 acpi_realmode_flags |= 1;
76 if (strncmp(str, "s3_mode", 7) == 0)
77 acpi_realmode_flags |= 2;
78 if (strncmp(str, "s3_beep", 7) == 0)
79 acpi_realmode_flags |= 4;
80 str = strchr(str, ',');
81 if (str != NULL)
82 str += strspn(str, ", \t");
83 }
84 return 1;
85}
86
87__setup("acpi_sleep=", acpi_sleep_setup);
diff --git a/arch/x86/kernel/acpi/sleep_32.c b/arch/x86/kernel/acpi/sleep_32.c
index 10699489cfe7..63fe5525e026 100644
--- a/arch/x86/kernel/acpi/sleep_32.c
+++ b/arch/x86/kernel/acpi/sleep_32.c
@@ -12,76 +12,6 @@
12 12
13#include <asm/smp.h> 13#include <asm/smp.h>
14 14
15/* address in low memory of the wakeup routine. */
16unsigned long acpi_wakeup_address = 0;
17unsigned long acpi_realmode_flags;
18extern char wakeup_start, wakeup_end;
19
20extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
21
22/**
23 * acpi_save_state_mem - save kernel state
24 *
25 * Create an identity mapped page table and copy the wakeup routine to
26 * low memory.
27 */
28int acpi_save_state_mem(void)
29{
30 if (!acpi_wakeup_address)
31 return 1;
32 memcpy((void *)acpi_wakeup_address, &wakeup_start,
33 &wakeup_end - &wakeup_start);
34 acpi_copy_wakeup_routine(acpi_wakeup_address);
35
36 return 0;
37}
38
39/*
40 * acpi_restore_state - undo effects of acpi_save_state_mem
41 */
42void acpi_restore_state_mem(void)
43{
44}
45
46/**
47 * acpi_reserve_bootmem - do _very_ early ACPI initialisation
48 *
49 * We allocate a page from the first 1MB of memory for the wakeup
50 * routine for when we come back from a sleep state. The
51 * runtime allocator allows specification of <16MB pages, but not
52 * <1MB pages.
53 */
54void __init acpi_reserve_bootmem(void)
55{
56 if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) {
57 printk(KERN_ERR
58 "ACPI: Wakeup code way too big, S3 disabled.\n");
59 return;
60 }
61
62 acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
63 if (!acpi_wakeup_address)
64 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
65}
66
67static int __init acpi_sleep_setup(char *str)
68{
69 while ((str != NULL) && (*str != '\0')) {
70 if (strncmp(str, "s3_bios", 7) == 0)
71 acpi_realmode_flags |= 1;
72 if (strncmp(str, "s3_mode", 7) == 0)
73 acpi_realmode_flags |= 2;
74 if (strncmp(str, "s3_beep", 7) == 0)
75 acpi_realmode_flags |= 4;
76 str = strchr(str, ',');
77 if (str != NULL)
78 str += strspn(str, ", \t");
79 }
80 return 1;
81}
82
83__setup("acpi_sleep=", acpi_sleep_setup);
84
85/* Ouch, we want to delete this. We already have better version in userspace, in 15/* Ouch, we want to delete this. We already have better version in userspace, in
86 s2ram from suspend.sf.net project */ 16 s2ram from suspend.sf.net project */
87static __init int reset_videomode_after_s3(const struct dmi_system_id *d) 17static __init int reset_videomode_after_s3(const struct dmi_system_id *d)
diff --git a/arch/x86/kernel/acpi/sleep_64.c b/arch/x86/kernel/acpi/sleep_64.c
deleted file mode 100644
index da42de261ba8..000000000000
--- a/arch/x86/kernel/acpi/sleep_64.c
+++ /dev/null
@@ -1,117 +0,0 @@
1/*
2 * acpi.c - Architecture-Specific Low-Level ACPI Support
3 *
4 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
5 * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
6 * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
7 * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port)
8 * Copyright (C) 2003 Pavel Machek, SuSE Labs
9 *
10 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25 *
26 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
27 */
28
29#include <linux/kernel.h>
30#include <linux/init.h>
31#include <linux/types.h>
32#include <linux/stddef.h>
33#include <linux/slab.h>
34#include <linux/pci.h>
35#include <linux/bootmem.h>
36#include <linux/acpi.h>
37#include <linux/cpumask.h>
38
39#include <asm/mpspec.h>
40#include <asm/io.h>
41#include <asm/apic.h>
42#include <asm/apicdef.h>
43#include <asm/page.h>
44#include <asm/pgtable.h>
45#include <asm/pgalloc.h>
46#include <asm/io_apic.h>
47#include <asm/proto.h>
48#include <asm/tlbflush.h>
49
50/* --------------------------------------------------------------------------
51 Low-Level Sleep Support
52 -------------------------------------------------------------------------- */
53
54/* address in low memory of the wakeup routine. */
55unsigned long acpi_wakeup_address = 0;
56unsigned long acpi_realmode_flags;
57extern char wakeup_start, wakeup_end;
58
59extern unsigned long acpi_copy_wakeup_routine(unsigned long);
60
61/**
62 * acpi_save_state_mem - save kernel state
63 *
64 * Create an identity mapped page table and copy the wakeup routine to
65 * low memory.
66 */
67int acpi_save_state_mem(void)
68{
69 memcpy((void *)acpi_wakeup_address, &wakeup_start,
70 &wakeup_end - &wakeup_start);
71 acpi_copy_wakeup_routine(acpi_wakeup_address);
72
73 return 0;
74}
75
76/*
77 * acpi_restore_state
78 */
79void acpi_restore_state_mem(void)
80{
81}
82
83/**
84 * acpi_reserve_bootmem - do _very_ early ACPI initialisation
85 *
86 * We allocate a page in low memory for the wakeup
87 * routine for when we come back from a sleep state. The
88 * runtime allocator allows specification of <16M pages, but not
89 * <1M pages.
90 */
91void __init acpi_reserve_bootmem(void)
92{
93 acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
94 if ((&wakeup_end - &wakeup_start) > (PAGE_SIZE*2))
95 printk(KERN_CRIT
96 "ACPI: Wakeup code way too big, will crash on attempt"
97 " to suspend\n");
98}
99
100static int __init acpi_sleep_setup(char *str)
101{
102 while ((str != NULL) && (*str != '\0')) {
103 if (strncmp(str, "s3_bios", 7) == 0)
104 acpi_realmode_flags |= 1;
105 if (strncmp(str, "s3_mode", 7) == 0)
106 acpi_realmode_flags |= 2;
107 if (strncmp(str, "s3_beep", 7) == 0)
108 acpi_realmode_flags |= 4;
109 str = strchr(str, ',');
110 if (str != NULL)
111 str += strspn(str, ", \t");
112 }
113 return 1;
114}
115
116__setup("acpi_sleep=", acpi_sleep_setup);
117
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
index 1e931aaf2ef6..f53e3277f8e5 100644
--- a/arch/x86/kernel/acpi/wakeup_32.S
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -1,4 +1,4 @@
1.text 1 .section .text.page_aligned
2#include <linux/linkage.h> 2#include <linux/linkage.h>
3#include <asm/segment.h> 3#include <asm/segment.h>
4#include <asm/page.h> 4#include <asm/page.h>
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 5ed3bc5c61d7..2e1b9e0d0767 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -344,13 +344,13 @@ do_suspend_lowlevel:
344 call save_processor_state 344 call save_processor_state
345 345
346 movq $saved_context, %rax 346 movq $saved_context, %rax
347 movq %rsp, pt_regs_rsp(%rax) 347 movq %rsp, pt_regs_sp(%rax)
348 movq %rbp, pt_regs_rbp(%rax) 348 movq %rbp, pt_regs_bp(%rax)
349 movq %rsi, pt_regs_rsi(%rax) 349 movq %rsi, pt_regs_si(%rax)
350 movq %rdi, pt_regs_rdi(%rax) 350 movq %rdi, pt_regs_di(%rax)
351 movq %rbx, pt_regs_rbx(%rax) 351 movq %rbx, pt_regs_bx(%rax)
352 movq %rcx, pt_regs_rcx(%rax) 352 movq %rcx, pt_regs_cx(%rax)
353 movq %rdx, pt_regs_rdx(%rax) 353 movq %rdx, pt_regs_dx(%rax)
354 movq %r8, pt_regs_r8(%rax) 354 movq %r8, pt_regs_r8(%rax)
355 movq %r9, pt_regs_r9(%rax) 355 movq %r9, pt_regs_r9(%rax)
356 movq %r10, pt_regs_r10(%rax) 356 movq %r10, pt_regs_r10(%rax)
@@ -360,7 +360,7 @@ do_suspend_lowlevel:
360 movq %r14, pt_regs_r14(%rax) 360 movq %r14, pt_regs_r14(%rax)
361 movq %r15, pt_regs_r15(%rax) 361 movq %r15, pt_regs_r15(%rax)
362 pushfq 362 pushfq
363 popq pt_regs_eflags(%rax) 363 popq pt_regs_flags(%rax)
364 364
365 movq $.L97, saved_rip(%rip) 365 movq $.L97, saved_rip(%rip)
366 366
@@ -391,15 +391,15 @@ do_suspend_lowlevel:
391 movq %rbx, %cr2 391 movq %rbx, %cr2
392 movq saved_context_cr0(%rax), %rbx 392 movq saved_context_cr0(%rax), %rbx
393 movq %rbx, %cr0 393 movq %rbx, %cr0
394 pushq pt_regs_eflags(%rax) 394 pushq pt_regs_flags(%rax)
395 popfq 395 popfq
396 movq pt_regs_rsp(%rax), %rsp 396 movq pt_regs_sp(%rax), %rsp
397 movq pt_regs_rbp(%rax), %rbp 397 movq pt_regs_bp(%rax), %rbp
398 movq pt_regs_rsi(%rax), %rsi 398 movq pt_regs_si(%rax), %rsi
399 movq pt_regs_rdi(%rax), %rdi 399 movq pt_regs_di(%rax), %rdi
400 movq pt_regs_rbx(%rax), %rbx 400 movq pt_regs_bx(%rax), %rbx
401 movq pt_regs_rcx(%rax), %rcx 401 movq pt_regs_cx(%rax), %rcx
402 movq pt_regs_rdx(%rax), %rdx 402 movq pt_regs_dx(%rax), %rdx
403 movq pt_regs_r8(%rax), %r8 403 movq pt_regs_r8(%rax), %r8
404 movq pt_regs_r9(%rax), %r9 404 movq pt_regs_r9(%rax), %r9
405 movq pt_regs_r10(%rax), %r10 405 movq pt_regs_r10(%rax), %r10
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index d6405e0842b5..45d79ea890ae 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -273,6 +273,7 @@ struct smp_alt_module {
273}; 273};
274static LIST_HEAD(smp_alt_modules); 274static LIST_HEAD(smp_alt_modules);
275static DEFINE_SPINLOCK(smp_alt); 275static DEFINE_SPINLOCK(smp_alt);
276static int smp_mode = 1; /* protected by smp_alt */
276 277
277void alternatives_smp_module_add(struct module *mod, char *name, 278void alternatives_smp_module_add(struct module *mod, char *name,
278 void *locks, void *locks_end, 279 void *locks, void *locks_end,
@@ -341,12 +342,13 @@ void alternatives_smp_switch(int smp)
341 342
342#ifdef CONFIG_LOCKDEP 343#ifdef CONFIG_LOCKDEP
343 /* 344 /*
344 * A not yet fixed binutils section handling bug prevents 345 * Older binutils section handling bug prevented
345 * alternatives-replacement from working reliably, so turn 346 * alternatives-replacement from working reliably.
346 * it off: 347 *
348 * If this still occurs then you should see a hang
349 * or crash shortly after this line:
347 */ 350 */
348 printk("lockdep: not fixing up alternatives.\n"); 351 printk("lockdep: fixing up alternatives.\n");
349 return;
350#endif 352#endif
351 353
352 if (noreplace_smp || smp_alt_once) 354 if (noreplace_smp || smp_alt_once)
@@ -354,21 +356,29 @@ void alternatives_smp_switch(int smp)
354 BUG_ON(!smp && (num_online_cpus() > 1)); 356 BUG_ON(!smp && (num_online_cpus() > 1));
355 357
356 spin_lock_irqsave(&smp_alt, flags); 358 spin_lock_irqsave(&smp_alt, flags);
357 if (smp) { 359
360 /*
361 * Avoid unnecessary switches because it forces JIT based VMs to
362 * throw away all cached translations, which can be quite costly.
363 */
364 if (smp == smp_mode) {
365 /* nothing */
366 } else if (smp) {
358 printk(KERN_INFO "SMP alternatives: switching to SMP code\n"); 367 printk(KERN_INFO "SMP alternatives: switching to SMP code\n");
359 clear_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); 368 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
360 clear_bit(X86_FEATURE_UP, cpu_data(0).x86_capability); 369 clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
361 list_for_each_entry(mod, &smp_alt_modules, next) 370 list_for_each_entry(mod, &smp_alt_modules, next)
362 alternatives_smp_lock(mod->locks, mod->locks_end, 371 alternatives_smp_lock(mod->locks, mod->locks_end,
363 mod->text, mod->text_end); 372 mod->text, mod->text_end);
364 } else { 373 } else {
365 printk(KERN_INFO "SMP alternatives: switching to UP code\n"); 374 printk(KERN_INFO "SMP alternatives: switching to UP code\n");
366 set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); 375 set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
367 set_bit(X86_FEATURE_UP, cpu_data(0).x86_capability); 376 set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
368 list_for_each_entry(mod, &smp_alt_modules, next) 377 list_for_each_entry(mod, &smp_alt_modules, next)
369 alternatives_smp_unlock(mod->locks, mod->locks_end, 378 alternatives_smp_unlock(mod->locks, mod->locks_end,
370 mod->text, mod->text_end); 379 mod->text, mod->text_end);
371 } 380 }
381 smp_mode = smp;
372 spin_unlock_irqrestore(&smp_alt, flags); 382 spin_unlock_irqrestore(&smp_alt, flags);
373} 383}
374 384
@@ -431,8 +441,9 @@ void __init alternative_instructions(void)
431 if (smp_alt_once) { 441 if (smp_alt_once) {
432 if (1 == num_possible_cpus()) { 442 if (1 == num_possible_cpus()) {
433 printk(KERN_INFO "SMP alternatives: switching to UP code\n"); 443 printk(KERN_INFO "SMP alternatives: switching to UP code\n");
434 set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); 444 set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
435 set_bit(X86_FEATURE_UP, cpu_data(0).x86_capability); 445 set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
446
436 alternatives_smp_unlock(__smp_locks, __smp_locks_end, 447 alternatives_smp_unlock(__smp_locks, __smp_locks_end,
437 _text, _etext); 448 _text, _etext);
438 } 449 }
@@ -440,7 +451,10 @@ void __init alternative_instructions(void)
440 alternatives_smp_module_add(NULL, "core kernel", 451 alternatives_smp_module_add(NULL, "core kernel",
441 __smp_locks, __smp_locks_end, 452 __smp_locks, __smp_locks_end,
442 _text, _etext); 453 _text, _etext);
443 alternatives_smp_switch(0); 454
455 /* Only switch to UP mode if we don't immediately boot others */
456 if (num_possible_cpus() == 1 || setup_max_cpus <= 1)
457 alternatives_smp_switch(0);
444 } 458 }
445#endif 459#endif
446 apply_paravirt(__parainstructions, __parainstructions_end); 460 apply_paravirt(__parainstructions, __parainstructions_end);
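
The new smp_mode variable (together with the setup_max_cpus check in alternative_instructions()) makes the lock-prefix patching edge-triggered: text is only rewritten when the requested mode actually differs from the current one, since every rewrite forces JIT-based VMs to drop their cached translations. A stripped-down model of that guard, with a stand-in patch_text() in place of the real patching:

#include <stdio.h>

static int smp_mode = 1;	/* current state; protected by a lock in the real code */

static void patch_text(int smp)	/* stand-in for the actual SMP/UP patching */
{
	printf("patching text for %s\n", smp ? "SMP" : "UP");
}

static void smp_switch(int smp)
{
	if (smp == smp_mode)
		return;		/* no change: skip the costly rewrite */
	patch_text(smp);
	smp_mode = smp;
}

int main(void)
{
	smp_switch(0);		/* SMP -> UP: patches once     */
	smp_switch(0);		/* already UP: nothing happens */
	return 0;
}
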
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 5b6992799c9d..608152a2a05e 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -1,12 +1,12 @@
1/* 1/*
2 * Firmware replacement code. 2 * Firmware replacement code.
3 * 3 *
4 * Work around broken BIOSes that don't set an aperture or only set the 4 * Work around broken BIOSes that don't set an aperture or only set the
5 * aperture in the AGP bridge. 5 * aperture in the AGP bridge.
6 * If all fails map the aperture over some low memory. This is cheaper than 6 * If all fails map the aperture over some low memory. This is cheaper than
7 * doing bounce buffering. The memory is lost. This is done at early boot 7 * doing bounce buffering. The memory is lost. This is done at early boot
8 * because only the bootmem allocator can allocate 32+MB. 8 * because only the bootmem allocator can allocate 32+MB.
9 * 9 *
10 * Copyright 2002 Andi Kleen, SuSE Labs. 10 * Copyright 2002 Andi Kleen, SuSE Labs.
11 */ 11 */
12#include <linux/kernel.h> 12#include <linux/kernel.h>
@@ -30,7 +30,7 @@ int gart_iommu_aperture_disabled __initdata = 0;
30int gart_iommu_aperture_allowed __initdata = 0; 30int gart_iommu_aperture_allowed __initdata = 0;
31 31
32int fallback_aper_order __initdata = 1; /* 64MB */ 32int fallback_aper_order __initdata = 1; /* 64MB */
33int fallback_aper_force __initdata = 0; 33int fallback_aper_force __initdata = 0;
34 34
35int fix_aperture __initdata = 1; 35int fix_aperture __initdata = 1;
36 36
@@ -49,167 +49,270 @@ static void __init insert_aperture_resource(u32 aper_base, u32 aper_size)
49/* This code runs before the PCI subsystem is initialized, so just 49/* This code runs before the PCI subsystem is initialized, so just
50 access the northbridge directly. */ 50 access the northbridge directly. */
51 51
52static u32 __init allocate_aperture(void) 52static u32 __init allocate_aperture(void)
53{ 53{
54 u32 aper_size; 54 u32 aper_size;
55 void *p; 55 void *p;
56 56
57 if (fallback_aper_order > 7) 57 if (fallback_aper_order > 7)
58 fallback_aper_order = 7; 58 fallback_aper_order = 7;
59 aper_size = (32 * 1024 * 1024) << fallback_aper_order; 59 aper_size = (32 * 1024 * 1024) << fallback_aper_order;
60 60
61 /* 61 /*
62 * Aperture has to be naturally aligned. This means an 2GB aperture won't 62 * Aperture has to be naturally aligned. This means a 2GB aperture
63 * have much chance of finding a place in the lower 4GB of memory. 63 * won't have much chance of finding a place in the lower 4GB of
64 * Unfortunately we cannot move it up because that would make the 64 * memory. Unfortunately we cannot move it up because that would
65 * IOMMU useless. 65 * make the IOMMU useless.
66 */ 66 */
67 p = __alloc_bootmem_nopanic(aper_size, aper_size, 0); 67 p = __alloc_bootmem_nopanic(aper_size, aper_size, 0);
68 if (!p || __pa(p)+aper_size > 0xffffffff) { 68 if (!p || __pa(p)+aper_size > 0xffffffff) {
69 printk("Cannot allocate aperture memory hole (%p,%uK)\n", 69 printk(KERN_ERR
70 p, aper_size>>10); 70 "Cannot allocate aperture memory hole (%p,%uK)\n",
71 p, aper_size>>10);
71 if (p) 72 if (p)
72 free_bootmem(__pa(p), aper_size); 73 free_bootmem(__pa(p), aper_size);
73 return 0; 74 return 0;
74 } 75 }
75 printk("Mapping aperture over %d KB of RAM @ %lx\n", 76 printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n",
76 aper_size >> 10, __pa(p)); 77 aper_size >> 10, __pa(p));
77 insert_aperture_resource((u32)__pa(p), aper_size); 78 insert_aperture_resource((u32)__pa(p), aper_size);
78 return (u32)__pa(p); 79
80 return (u32)__pa(p);
79} 81}
80 82
81static int __init aperture_valid(u64 aper_base, u32 aper_size) 83static int __init aperture_valid(u64 aper_base, u32 aper_size)
82{ 84{
83 if (!aper_base) 85 if (!aper_base)
84 return 0;
85 if (aper_size < 64*1024*1024) {
86 printk("Aperture too small (%d MB)\n", aper_size>>20);
87 return 0; 86 return 0;
88 } 87
89 if (aper_base + aper_size > 0x100000000UL) { 88 if (aper_base + aper_size > 0x100000000UL) {
90 printk("Aperture beyond 4GB. Ignoring.\n"); 89 printk(KERN_ERR "Aperture beyond 4GB. Ignoring.\n");
91 return 0; 90 return 0;
92 } 91 }
93 if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { 92 if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
94 printk("Aperture pointing to e820 RAM. Ignoring.\n"); 93 printk(KERN_ERR "Aperture pointing to e820 RAM. Ignoring.\n");
95 return 0; 94 return 0;
96 } 95 }
96 if (aper_size < 64*1024*1024) {
97 printk(KERN_ERR "Aperture too small (%d MB)\n", aper_size>>20);
98 return 0;
99 }
100
97 return 1; 101 return 1;
98} 102}
99 103
100/* Find a PCI capability */ 104/* Find a PCI capability */
101static __u32 __init find_cap(int num, int slot, int func, int cap) 105static __u32 __init find_cap(int num, int slot, int func, int cap)
102{ 106{
103 u8 pos;
104 int bytes; 107 int bytes;
105 if (!(read_pci_config_16(num,slot,func,PCI_STATUS) & PCI_STATUS_CAP_LIST)) 108 u8 pos;
109
110 if (!(read_pci_config_16(num, slot, func, PCI_STATUS) &
111 PCI_STATUS_CAP_LIST))
106 return 0; 112 return 0;
107 pos = read_pci_config_byte(num,slot,func,PCI_CAPABILITY_LIST); 113
108 for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { 114 pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST);
115 for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
109 u8 id; 116 u8 id;
110 pos &= ~3; 117
111 id = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_ID); 118 pos &= ~3;
119 id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID);
112 if (id == 0xff) 120 if (id == 0xff)
113 break; 121 break;
114 if (id == cap) 122 if (id == cap)
115 return pos; 123 return pos;
116 pos = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_NEXT); 124 pos = read_pci_config_byte(num, slot, func,
117 } 125 pos+PCI_CAP_LIST_NEXT);
126 }
118 return 0; 127 return 0;
119} 128}
120 129
121/* Read a standard AGPv3 bridge header */ 130/* Read a standard AGPv3 bridge header */
122static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) 131static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
123{ 132{
124 u32 apsize; 133 u32 apsize;
125 u32 apsizereg; 134 u32 apsizereg;
126 int nbits; 135 int nbits;
127 u32 aper_low, aper_hi; 136 u32 aper_low, aper_hi;
128 u64 aper; 137 u64 aper;
129 138
130 printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func); 139 printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", num, slot, func);
131 apsizereg = read_pci_config_16(num,slot,func, cap + 0x14); 140 apsizereg = read_pci_config_16(num, slot, func, cap + 0x14);
132 if (apsizereg == 0xffffffff) { 141 if (apsizereg == 0xffffffff) {
133 printk("APSIZE in AGP bridge unreadable\n"); 142 printk(KERN_ERR "APSIZE in AGP bridge unreadable\n");
134 return 0; 143 return 0;
135 } 144 }
136 145
137 apsize = apsizereg & 0xfff; 146 apsize = apsizereg & 0xfff;
138 /* Some BIOS use weird encodings not in the AGPv3 table. */ 147 /* Some BIOS use weird encodings not in the AGPv3 table. */
139 if (apsize & 0xff) 148 if (apsize & 0xff)
140 apsize |= 0xf00; 149 apsize |= 0xf00;
141 nbits = hweight16(apsize); 150 nbits = hweight16(apsize);
142 *order = 7 - nbits; 151 *order = 7 - nbits;
143 if ((int)*order < 0) /* < 32MB */ 152 if ((int)*order < 0) /* < 32MB */
144 *order = 0; 153 *order = 0;
145 154
146 aper_low = read_pci_config(num,slot,func, 0x10); 155 aper_low = read_pci_config(num, slot, func, 0x10);
147 aper_hi = read_pci_config(num,slot,func,0x14); 156 aper_hi = read_pci_config(num, slot, func, 0x14);
148 aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32); 157 aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32);
149 158
150 printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", 159 printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n",
151 aper, 32 << *order, apsizereg); 160 aper, 32 << *order, apsizereg);
152 161
153 if (!aperture_valid(aper, (32*1024*1024) << *order)) 162 if (!aperture_valid(aper, (32*1024*1024) << *order))
154 return 0; 163 return 0;
155 return (u32)aper; 164 return (u32)aper;
156} 165}
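
The APSIZE decoding above is pure bit counting: any set bit in the low byte forces bits 8-11 on, and the aperture order is 7 minus the population count of the 12-bit field. A worked example with an arbitrary register value (not taken from real hardware):

#include <stdio.h>

int main(void)
{
	unsigned int apsize = 0xf00;			/* example value only */
	int nbits = __builtin_popcount(apsize);	/* 4 bits set         */
	unsigned int order = 7 - nbits;			/* order 3            */
	unsigned long long size = (32ULL << 20) << order;	/* 256 MB     */

	printf("order %u -> %llu MB\n", order, size >> 20);
	return 0;
}
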
157
158/* Look for an AGP bridge. Windows only expects the aperture in the
159 AGP bridge and some BIOS forget to initialize the Northbridge too.
160 Work around this here.
161
162 Do an PCI bus scan by hand because we're running before the PCI
163 subsystem.
164 166
165 All K8 AGP bridges are AGPv3 compliant, so we can do this scan 167/*
166 generically. It's probably overkill to always scan all slots because 168 * Look for an AGP bridge. Windows only expects the aperture in the
167 the AGP bridges should be always an own bus on the HT hierarchy, 169 * AGP bridge and some BIOS forget to initialize the Northbridge too.
168 but do it here for future safety. */ 170 * Work around this here.
171 *
172 * Do a PCI bus scan by hand because we're running before the PCI
173 * subsystem.
174 *
175 * All K8 AGP bridges are AGPv3 compliant, so we can do this scan
176 * generically. It's probably overkill to always scan all slots because
177 * the AGP bridges should be always an own bus on the HT hierarchy,
178 * but do it here for future safety.
179 */
169static __u32 __init search_agp_bridge(u32 *order, int *valid_agp) 180static __u32 __init search_agp_bridge(u32 *order, int *valid_agp)
170{ 181{
171 int num, slot, func; 182 int num, slot, func;
172 183
173 /* Poor man's PCI discovery */ 184 /* Poor man's PCI discovery */
174 for (num = 0; num < 256; num++) { 185 for (num = 0; num < 256; num++) {
175 for (slot = 0; slot < 32; slot++) { 186 for (slot = 0; slot < 32; slot++) {
176 for (func = 0; func < 8; func++) { 187 for (func = 0; func < 8; func++) {
177 u32 class, cap; 188 u32 class, cap;
178 u8 type; 189 u8 type;
179 class = read_pci_config(num,slot,func, 190 class = read_pci_config(num, slot, func,
180 PCI_CLASS_REVISION); 191 PCI_CLASS_REVISION);
181 if (class == 0xffffffff) 192 if (class == 0xffffffff)
182 break; 193 break;
183 194
184 switch (class >> 16) { 195 switch (class >> 16) {
185 case PCI_CLASS_BRIDGE_HOST: 196 case PCI_CLASS_BRIDGE_HOST:
186 case PCI_CLASS_BRIDGE_OTHER: /* needed? */ 197 case PCI_CLASS_BRIDGE_OTHER: /* needed? */
187 /* AGP bridge? */ 198 /* AGP bridge? */
188 cap = find_cap(num,slot,func,PCI_CAP_ID_AGP); 199 cap = find_cap(num, slot, func,
200 PCI_CAP_ID_AGP);
189 if (!cap) 201 if (!cap)
190 break; 202 break;
191 *valid_agp = 1; 203 *valid_agp = 1;
192 return read_agp(num,slot,func,cap,order); 204 return read_agp(num, slot, func, cap,
193 } 205 order);
194 206 }
207
195 /* No multi-function device? */ 208 /* No multi-function device? */
196 type = read_pci_config_byte(num,slot,func, 209 type = read_pci_config_byte(num, slot, func,
197 PCI_HEADER_TYPE); 210 PCI_HEADER_TYPE);
198 if (!(type & 0x80)) 211 if (!(type & 0x80))
199 break; 212 break;
200 } 213 }
201 } 214 }
202 } 215 }
203 printk("No AGP bridge found\n"); 216 printk(KERN_INFO "No AGP bridge found\n");
217
204 return 0; 218 return 0;
205} 219}
206 220
221static int gart_fix_e820 __initdata = 1;
222
223static int __init parse_gart_mem(char *p)
224{
225 if (!p)
226 return -EINVAL;
227
228 if (!strncmp(p, "off", 3))
229 gart_fix_e820 = 0;
230 else if (!strncmp(p, "on", 2))
231 gart_fix_e820 = 1;
232
233 return 0;
234}
235early_param("gart_fix_e820", parse_gart_mem);
236
237void __init early_gart_iommu_check(void)
238{
239 /*
240 * In case the GART was already enabled by a previous kernel, e.g. for
241 * kexec/kdump: the memset done by allocate_aperture/
242 * __alloc_bootmem_nopanic can then cause a restart, or the second
243 * kernel may put the GART hole at a different position and use the
244 * old hole as RAM while the GART programmed by the first kernel still
245 * points at it. The BIOS may also have forgotten to mark the region
246 * as reserved.
247 * Try to update the e820 map so that the region becomes reserved.
248 */
249 int fix, num;
250 u32 ctl;
251 u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
252 u64 aper_base = 0, last_aper_base = 0;
253 int aper_enabled = 0, last_aper_enabled = 0;
254
255 if (!early_pci_allowed())
256 return;
257
258 fix = 0;
259 for (num = 24; num < 32; num++) {
260 if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
261 continue;
262
263 ctl = read_pci_config(0, num, 3, 0x90);
264 aper_enabled = ctl & 1;
265 aper_order = (ctl >> 1) & 7;
266 aper_size = (32 * 1024 * 1024) << aper_order;
267 aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff;
268 aper_base <<= 25;
269
270 if ((last_aper_order && aper_order != last_aper_order) ||
271 (last_aper_base && aper_base != last_aper_base) ||
272 (last_aper_enabled && aper_enabled != last_aper_enabled)) {
273 fix = 1;
274 break;
275 }
276 last_aper_order = aper_order;
277 last_aper_base = aper_base;
278 last_aper_enabled = aper_enabled;
279 }
280
281 if (!fix && !aper_enabled)
282 return;
283
284 if (!aper_base || !aper_size || aper_base + aper_size > 0x100000000UL)
285 fix = 1;
286
287 if (gart_fix_e820 && !fix && aper_enabled) {
288 if (e820_any_mapped(aper_base, aper_base + aper_size,
289 E820_RAM)) {
290 /* reserve it, so we can reuse it in the second kernel */
291 printk(KERN_INFO "update e820 for GART\n");
292 add_memory_region(aper_base, aper_size, E820_RESERVED);
293 update_e820();
294 }
295 return;
296 }
297
298 /* different nodes have different setting, disable them all at first*/
299 for (num = 24; num < 32; num++) {
300 if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
301 continue;
302
303 ctl = read_pci_config(0, num, 3, 0x90);
304 ctl &= ~1;
305 write_pci_config(0, num, 3, 0x90, ctl);
306 }
307
308}
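
Both this early check and gart_iommu_hole_init() below decode the K8 northbridge the same way: function 3, offset 0x90 holds the enable bit and a 3-bit order, offset 0x94 holds the aperture base in 32 MB units (hence the mask with 0x7fff and the shift by 25). A worked decoding with made-up register values:

#include <stdio.h>

int main(void)
{
	unsigned int ctl  = 0x05;	/* example: enable bit set, order 2 */
	unsigned int base = 0x0040;	/* example: 0x40 * 32 MB = 2 GB     */

	int enabled = ctl & 1;					/* 1      */
	unsigned int order = (ctl >> 1) & 7;			/* 2      */
	unsigned long long size = (32ULL << 20) << order;	/* 128 MB */
	unsigned long long aper = ((unsigned long long)(base & 0x7fff)) << 25;

	printf("enabled=%d size=%llu MB base=%#llx\n", enabled, size >> 20, aper);
	return 0;
}
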
309
207void __init gart_iommu_hole_init(void) 310void __init gart_iommu_hole_init(void)
208{ 311{
209 int fix, num;
210 u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; 312 u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
211 u64 aper_base, last_aper_base = 0; 313 u64 aper_base, last_aper_base = 0;
212 int valid_agp = 0; 314 int fix, num, valid_agp = 0;
315 int node;
213 316
214 if (gart_iommu_aperture_disabled || !fix_aperture || 317 if (gart_iommu_aperture_disabled || !fix_aperture ||
215 !early_pci_allowed()) 318 !early_pci_allowed())
@@ -218,24 +321,26 @@ void __init gart_iommu_hole_init(void)
218 printk(KERN_INFO "Checking aperture...\n"); 321 printk(KERN_INFO "Checking aperture...\n");
219 322
220 fix = 0; 323 fix = 0;
221 for (num = 24; num < 32; num++) { 324 node = 0;
325 for (num = 24; num < 32; num++) {
222 if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) 326 if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
223 continue; 327 continue;
224 328
225 iommu_detected = 1; 329 iommu_detected = 1;
226 gart_iommu_aperture = 1; 330 gart_iommu_aperture = 1;
227 331
228 aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; 332 aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7;
229 aper_size = (32 * 1024 * 1024) << aper_order; 333 aper_size = (32 * 1024 * 1024) << aper_order;
230 aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff; 334 aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff;
231 aper_base <<= 25; 335 aper_base <<= 25;
336
337 printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n",
338 node, aper_base, aper_size >> 20);
339 node++;
232 340
233 printk("CPU %d: aperture @ %Lx size %u MB\n", num-24,
234 aper_base, aper_size>>20);
235
236 if (!aperture_valid(aper_base, aper_size)) { 341 if (!aperture_valid(aper_base, aper_size)) {
237 fix = 1; 342 fix = 1;
238 break; 343 break;
239 } 344 }
240 345
241 if ((last_aper_order && aper_order != last_aper_order) || 346 if ((last_aper_order && aper_order != last_aper_order) ||
@@ -245,55 +350,64 @@ void __init gart_iommu_hole_init(void)
245 } 350 }
246 last_aper_order = aper_order; 351 last_aper_order = aper_order;
247 last_aper_base = aper_base; 352 last_aper_base = aper_base;
248 } 353 }
249 354
250 if (!fix && !fallback_aper_force) { 355 if (!fix && !fallback_aper_force) {
251 if (last_aper_base) { 356 if (last_aper_base) {
252 unsigned long n = (32 * 1024 * 1024) << last_aper_order; 357 unsigned long n = (32 * 1024 * 1024) << last_aper_order;
358
253 insert_aperture_resource((u32)last_aper_base, n); 359 insert_aperture_resource((u32)last_aper_base, n);
254 } 360 }
255 return; 361 return;
256 } 362 }
257 363
258 if (!fallback_aper_force) 364 if (!fallback_aper_force)
259 aper_alloc = search_agp_bridge(&aper_order, &valid_agp); 365 aper_alloc = search_agp_bridge(&aper_order, &valid_agp);
260 366
261 if (aper_alloc) { 367 if (aper_alloc) {
262 /* Got the aperture from the AGP bridge */ 368 /* Got the aperture from the AGP bridge */
263 } else if (swiotlb && !valid_agp) { 369 } else if (swiotlb && !valid_agp) {
264 /* Do nothing */ 370 /* Do nothing */
265 } else if ((!no_iommu && end_pfn > MAX_DMA32_PFN) || 371 } else if ((!no_iommu && end_pfn > MAX_DMA32_PFN) ||
266 force_iommu || 372 force_iommu ||
267 valid_agp || 373 valid_agp ||
268 fallback_aper_force) { 374 fallback_aper_force) {
269 printk("Your BIOS doesn't leave a aperture memory hole\n"); 375 printk(KERN_ERR
270 printk("Please enable the IOMMU option in the BIOS setup\n"); 376 "Your BIOS doesn't leave a aperture memory hole\n");
271 printk("This costs you %d MB of RAM\n", 377 printk(KERN_ERR
272 32 << fallback_aper_order); 378 "Please enable the IOMMU option in the BIOS setup\n");
379 printk(KERN_ERR
380 "This costs you %d MB of RAM\n",
381 32 << fallback_aper_order);
273 382
274 aper_order = fallback_aper_order; 383 aper_order = fallback_aper_order;
275 aper_alloc = allocate_aperture(); 384 aper_alloc = allocate_aperture();
276 if (!aper_alloc) { 385 if (!aper_alloc) {
277 /* Could disable AGP and IOMMU here, but it's probably 386 /*
278 not worth it. But the later users cannot deal with 387 * Could disable AGP and IOMMU here, but it's
279 bad apertures and turning on the aperture over memory 388 * probably not worth it. But the later users
280 causes very strange problems, so it's better to 389 * cannot deal with bad apertures and turning
281 panic early. */ 390 * on the aperture over memory causes very
391 * strange problems, so it's better to panic
392 * early.
393 */
282 panic("Not enough memory for aperture"); 394 panic("Not enough memory for aperture");
283 } 395 }
284 } else { 396 } else {
285 return; 397 return;
286 } 398 }
287 399
288 /* Fix up the north bridges */ 400 /* Fix up the north bridges */
289 for (num = 24; num < 32; num++) { 401 for (num = 24; num < 32; num++) {
290 if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) 402 if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
291 continue; 403 continue;
292 404
293 /* Don't enable translation yet. That is done later. 405 /*
294 Assume this BIOS didn't initialise the GART so 406 * Don't enable translation yet. That is done later.
295 just overwrite all previous bits */ 407 * Assume this BIOS didn't initialise the GART so
296 write_pci_config(0, num, 3, 0x90, aper_order<<1); 408 * just overwrite all previous bits
297 write_pci_config(0, num, 3, 0x94, aper_alloc>>25); 409 */
298 } 410 write_pci_config(0, num, 3, 0x90, aper_order<<1);
299} 411 write_pci_config(0, num, 3, 0x94, aper_alloc>>25);
412 }
413}
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index a56c782653be..35a568ea8400 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -43,12 +43,10 @@
43#include <mach_apicdef.h> 43#include <mach_apicdef.h>
44#include <mach_ipi.h> 44#include <mach_ipi.h>
45 45
46#include "io_ports.h"
47
48/* 46/*
49 * Sanity check 47 * Sanity check
50 */ 48 */
51#if (SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F 49#if ((SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F)
52# error SPURIOUS_APIC_VECTOR definition error 50# error SPURIOUS_APIC_VECTOR definition error
53#endif 51#endif
54 52
@@ -57,7 +55,7 @@
57 * 55 *
58 * -1=force-disable, +1=force-enable 56 * -1=force-disable, +1=force-enable
59 */ 57 */
60static int enable_local_apic __initdata = 0; 58static int enable_local_apic __initdata;
61 59
62/* Local APIC timer verification ok */ 60/* Local APIC timer verification ok */
63static int local_apic_timer_verify_ok; 61static int local_apic_timer_verify_ok;
@@ -101,6 +99,8 @@ static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
101/* Local APIC was disabled by the BIOS and enabled by the kernel */ 99/* Local APIC was disabled by the BIOS and enabled by the kernel */
102static int enabled_via_apicbase; 100static int enabled_via_apicbase;
103 101
102static unsigned long apic_phys;
103
104/* 104/*
105 * Get the LAPIC version 105 * Get the LAPIC version
106 */ 106 */
@@ -110,7 +110,7 @@ static inline int lapic_get_version(void)
110} 110}
111 111
112/* 112/*
113 * Check, if the APIC is integrated or a seperate chip 113 * Check, if the APIC is integrated or a separate chip
114 */ 114 */
115static inline int lapic_is_integrated(void) 115static inline int lapic_is_integrated(void)
116{ 116{
@@ -135,9 +135,9 @@ void apic_wait_icr_idle(void)
135 cpu_relax(); 135 cpu_relax();
136} 136}
137 137
138unsigned long safe_apic_wait_icr_idle(void) 138u32 safe_apic_wait_icr_idle(void)
139{ 139{
140 unsigned long send_status; 140 u32 send_status;
141 int timeout; 141 int timeout;
142 142
143 timeout = 0; 143 timeout = 0;
@@ -154,7 +154,7 @@ unsigned long safe_apic_wait_icr_idle(void)
154/** 154/**
155 * enable_NMI_through_LVT0 - enable NMI through local vector table 0 155 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
156 */ 156 */
157void enable_NMI_through_LVT0 (void * dummy) 157void __cpuinit enable_NMI_through_LVT0(void)
158{ 158{
159 unsigned int v = APIC_DM_NMI; 159 unsigned int v = APIC_DM_NMI;
160 160
@@ -379,8 +379,10 @@ void __init setup_boot_APIC_clock(void)
379 */ 379 */
380 if (local_apic_timer_disabled) { 380 if (local_apic_timer_disabled) {
381 /* No broadcast on UP ! */ 381 /* No broadcast on UP ! */
382 if (num_possible_cpus() > 1) 382 if (num_possible_cpus() > 1) {
383 lapic_clockevent.mult = 1;
383 setup_APIC_timer(); 384 setup_APIC_timer();
385 }
384 return; 386 return;
385 } 387 }
386 388
@@ -434,7 +436,7 @@ void __init setup_boot_APIC_clock(void)
434 "with PM Timer: %ldms instead of 100ms\n", 436 "with PM Timer: %ldms instead of 100ms\n",
435 (long)res); 437 (long)res);
436 /* Correct the lapic counter value */ 438 /* Correct the lapic counter value */
437 res = (((u64) delta ) * pm_100ms); 439 res = (((u64) delta) * pm_100ms);
438 do_div(res, deltapm); 440 do_div(res, deltapm);
439 printk(KERN_INFO "APIC delta adjusted to PM-Timer: " 441 printk(KERN_INFO "APIC delta adjusted to PM-Timer: "
440 "%lu (%ld)\n", (unsigned long) res, delta); 442 "%lu (%ld)\n", (unsigned long) res, delta);
@@ -472,6 +474,19 @@ void __init setup_boot_APIC_clock(void)
472 474
473 local_apic_timer_verify_ok = 1; 475 local_apic_timer_verify_ok = 1;
474 476
477 /*
478 * Do a sanity check on the APIC calibration result
479 */
480 if (calibration_result < (1000000 / HZ)) {
481 local_irq_enable();
482 printk(KERN_WARNING
483 "APIC frequency too slow, disabling apic timer\n");
484 /* No broadcast on UP ! */
485 if (num_possible_cpus() > 1)
486 setup_APIC_timer();
487 return;
488 }
489
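
calibration_result is in APIC timer ticks per jiffy here, so the 1000000/HZ bound simply rejects anything slower than about 1 MHz of timer input. Quick arithmetic check (the HZ value is only an example):

#include <stdio.h>

int main(void)
{
	unsigned int hz = 250;			/* example CONFIG_HZ value */
	unsigned int threshold = 1000000 / hz;	/* 4000 ticks per jiffy    */

	/* threshold * hz = 1,000,000 ticks per second, i.e. anything below
	 * roughly 1 MHz is treated as "too slow" and only the broadcast
	 * dummy device is kept. */
	printf("minimum ticks per jiffy at HZ=%u: %u\n", hz, threshold);
	return 0;
}
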
475 /* We trust the pm timer based calibration */ 490 /* We trust the pm timer based calibration */
476 if (!pm_referenced) { 491 if (!pm_referenced) {
477 apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); 492 apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
@@ -563,6 +578,9 @@ static void local_apic_timer_interrupt(void)
563 return; 578 return;
564 } 579 }
565 580
581 /*
582 * the NMI deadlock-detector uses this.
583 */
566 per_cpu(irq_stat, cpu).apic_timer_irqs++; 584 per_cpu(irq_stat, cpu).apic_timer_irqs++;
567 585
568 evt->event_handler(evt); 586 evt->event_handler(evt);
@@ -576,8 +594,7 @@ static void local_apic_timer_interrupt(void)
576 * [ if a single-CPU system runs an SMP kernel then we call the local 594 * [ if a single-CPU system runs an SMP kernel then we call the local
577 * interrupt as well. Thus we cannot inline the local irq ... ] 595 * interrupt as well. Thus we cannot inline the local irq ... ]
578 */ 596 */
579 597void smp_apic_timer_interrupt(struct pt_regs *regs)
580void fastcall smp_apic_timer_interrupt(struct pt_regs *regs)
581{ 598{
582 struct pt_regs *old_regs = set_irq_regs(regs); 599 struct pt_regs *old_regs = set_irq_regs(regs);
583 600
@@ -616,9 +633,14 @@ int setup_profiling_timer(unsigned int multiplier)
616 */ 633 */
617void clear_local_APIC(void) 634void clear_local_APIC(void)
618{ 635{
619 int maxlvt = lapic_get_maxlvt(); 636 int maxlvt;
620 unsigned long v; 637 u32 v;
638
639 /* APIC hasn't been mapped yet */
640 if (!apic_phys)
641 return;
621 642
643 maxlvt = lapic_get_maxlvt();
622 /* 644 /*
623 * Masking an LVT entry can trigger a local APIC error 645 * Masking an LVT entry can trigger a local APIC error
624 * if the vector is zero. Mask LVTERR first to prevent this. 646 * if the vector is zero. Mask LVTERR first to prevent this.
@@ -976,7 +998,8 @@ void __cpuinit setup_local_APIC(void)
976 value |= APIC_LVT_LEVEL_TRIGGER; 998 value |= APIC_LVT_LEVEL_TRIGGER;
977 apic_write_around(APIC_LVT1, value); 999 apic_write_around(APIC_LVT1, value);
978 1000
979 if (integrated && !esr_disable) { /* !82489DX */ 1001 if (integrated && !esr_disable) {
1002 /* !82489DX */
980 maxlvt = lapic_get_maxlvt(); 1003 maxlvt = lapic_get_maxlvt();
981 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ 1004 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
982 apic_write(APIC_ESR, 0); 1005 apic_write(APIC_ESR, 0);
@@ -1020,7 +1043,7 @@ void __cpuinit setup_local_APIC(void)
1020/* 1043/*
1021 * Detect and initialize APIC 1044 * Detect and initialize APIC
1022 */ 1045 */
1023static int __init detect_init_APIC (void) 1046static int __init detect_init_APIC(void)
1024{ 1047{
1025 u32 h, l, features; 1048 u32 h, l, features;
1026 1049
@@ -1077,7 +1100,7 @@ static int __init detect_init_APIC (void)
1077 printk(KERN_WARNING "Could not enable APIC!\n"); 1100 printk(KERN_WARNING "Could not enable APIC!\n");
1078 return -1; 1101 return -1;
1079 } 1102 }
1080 set_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); 1103 set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1081 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; 1104 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
1082 1105
1083 /* The BIOS may have set up the APIC at some other address */ 1106 /* The BIOS may have set up the APIC at some other address */
@@ -1104,8 +1127,6 @@ no_apic:
1104 */ 1127 */
1105void __init init_apic_mappings(void) 1128void __init init_apic_mappings(void)
1106{ 1129{
1107 unsigned long apic_phys;
1108
1109 /* 1130 /*
1110 * If no local APIC can be found then set up a fake all 1131 * If no local APIC can be found then set up a fake all
1111 * zeroes page to simulate the local APIC and another 1132 * zeroes page to simulate the local APIC and another
@@ -1164,10 +1185,10 @@ fake_ioapic_page:
1164 * This initializes the IO-APIC and APIC hardware if this is 1185 * This initializes the IO-APIC and APIC hardware if this is
1165 * a UP kernel. 1186 * a UP kernel.
1166 */ 1187 */
1167int __init APIC_init_uniprocessor (void) 1188int __init APIC_init_uniprocessor(void)
1168{ 1189{
1169 if (enable_local_apic < 0) 1190 if (enable_local_apic < 0)
1170 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); 1191 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1171 1192
1172 if (!smp_found_config && !cpu_has_apic) 1193 if (!smp_found_config && !cpu_has_apic)
1173 return -1; 1194 return -1;
@@ -1179,7 +1200,7 @@ int __init APIC_init_uniprocessor (void)
1179 APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { 1200 APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
1180 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", 1201 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
1181 boot_cpu_physical_apicid); 1202 boot_cpu_physical_apicid);
1182 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); 1203 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1183 return -1; 1204 return -1;
1184 } 1205 }
1185 1206
@@ -1210,50 +1231,6 @@ int __init APIC_init_uniprocessor (void)
1210} 1231}
1211 1232
1212/* 1233/*
1213 * APIC command line parameters
1214 */
1215static int __init parse_lapic(char *arg)
1216{
1217 enable_local_apic = 1;
1218 return 0;
1219}
1220early_param("lapic", parse_lapic);
1221
1222static int __init parse_nolapic(char *arg)
1223{
1224 enable_local_apic = -1;
1225 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
1226 return 0;
1227}
1228early_param("nolapic", parse_nolapic);
1229
1230static int __init parse_disable_lapic_timer(char *arg)
1231{
1232 local_apic_timer_disabled = 1;
1233 return 0;
1234}
1235early_param("nolapic_timer", parse_disable_lapic_timer);
1236
1237static int __init parse_lapic_timer_c2_ok(char *arg)
1238{
1239 local_apic_timer_c2_ok = 1;
1240 return 0;
1241}
1242early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
1243
1244static int __init apic_set_verbosity(char *str)
1245{
1246 if (strcmp("debug", str) == 0)
1247 apic_verbosity = APIC_DEBUG;
1248 else if (strcmp("verbose", str) == 0)
1249 apic_verbosity = APIC_VERBOSE;
1250 return 1;
1251}
1252
1253__setup("apic=", apic_set_verbosity);
1254
1255
1256/*
1257 * Local APIC interrupts 1234 * Local APIC interrupts
1258 */ 1235 */
1259 1236
@@ -1306,7 +1283,7 @@ void smp_error_interrupt(struct pt_regs *regs)
1306 6: Received illegal vector 1283 6: Received illegal vector
1307 7: Illegal register address 1284 7: Illegal register address
1308 */ 1285 */
1309 printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", 1286 printk(KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n",
1310 smp_processor_id(), v , v1); 1287 smp_processor_id(), v , v1);
1311 irq_exit(); 1288 irq_exit();
1312} 1289}
@@ -1393,7 +1370,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1393 value = apic_read(APIC_LVT0); 1370 value = apic_read(APIC_LVT0);
1394 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | 1371 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1395 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | 1372 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1396 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); 1373 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1397 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; 1374 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1398 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); 1375 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1399 apic_write_around(APIC_LVT0, value); 1376 apic_write_around(APIC_LVT0, value);
@@ -1565,3 +1542,46 @@ device_initcall(init_lapic_sysfs);
1565static void apic_pm_activate(void) { } 1542static void apic_pm_activate(void) { }
1566 1543
1567#endif /* CONFIG_PM */ 1544#endif /* CONFIG_PM */
1545
1546/*
1547 * APIC command line parameters
1548 */
1549static int __init parse_lapic(char *arg)
1550{
1551 enable_local_apic = 1;
1552 return 0;
1553}
1554early_param("lapic", parse_lapic);
1555
1556static int __init parse_nolapic(char *arg)
1557{
1558 enable_local_apic = -1;
1559 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1560 return 0;
1561}
1562early_param("nolapic", parse_nolapic);
1563
1564static int __init parse_disable_lapic_timer(char *arg)
1565{
1566 local_apic_timer_disabled = 1;
1567 return 0;
1568}
1569early_param("nolapic_timer", parse_disable_lapic_timer);
1570
1571static int __init parse_lapic_timer_c2_ok(char *arg)
1572{
1573 local_apic_timer_c2_ok = 1;
1574 return 0;
1575}
1576early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
1577
1578static int __init apic_set_verbosity(char *str)
1579{
1580 if (strcmp("debug", str) == 0)
1581 apic_verbosity = APIC_DEBUG;
1582 else if (strcmp("verbose", str) == 0)
1583 apic_verbosity = APIC_VERBOSE;
1584 return 1;
1585}
1586__setup("apic=", apic_set_verbosity);
1587
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index fa6cdee6d303..d8d03e09dea2 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -23,32 +23,37 @@
23#include <linux/mc146818rtc.h> 23#include <linux/mc146818rtc.h>
24#include <linux/kernel_stat.h> 24#include <linux/kernel_stat.h>
25#include <linux/sysdev.h> 25#include <linux/sysdev.h>
26#include <linux/module.h>
27#include <linux/ioport.h> 26#include <linux/ioport.h>
28#include <linux/clockchips.h> 27#include <linux/clockchips.h>
28#include <linux/acpi_pmtmr.h>
29#include <linux/module.h>
29 30
30#include <asm/atomic.h> 31#include <asm/atomic.h>
31#include <asm/smp.h> 32#include <asm/smp.h>
32#include <asm/mtrr.h> 33#include <asm/mtrr.h>
33#include <asm/mpspec.h> 34#include <asm/mpspec.h>
35#include <asm/hpet.h>
34#include <asm/pgalloc.h> 36#include <asm/pgalloc.h>
35#include <asm/mach_apic.h> 37#include <asm/mach_apic.h>
36#include <asm/nmi.h> 38#include <asm/nmi.h>
37#include <asm/idle.h> 39#include <asm/idle.h>
38#include <asm/proto.h> 40#include <asm/proto.h>
39#include <asm/timex.h> 41#include <asm/timex.h>
40#include <asm/hpet.h>
41#include <asm/apic.h> 42#include <asm/apic.h>
42 43
43int apic_verbosity;
44int disable_apic_timer __cpuinitdata; 44int disable_apic_timer __cpuinitdata;
45static int apic_calibrate_pmtmr __initdata; 45static int apic_calibrate_pmtmr __initdata;
46int disable_apic;
46 47
47/* Local APIC timer works in C2? */ 48/* Local APIC timer works in C2 */
48int local_apic_timer_c2_ok; 49int local_apic_timer_c2_ok;
49EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); 50EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
50 51
51static struct resource *ioapic_resources; 52/*
53 * Debug level, exported for io_apic.c
54 */
55int apic_verbosity;
56
52static struct resource lapic_resource = { 57static struct resource lapic_resource = {
53 .name = "Local APIC", 58 .name = "Local APIC",
54 .flags = IORESOURCE_MEM | IORESOURCE_BUSY, 59 .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
@@ -60,10 +65,8 @@ static int lapic_next_event(unsigned long delta,
60 struct clock_event_device *evt); 65 struct clock_event_device *evt);
61static void lapic_timer_setup(enum clock_event_mode mode, 66static void lapic_timer_setup(enum clock_event_mode mode,
62 struct clock_event_device *evt); 67 struct clock_event_device *evt);
63
64static void lapic_timer_broadcast(cpumask_t mask); 68static void lapic_timer_broadcast(cpumask_t mask);
65 69static void apic_pm_activate(void);
66static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen);
67 70
68static struct clock_event_device lapic_clockevent = { 71static struct clock_event_device lapic_clockevent = {
69 .name = "lapic", 72 .name = "lapic",
@@ -78,6 +81,150 @@ static struct clock_event_device lapic_clockevent = {
78}; 81};
79static DEFINE_PER_CPU(struct clock_event_device, lapic_events); 82static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
80 83
84static unsigned long apic_phys;
85
86/*
87 * Get the LAPIC version
88 */
89static inline int lapic_get_version(void)
90{
91 return GET_APIC_VERSION(apic_read(APIC_LVR));
92}
93
94/*
95 * Check, if the APIC is integrated or a separate chip
96 */
97static inline int lapic_is_integrated(void)
98{
99 return 1;
100}
101
102/*
103 * Check, whether this is a modern or a first generation APIC
104 */
105static int modern_apic(void)
106{
107 /* AMD systems use old APIC versions, so check the CPU */
108 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
109 boot_cpu_data.x86 >= 0xf)
110 return 1;
111 return lapic_get_version() >= 0x14;
112}
113
114void apic_wait_icr_idle(void)
115{
116 while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
117 cpu_relax();
118}
119
120u32 safe_apic_wait_icr_idle(void)
121{
122 u32 send_status;
123 int timeout;
124
125 timeout = 0;
126 do {
127 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
128 if (!send_status)
129 break;
130 udelay(100);
131 } while (timeout++ < 1000);
132
133 return send_status;
134}
135
136/**
137 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
138 */
139void __cpuinit enable_NMI_through_LVT0(void)
140{
141 unsigned int v;
142
143 /* unmask and set to NMI */
144 v = APIC_DM_NMI;
145 apic_write(APIC_LVT0, v);
146}
147
148/**
149 * lapic_get_maxlvt - get the maximum number of local vector table entries
150 */
151int lapic_get_maxlvt(void)
152{
153 unsigned int v, maxlvt;
154
155 v = apic_read(APIC_LVR);
156 maxlvt = GET_APIC_MAXLVT(v);
157 return maxlvt;
158}
159
160/*
161 * This function sets up the local APIC timer, with a timeout of
162 * 'clocks' APIC bus clock. During calibration we actually call
163 * this function twice on the boot CPU, once with a bogus timeout
164 * value, second time for real. The other (noncalibrating) CPUs
165 * call this function only once, with the real, calibrated value.
166 *
167 * We do reads before writes even if unnecessary, to get around the
168 * P5 APIC double write bug.
169 */
170
171static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
172{
173 unsigned int lvtt_value, tmp_value;
174
175 lvtt_value = LOCAL_TIMER_VECTOR;
176 if (!oneshot)
177 lvtt_value |= APIC_LVT_TIMER_PERIODIC;
178 if (!irqen)
179 lvtt_value |= APIC_LVT_MASKED;
180
181 apic_write(APIC_LVTT, lvtt_value);
182
183 /*
184 * Divide PICLK by 16
185 */
186 tmp_value = apic_read(APIC_TDCR);
187 apic_write(APIC_TDCR, (tmp_value
188 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
189 | APIC_TDR_DIV_16);
190
191 if (!oneshot)
192 apic_write(APIC_TMICT, clocks);
193}
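
The oneshot/irqen parameters map straight onto the LVTT bits, which is what lets the calibration code further down run the timer silently (a huge count, periodic mode, interrupt masked). A stand-alone model of the value composition; the mask and periodic bit positions mirror the LVT layout and the vector is only an example:

#include <stdio.h>

#define EXAMPLE_TIMER_VECTOR	0xef		/* illustrative vector */
#define LVT_TIMER_PERIODIC	(1u << 17)	/* timer mode bit      */
#define LVT_MASKED		(1u << 16)	/* interrupt mask bit  */

static unsigned int lvtt_value(int oneshot, int irqen)
{
	unsigned int v = EXAMPLE_TIMER_VECTOR;

	if (!oneshot)
		v |= LVT_TIMER_PERIODIC;
	if (!irqen)
		v |= LVT_MASKED;
	return v;
}

int main(void)
{
	printf("calibration (periodic, masked): %#x\n", lvtt_value(0, 0));
	printf("normal periodic tick, unmasked: %#x\n", lvtt_value(0, 1));
	return 0;
}
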
194
195/*
196 * Setup extended LVT, AMD specific (K8, family 10h)
197 *
198 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
199 * MCE interrupts are supported. Thus MCE offset must be set to 0.
200 */
201
202#define APIC_EILVT_LVTOFF_MCE 0
203#define APIC_EILVT_LVTOFF_IBS 1
204
205static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
206{
207 unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
208 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
209
210 apic_write(reg, v);
211}
212
213u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
214{
215 setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
216 return APIC_EILVT_LVTOFF_MCE;
217}
218
219u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
220{
221 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
222 return APIC_EILVT_LVTOFF_IBS;
223}
224
225/*
226 * Program the next event, relative to now
227 */
81static int lapic_next_event(unsigned long delta, 228static int lapic_next_event(unsigned long delta,
82 struct clock_event_device *evt) 229 struct clock_event_device *evt)
83{ 230{
@@ -85,6 +232,9 @@ static int lapic_next_event(unsigned long delta,
85 return 0; 232 return 0;
86} 233}
87 234
235/*
236 * Setup the lapic timer in periodic or oneshot mode
237 */
88static void lapic_timer_setup(enum clock_event_mode mode, 238static void lapic_timer_setup(enum clock_event_mode mode,
89 struct clock_event_device *evt) 239 struct clock_event_device *evt)
90{ 240{
@@ -127,75 +277,261 @@ static void lapic_timer_broadcast(cpumask_t mask)
127#endif 277#endif
128} 278}
129 279
130static void apic_pm_activate(void); 280/*
281 * Setup the local APIC timer for this CPU. Copy the initilized values
282 * of the boot CPU and register the clock event in the framework.
283 */
284static void setup_APIC_timer(void)
285{
286 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
131 287
132void apic_wait_icr_idle(void) 288 memcpy(levt, &lapic_clockevent, sizeof(*levt));
289 levt->cpumask = cpumask_of_cpu(smp_processor_id());
290
291 clockevents_register_device(levt);
292}
293
294/*
295 * In this function we calibrate APIC bus clocks to the external
296 * timer. Unfortunately we cannot use jiffies and the timer irq
297 * to calibrate, since some later bootup code depends on getting
298 * the first irq? Ugh.
299 *
300 * We want to do the calibration only once since we
301 * want to have local timer irqs syncron. CPUs connected
302 * by the same APIC bus have the very same bus frequency.
303 * And we want to have irqs off anyways, no accidental
304 * APIC irq that way.
305 */
306
307#define TICK_COUNT 100000000
308
309static void __init calibrate_APIC_clock(void)
133{ 310{
134 while (apic_read(APIC_ICR) & APIC_ICR_BUSY) 311 unsigned apic, apic_start;
135 cpu_relax(); 312 unsigned long tsc, tsc_start;
313 int result;
314
315 local_irq_disable();
316
317 /*
318 * Put whatever arbitrary (but long enough) timeout
319 * value into the APIC clock, we just want to get the
320 * counter running for calibration.
321 *
322 * No interrupt enable !
323 */
324 __setup_APIC_LVTT(250000000, 0, 0);
325
326 apic_start = apic_read(APIC_TMCCT);
327#ifdef CONFIG_X86_PM_TIMER
328 if (apic_calibrate_pmtmr && pmtmr_ioport) {
329 pmtimer_wait(5000); /* 5ms wait */
330 apic = apic_read(APIC_TMCCT);
331 result = (apic_start - apic) * 1000L / 5;
332 } else
333#endif
334 {
335 rdtscll(tsc_start);
336
337 do {
338 apic = apic_read(APIC_TMCCT);
339 rdtscll(tsc);
340 } while ((tsc - tsc_start) < TICK_COUNT &&
341 (apic_start - apic) < TICK_COUNT);
342
343 result = (apic_start - apic) * 1000L * tsc_khz /
344 (tsc - tsc_start);
345 }
346
347 local_irq_enable();
348
349 printk(KERN_DEBUG "APIC timer calibration result %d\n", result);
350
351 printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n",
352 result / 1000 / 1000, result / 1000 % 1000);
353
354 /* Calculate the scaled math multiplication factor */
355 lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC, 32);
356 lapic_clockevent.max_delta_ns =
357 clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
358 lapic_clockevent.min_delta_ns =
359 clockevent_delta2ns(0xF, &lapic_clockevent);
360
361 calibration_result = result / HZ;
136} 362}
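
The TSC branch of the calibration is just a ratio: APIC counts elapsed divided by TSC cycles elapsed, scaled by the TSC frequency in Hz (tsc_khz * 1000), gives the APIC timer frequency; dividing by HZ then yields the ticks-per-jiffy value stored in calibration_result. A worked example with made-up numbers:

#include <stdio.h>

int main(void)
{
	long long tsc_khz    = 2400000;		/* 2.4 GHz TSC (example)       */
	long long apic_delta = 12500000;	/* APIC counts elapsed         */
	long long tsc_delta  = 120000000;	/* TSC cycles elapsed (~50 ms) */
	long long hz         = 1000;		/* example CONFIG_HZ           */

	long long result = apic_delta * 1000 * tsc_khz / tsc_delta;
	long long per_jiffy = result / hz;

	/* 12.5e6 / 1.2e8 of a 2.4 GHz clock -> 250,000,000 Hz timer input,
	 * i.e. 250,000 ticks per jiffy at HZ=1000. */
	printf("%lld Hz, %lld ticks/jiffy\n", result, per_jiffy);
	return 0;
}
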
137 363
138unsigned int safe_apic_wait_icr_idle(void) 364/*
365 * Setup the boot APIC
366 *
367 * Calibrate and verify the result.
368 */
369void __init setup_boot_APIC_clock(void)
139{ 370{
140 unsigned int send_status; 371 /*
141 int timeout; 372 * The local apic timer can be disabled via the kernel commandline.
373 * Register the lapic timer as a dummy clock event source on SMP
374 * systems, so the broadcast mechanism is used. On UP systems simply
375 * ignore it.
376 */
377 if (disable_apic_timer) {
378 printk(KERN_INFO "Disabling APIC timer\n");
379 /* No broadcast on UP ! */
380 if (num_possible_cpus() > 1) {
381 lapic_clockevent.mult = 1;
382 setup_APIC_timer();
383 }
384 return;
385 }
142 386
143 timeout = 0; 387 printk(KERN_INFO "Using local APIC timer interrupts.\n");
144 do { 388 calibrate_APIC_clock();
145 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
146 if (!send_status)
147 break;
148 udelay(100);
149 } while (timeout++ < 1000);
150 389
151 return send_status; 390 /*
391 * Do a sanity check on the APIC calibration result
392 */
393 if (calibration_result < (1000000 / HZ)) {
394 printk(KERN_WARNING
395 "APIC frequency too slow, disabling apic timer\n");
396 /* No broadcast on UP ! */
397 if (num_possible_cpus() > 1)
398 setup_APIC_timer();
399 return;
400 }
401
402 /*
403 * If nmi_watchdog is set to IO_APIC, we need the
404 * PIT/HPET going. Otherwise register lapic as a dummy
405 * device.
406 */
407 if (nmi_watchdog != NMI_IO_APIC)
408 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
409 else
410 printk(KERN_WARNING "APIC timer registered as dummy,"
411 " due to nmi_watchdog=1!\n");
412
413 setup_APIC_timer();
152} 414}
153 415
154void enable_NMI_through_LVT0 (void * dummy) 416/*
417 * AMD C1E enabled CPUs have a real nasty problem: Some BIOSes set the
418 * C1E flag only in the secondary CPU, so when we detect the wreckage
419 * we already have enabled the boot CPU local apic timer. Check, if
420 * disable_apic_timer is set and the DUMMY flag is cleared. If yes,
421 * set the DUMMY flag again and force the broadcast mode in the
422 * clockevents layer.
423 */
424void __cpuinit check_boot_apic_timer_broadcast(void)
155{ 425{
156 unsigned int v; 426 if (!disable_apic_timer ||
427 (lapic_clockevent.features & CLOCK_EVT_FEAT_DUMMY))
428 return;
157 429
158 /* unmask and set to NMI */ 430 printk(KERN_INFO "AMD C1E detected late. Force timer broadcast.\n");
159 v = APIC_DM_NMI; 431 lapic_clockevent.features |= CLOCK_EVT_FEAT_DUMMY;
160 apic_write(APIC_LVT0, v); 432
433 local_irq_enable();
434 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, &boot_cpu_id);
435 local_irq_disable();
161} 436}
162 437
163int get_maxlvt(void) 438void __cpuinit setup_secondary_APIC_clock(void)
164{ 439{
165 unsigned int v, maxlvt; 440 check_boot_apic_timer_broadcast();
441 setup_APIC_timer();
442}
166 443
167 v = apic_read(APIC_LVR); 444/*
168 maxlvt = GET_APIC_MAXLVT(v); 445 * The guts of the apic timer interrupt
169 return maxlvt; 446 */
447static void local_apic_timer_interrupt(void)
448{
449 int cpu = smp_processor_id();
450 struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
451
452 /*
453 * Normally we should not be here till LAPIC has been initialized but
454 * in some cases like kdump, its possible that there is a pending LAPIC
455 * timer interrupt from previous kernel's context and is delivered in
456 * new kernel the moment interrupts are enabled.
457 *
458 * Interrupts are enabled early and LAPIC is setup much later, hence
459 * its possible that when we get here evt->event_handler is NULL.
460 * Check for event_handler being NULL and discard the interrupt as
461 * spurious.
462 */
463 if (!evt->event_handler) {
464 printk(KERN_WARNING
465 "Spurious LAPIC timer interrupt on cpu %d\n", cpu);
466 /* Switch it off */
467 lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
468 return;
469 }
470
471 /*
472 * the NMI deadlock-detector uses this.
473 */
474 add_pda(apic_timer_irqs, 1);
475
476 evt->event_handler(evt);
170} 477}
171 478
172/* 479/*
173 * 'what should we do if we get a hw irq event on an illegal vector'. 480 * Local APIC timer interrupt. This is the most natural way for doing
174 * each architecture has to answer this themselves. 481 * local interrupts, but local timer interrupts can be emulated by
482 * broadcast interrupts too. [in case the hw doesn't support APIC timers]
483 *
484 * [ if a single-CPU system runs an SMP kernel then we call the local
485 * interrupt as well. Thus we cannot inline the local irq ... ]
175 */ 486 */
176void ack_bad_irq(unsigned int irq) 487void smp_apic_timer_interrupt(struct pt_regs *regs)
177{ 488{
178 printk("unexpected IRQ trap at vector %02x\n", irq); 489 struct pt_regs *old_regs = set_irq_regs(regs);
490
179 /* 491 /*
180 * Currently unexpected vectors happen only on SMP and APIC. 492 * NOTE! We'd better ACK the irq immediately,
181 * We _must_ ack these because every local APIC has only N 493 * because timer handling can be slow.
182 * irq slots per priority level, and a 'hanging, unacked' IRQ
183 * holds up an irq slot - in excessive cases (when multiple
184 * unexpected vectors occur) that might lock up the APIC
185 * completely.
186 * But don't ack when the APIC is disabled. -AK
187 */ 494 */
188 if (!disable_apic) 495 ack_APIC_irq();
189 ack_APIC_irq(); 496 /*
497 * update_process_times() expects us to have done irq_enter().
                                498	 * Besides, if we don't, timer interrupts ignore the global
499 * interrupt lock, which is the WrongThing (tm) to do.
500 */
501 exit_idle();
502 irq_enter();
503 local_apic_timer_interrupt();
504 irq_exit();
505 set_irq_regs(old_regs);
506}
507
508int setup_profiling_timer(unsigned int multiplier)
509{
510 return -EINVAL;
190} 511}
191 512
513
514/*
515 * Local APIC start and shutdown
516 */
517
518/**
519 * clear_local_APIC - shutdown the local APIC
520 *
521 * This is called, when a CPU is disabled and before rebooting, so the state of
522 * the local APIC has no dangling leftovers. Also used to cleanout any BIOS
523 * leftovers during boot.
524 */
192void clear_local_APIC(void) 525void clear_local_APIC(void)
193{ 526{
194 int maxlvt; 527 int maxlvt = lapic_get_maxlvt();
195 unsigned int v; 528 u32 v;
196 529
197 maxlvt = get_maxlvt(); 530 /* APIC hasn't been mapped yet */
531 if (!apic_phys)
532 return;
198 533
534 maxlvt = lapic_get_maxlvt();
199 /* 535 /*
200 * Masking an LVT entry can trigger a local APIC error 536 * Masking an LVT entry can trigger a local APIC error
201 * if the vector is zero. Mask LVTERR first to prevent this. 537 * if the vector is zero. Mask LVTERR first to prevent this.
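For reference, the new sanity check in setup_boot_APIC_clock() above compares calibration_result, which holds APIC timer ticks per jiffy (see the calibration code further down in this diff), against 1000000 / HZ, i.e. it rejects any timer that would count slower than roughly 1 MHz. A minimal userspace illustration, not part of the patch; the HZ and clock figures are made-up examples:

        #include <stdio.h>

        int main(void)
        {
                const unsigned int hz = 250;            /* example CONFIG_HZ value        */
                const unsigned int apic_khz = 12500;    /* hypothetical timer clock, kHz  */
                unsigned int ticks_per_jiffy = apic_khz * 1000 / hz;

                /* threshold used by setup_boot_APIC_clock(): 1000000 / HZ */
                printf("threshold: %u ticks/jiffy\n", 1000000 / hz);      /* 4000  */
                printf("measured : %u ticks/jiffy\n", ticks_per_jiffy);   /* 50000 -> accepted */
                return 0;
        }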
@@ -233,45 +569,9 @@ void clear_local_APIC(void)
233 apic_read(APIC_ESR); 569 apic_read(APIC_ESR);
234} 570}
235 571
236void disconnect_bsp_APIC(int virt_wire_setup) 572/**
237{ 573 * disable_local_APIC - clear and disable the local APIC
238 /* Go back to Virtual Wire compatibility mode */ 574 */
239 unsigned long value;
240
241 /* For the spurious interrupt use vector F, and enable it */
242 value = apic_read(APIC_SPIV);
243 value &= ~APIC_VECTOR_MASK;
244 value |= APIC_SPIV_APIC_ENABLED;
245 value |= 0xf;
246 apic_write(APIC_SPIV, value);
247
248 if (!virt_wire_setup) {
249 /*
250 * For LVT0 make it edge triggered, active high,
251 * external and enabled
252 */
253 value = apic_read(APIC_LVT0);
254 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
255 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
256 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
257 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
258 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
259 apic_write(APIC_LVT0, value);
260 } else {
261 /* Disable LVT0 */
262 apic_write(APIC_LVT0, APIC_LVT_MASKED);
263 }
264
265 /* For LVT1 make it edge triggered, active high, nmi and enabled */
266 value = apic_read(APIC_LVT1);
267 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
268 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
269 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
270 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
271 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
272 apic_write(APIC_LVT1, value);
273}
274
275void disable_local_APIC(void) 575void disable_local_APIC(void)
276{ 576{
277 unsigned int value; 577 unsigned int value;
@@ -333,7 +633,7 @@ int __init verify_local_APIC(void)
333 reg1 = GET_APIC_VERSION(reg0); 633 reg1 = GET_APIC_VERSION(reg0);
334 if (reg1 == 0x00 || reg1 == 0xff) 634 if (reg1 == 0x00 || reg1 == 0xff)
335 return 0; 635 return 0;
336 reg1 = get_maxlvt(); 636 reg1 = lapic_get_maxlvt();
337 if (reg1 < 0x02 || reg1 == 0xff) 637 if (reg1 < 0x02 || reg1 == 0xff)
338 return 0; 638 return 0;
339 639
@@ -355,18 +655,20 @@ int __init verify_local_APIC(void)
355 * compatibility mode, but most boxes are anymore. 655 * compatibility mode, but most boxes are anymore.
356 */ 656 */
357 reg0 = apic_read(APIC_LVT0); 657 reg0 = apic_read(APIC_LVT0);
358 apic_printk(APIC_DEBUG,"Getting LVT0: %x\n", reg0); 658 apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0);
359 reg1 = apic_read(APIC_LVT1); 659 reg1 = apic_read(APIC_LVT1);
360 apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); 660 apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
361 661
362 return 1; 662 return 1;
363} 663}
364 664
665/**
666 * sync_Arb_IDs - synchronize APIC bus arbitration IDs
667 */
365void __init sync_Arb_IDs(void) 668void __init sync_Arb_IDs(void)
366{ 669{
367 /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */ 670 /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */
368 unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); 671 if (modern_apic())
369 if (ver >= 0x14) /* P4 or higher */
370 return; 672 return;
371 673
372 /* 674 /*
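The open-coded version test removed from sync_Arb_IDs() above is folded into modern_apic(), whose definition lies outside this hunk. Reconstructed from the removed lines only, the equivalent check looks roughly like the sketch below; apic_read()/GET_APIC_VERSION() are stubbed so the fragment compiles on its own, and the helper name is made up:

        #include <stdio.h>

        #define GET_APIC_VERSION(x)     ((x) & 0xff)

        static unsigned int read_apic_lvr_stub(void)
        {
                return 0x00040014;      /* example LVR value: version 0x14, max LVT index 4 */
        }

        /* equivalent of the removed 'ver >= 0x14' test (P4/Xeon and later) */
        static int is_integrated_xapic(void)
        {
                return GET_APIC_VERSION(read_apic_lvr_stub()) >= 0x14;
        }

        int main(void)
        {
                printf("modern apic: %d\n", is_integrated_xapic());     /* prints 1 */
                return 0;
        }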
@@ -418,9 +720,12 @@ void __init init_bsp_APIC(void)
418 apic_write(APIC_LVT1, value); 720 apic_write(APIC_LVT1, value);
419} 721}
420 722
421void __cpuinit setup_local_APIC (void) 723/**
724 * setup_local_APIC - setup the local APIC
725 */
726void __cpuinit setup_local_APIC(void)
422{ 727{
423 unsigned int value, maxlvt; 728 unsigned int value;
424 int i, j; 729 int i, j;
425 730
426 value = apic_read(APIC_LVR); 731 value = apic_read(APIC_LVR);
@@ -516,30 +821,217 @@ void __cpuinit setup_local_APIC (void)
516 else 821 else
517 value = APIC_DM_NMI | APIC_LVT_MASKED; 822 value = APIC_DM_NMI | APIC_LVT_MASKED;
518 apic_write(APIC_LVT1, value); 823 apic_write(APIC_LVT1, value);
824}
519 825
520 { 826void __cpuinit lapic_setup_esr(void)
521 unsigned oldvalue; 827{
522 maxlvt = get_maxlvt(); 828 unsigned maxlvt = lapic_get_maxlvt();
523 oldvalue = apic_read(APIC_ESR); 829
524 value = ERROR_APIC_VECTOR; // enables sending errors 830 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR);
525 apic_write(APIC_LVTERR, value); 831 /*
526 /* 832 * spec says clear errors after enabling vector.
527 * spec says clear errors after enabling vector. 833 */
528 */ 834 if (maxlvt > 3)
529 if (maxlvt > 3) 835 apic_write(APIC_ESR, 0);
530 apic_write(APIC_ESR, 0); 836}
531 value = apic_read(APIC_ESR);
532 if (value != oldvalue)
533 apic_printk(APIC_VERBOSE,
534 "ESR value after enabling vector: %08x, after %08x\n",
535 oldvalue, value);
536 }
537 837
838void __cpuinit end_local_APIC_setup(void)
839{
840 lapic_setup_esr();
538 nmi_watchdog_default(); 841 nmi_watchdog_default();
539 setup_apic_nmi_watchdog(NULL); 842 setup_apic_nmi_watchdog(NULL);
540 apic_pm_activate(); 843 apic_pm_activate();
541} 844}
542 845
846/*
847 * Detect and enable local APICs on non-SMP boards.
848 * Original code written by Keir Fraser.
849 * On AMD64 we trust the BIOS - if it says no APIC it is likely
850 * not correctly set up (usually the APIC timer won't work etc.)
851 */
852static int __init detect_init_APIC(void)
853{
854 if (!cpu_has_apic) {
855 printk(KERN_INFO "No local APIC present\n");
856 return -1;
857 }
858
859 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
860 boot_cpu_id = 0;
861 return 0;
862}
863
864/**
865 * init_apic_mappings - initialize APIC mappings
866 */
867void __init init_apic_mappings(void)
868{
869 /*
870 * If no local APIC can be found then set up a fake all
871 * zeroes page to simulate the local APIC and another
872 * one for the IO-APIC.
873 */
874 if (!smp_found_config && detect_init_APIC()) {
875 apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
876 apic_phys = __pa(apic_phys);
877 } else
878 apic_phys = mp_lapic_addr;
879
880 set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
881 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
882 APIC_BASE, apic_phys);
883
884 /* Put local APIC into the resource map. */
885 lapic_resource.start = apic_phys;
886 lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
887 insert_resource(&iomem_resource, &lapic_resource);
888
889 /*
890 * Fetch the APIC ID of the BSP in case we have a
891 * default configuration (or the MP table is broken).
892 */
893 boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
894}
895
896/*
897 * This initializes the IO-APIC and APIC hardware if this is
898 * a UP kernel.
899 */
900int __init APIC_init_uniprocessor(void)
901{
902 if (disable_apic) {
903 printk(KERN_INFO "Apic disabled\n");
904 return -1;
905 }
906 if (!cpu_has_apic) {
907 disable_apic = 1;
908 printk(KERN_INFO "Apic disabled by BIOS\n");
909 return -1;
910 }
911
912 verify_local_APIC();
913
914 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
915 apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id));
916
917 setup_local_APIC();
918
919 /*
920 * Now enable IO-APICs, actually call clear_IO_APIC
921 * We need clear_IO_APIC before enabling vector on BP
922 */
923 if (!skip_ioapic_setup && nr_ioapics)
924 enable_IO_APIC();
925
926 end_local_APIC_setup();
927
928 if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
929 setup_IO_APIC();
930 else
931 nr_ioapics = 0;
932 setup_boot_APIC_clock();
933 check_nmi_watchdog();
934 return 0;
935}
936
937/*
938 * Local APIC interrupts
939 */
940
941/*
942 * This interrupt should _never_ happen with our APIC/SMP architecture
943 */
944asmlinkage void smp_spurious_interrupt(void)
945{
946 unsigned int v;
947 exit_idle();
948 irq_enter();
949 /*
950 * Check if this really is a spurious interrupt and ACK it
951 * if it is a vectored one. Just in case...
952 * Spurious interrupts should not be ACKed.
953 */
954 v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
955 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
956 ack_APIC_irq();
957
958 add_pda(irq_spurious_count, 1);
959 irq_exit();
960}
961
962/*
963 * This interrupt should never happen with our APIC/SMP architecture
964 */
965asmlinkage void smp_error_interrupt(void)
966{
967 unsigned int v, v1;
968
969 exit_idle();
970 irq_enter();
971 /* First tickle the hardware, only then report what went on. -- REW */
972 v = apic_read(APIC_ESR);
973 apic_write(APIC_ESR, 0);
974 v1 = apic_read(APIC_ESR);
975 ack_APIC_irq();
976 atomic_inc(&irq_err_count);
977
978 /* Here is what the APIC error bits mean:
979 0: Send CS error
980 1: Receive CS error
981 2: Send accept error
982 3: Receive accept error
983 4: Reserved
984 5: Send illegal vector
985 6: Received illegal vector
986 7: Illegal register address
987 */
988 printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
989 smp_processor_id(), v , v1);
990 irq_exit();
991}
992
993void disconnect_bsp_APIC(int virt_wire_setup)
994{
995 /* Go back to Virtual Wire compatibility mode */
996 unsigned long value;
997
998 /* For the spurious interrupt use vector F, and enable it */
999 value = apic_read(APIC_SPIV);
1000 value &= ~APIC_VECTOR_MASK;
1001 value |= APIC_SPIV_APIC_ENABLED;
1002 value |= 0xf;
1003 apic_write(APIC_SPIV, value);
1004
1005 if (!virt_wire_setup) {
1006 /*
1007 * For LVT0 make it edge triggered, active high,
1008 * external and enabled
1009 */
1010 value = apic_read(APIC_LVT0);
1011 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1012 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1013 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1014 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1015 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1016 apic_write(APIC_LVT0, value);
1017 } else {
1018 /* Disable LVT0 */
1019 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1020 }
1021
1022 /* For LVT1 make it edge triggered, active high, nmi and enabled */
1023 value = apic_read(APIC_LVT1);
1024 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1025 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1026 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1027 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1028 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
1029 apic_write(APIC_LVT1, value);
1030}
1031
1032/*
1033 * Power management
1034 */
543#ifdef CONFIG_PM 1035#ifdef CONFIG_PM
544 1036
545static struct { 1037static struct {
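The index arithmetic in smp_spurious_interrupt() above relies on the layout of the in-service register: the 256 ISR bits are spread over eight 32-bit registers placed 0x10 apart, so (vector & ~0x1f) >> 1 is the same as (vector / 32) * 0x10. A standalone illustration, not part of the patch; the vector value is just an example:

        #include <stdio.h>

        int main(void)
        {
                unsigned int vector  = 0xff;                     /* e.g. the spurious vector      */
                unsigned int reg_off = (vector & ~0x1fU) >> 1;   /* (vector / 32) * 0x10 = 0x70   */
                unsigned int bit     = vector & 0x1fU;           /* bit within that register: 31  */

                printf("read APIC_ISR + 0x%02x, test bit %u\n", reg_off, bit);
                return 0;
        }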
@@ -571,7 +1063,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
571 if (!apic_pm_state.active) 1063 if (!apic_pm_state.active)
572 return 0; 1064 return 0;
573 1065
574 maxlvt = get_maxlvt(); 1066 maxlvt = lapic_get_maxlvt();
575 1067
576 apic_pm_state.apic_id = apic_read(APIC_ID); 1068 apic_pm_state.apic_id = apic_read(APIC_ID);
577 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); 1069 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
@@ -605,7 +1097,7 @@ static int lapic_resume(struct sys_device *dev)
605 if (!apic_pm_state.active) 1097 if (!apic_pm_state.active)
606 return 0; 1098 return 0;
607 1099
608 maxlvt = get_maxlvt(); 1100 maxlvt = lapic_get_maxlvt();
609 1101
610 local_irq_save(flags); 1102 local_irq_save(flags);
611 rdmsr(MSR_IA32_APICBASE, l, h); 1103 rdmsr(MSR_IA32_APICBASE, l, h);
@@ -645,8 +1137,8 @@ static struct sysdev_class lapic_sysclass = {
645}; 1137};
646 1138
647static struct sys_device device_lapic = { 1139static struct sys_device device_lapic = {
648 .id = 0, 1140 .id = 0,
649 .cls = &lapic_sysclass, 1141 .cls = &lapic_sysclass,
650}; 1142};
651 1143
652static void __cpuinit apic_pm_activate(void) 1144static void __cpuinit apic_pm_activate(void)
@@ -657,9 +1149,11 @@ static void __cpuinit apic_pm_activate(void)
657static int __init init_lapic_sysfs(void) 1149static int __init init_lapic_sysfs(void)
658{ 1150{
659 int error; 1151 int error;
1152
660 if (!cpu_has_apic) 1153 if (!cpu_has_apic)
661 return 0; 1154 return 0;
662 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ 1155 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
1156
663 error = sysdev_class_register(&lapic_sysclass); 1157 error = sysdev_class_register(&lapic_sysclass);
664 if (!error) 1158 if (!error)
665 error = sysdev_register(&device_lapic); 1159 error = sysdev_register(&device_lapic);
@@ -673,423 +1167,6 @@ static void apic_pm_activate(void) { }
673 1167
674#endif /* CONFIG_PM */ 1168#endif /* CONFIG_PM */
675 1169
676static int __init apic_set_verbosity(char *str)
677{
678 if (str == NULL) {
679 skip_ioapic_setup = 0;
680 ioapic_force = 1;
681 return 0;
682 }
683 if (strcmp("debug", str) == 0)
684 apic_verbosity = APIC_DEBUG;
685 else if (strcmp("verbose", str) == 0)
686 apic_verbosity = APIC_VERBOSE;
687 else {
688 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
689 " use apic=verbose or apic=debug\n", str);
690 return -EINVAL;
691 }
692
693 return 0;
694}
695early_param("apic", apic_set_verbosity);
696
697/*
698 * Detect and enable local APICs on non-SMP boards.
699 * Original code written by Keir Fraser.
700 * On AMD64 we trust the BIOS - if it says no APIC it is likely
701 * not correctly set up (usually the APIC timer won't work etc.)
702 */
703
704static int __init detect_init_APIC (void)
705{
706 if (!cpu_has_apic) {
707 printk(KERN_INFO "No local APIC present\n");
708 return -1;
709 }
710
711 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
712 boot_cpu_id = 0;
713 return 0;
714}
715
716#ifdef CONFIG_X86_IO_APIC
717static struct resource * __init ioapic_setup_resources(void)
718{
719#define IOAPIC_RESOURCE_NAME_SIZE 11
720 unsigned long n;
721 struct resource *res;
722 char *mem;
723 int i;
724
725 if (nr_ioapics <= 0)
726 return NULL;
727
728 n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
729 n *= nr_ioapics;
730
731 mem = alloc_bootmem(n);
732 res = (void *)mem;
733
734 if (mem != NULL) {
735 memset(mem, 0, n);
736 mem += sizeof(struct resource) * nr_ioapics;
737
738 for (i = 0; i < nr_ioapics; i++) {
739 res[i].name = mem;
740 res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
741 sprintf(mem, "IOAPIC %u", i);
742 mem += IOAPIC_RESOURCE_NAME_SIZE;
743 }
744 }
745
746 ioapic_resources = res;
747
748 return res;
749}
750
751static int __init ioapic_insert_resources(void)
752{
753 int i;
754 struct resource *r = ioapic_resources;
755
756 if (!r) {
757 printk("IO APIC resources could be not be allocated.\n");
758 return -1;
759 }
760
761 for (i = 0; i < nr_ioapics; i++) {
762 insert_resource(&iomem_resource, r);
763 r++;
764 }
765
766 return 0;
767}
768
769/* Insert the IO APIC resources after PCI initialization has occured to handle
770 * IO APICS that are mapped in on a BAR in PCI space. */
771late_initcall(ioapic_insert_resources);
772#endif
773
774void __init init_apic_mappings(void)
775{
776 unsigned long apic_phys;
777
778 /*
779 * If no local APIC can be found then set up a fake all
780 * zeroes page to simulate the local APIC and another
781 * one for the IO-APIC.
782 */
783 if (!smp_found_config && detect_init_APIC()) {
784 apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
785 apic_phys = __pa(apic_phys);
786 } else
787 apic_phys = mp_lapic_addr;
788
789 set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
790 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
791 APIC_BASE, apic_phys);
792
793 /* Put local APIC into the resource map. */
794 lapic_resource.start = apic_phys;
795 lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
796 insert_resource(&iomem_resource, &lapic_resource);
797
798 /*
799 * Fetch the APIC ID of the BSP in case we have a
800 * default configuration (or the MP table is broken).
801 */
802 boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
803
804 {
805 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
806 int i;
807 struct resource *ioapic_res;
808
809 ioapic_res = ioapic_setup_resources();
810 for (i = 0; i < nr_ioapics; i++) {
811 if (smp_found_config) {
812 ioapic_phys = mp_ioapics[i].mpc_apicaddr;
813 } else {
814 ioapic_phys = (unsigned long)
815 alloc_bootmem_pages(PAGE_SIZE);
816 ioapic_phys = __pa(ioapic_phys);
817 }
818 set_fixmap_nocache(idx, ioapic_phys);
819 apic_printk(APIC_VERBOSE,
820 "mapped IOAPIC to %016lx (%016lx)\n",
821 __fix_to_virt(idx), ioapic_phys);
822 idx++;
823
824 if (ioapic_res != NULL) {
825 ioapic_res->start = ioapic_phys;
826 ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
827 ioapic_res++;
828 }
829 }
830 }
831}
832
833/*
834 * This function sets up the local APIC timer, with a timeout of
835 * 'clocks' APIC bus clock. During calibration we actually call
836 * this function twice on the boot CPU, once with a bogus timeout
837 * value, second time for real. The other (noncalibrating) CPUs
838 * call this function only once, with the real, calibrated value.
839 *
840 * We do reads before writes even if unnecessary, to get around the
841 * P5 APIC double write bug.
842 */
843
844static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
845{
846 unsigned int lvtt_value, tmp_value;
847
848 lvtt_value = LOCAL_TIMER_VECTOR;
849 if (!oneshot)
850 lvtt_value |= APIC_LVT_TIMER_PERIODIC;
851 if (!irqen)
852 lvtt_value |= APIC_LVT_MASKED;
853
854 apic_write(APIC_LVTT, lvtt_value);
855
856 /*
857 * Divide PICLK by 16
858 */
859 tmp_value = apic_read(APIC_TDCR);
860 apic_write(APIC_TDCR, (tmp_value
861 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
862 | APIC_TDR_DIV_16);
863
864 if (!oneshot)
865 apic_write(APIC_TMICT, clocks);
866}
867
868static void setup_APIC_timer(void)
869{
870 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
871
872 memcpy(levt, &lapic_clockevent, sizeof(*levt));
873 levt->cpumask = cpumask_of_cpu(smp_processor_id());
874
875 clockevents_register_device(levt);
876}
877
878/*
879 * In this function we calibrate APIC bus clocks to the external
880 * timer. Unfortunately we cannot use jiffies and the timer irq
881 * to calibrate, since some later bootup code depends on getting
882 * the first irq? Ugh.
883 *
884 * We want to do the calibration only once since we
885 * want to have local timer irqs syncron. CPUs connected
886 * by the same APIC bus have the very same bus frequency.
887 * And we want to have irqs off anyways, no accidental
888 * APIC irq that way.
889 */
890
891#define TICK_COUNT 100000000
892
893static void __init calibrate_APIC_clock(void)
894{
895 unsigned apic, apic_start;
896 unsigned long tsc, tsc_start;
897 int result;
898
899 local_irq_disable();
900
901 /*
902 * Put whatever arbitrary (but long enough) timeout
903 * value into the APIC clock, we just want to get the
904 * counter running for calibration.
905 *
906 * No interrupt enable !
907 */
908 __setup_APIC_LVTT(250000000, 0, 0);
909
910 apic_start = apic_read(APIC_TMCCT);
911#ifdef CONFIG_X86_PM_TIMER
912 if (apic_calibrate_pmtmr && pmtmr_ioport) {
913 pmtimer_wait(5000); /* 5ms wait */
914 apic = apic_read(APIC_TMCCT);
915 result = (apic_start - apic) * 1000L / 5;
916 } else
917#endif
918 {
919 rdtscll(tsc_start);
920
921 do {
922 apic = apic_read(APIC_TMCCT);
923 rdtscll(tsc);
924 } while ((tsc - tsc_start) < TICK_COUNT &&
925 (apic_start - apic) < TICK_COUNT);
926
927 result = (apic_start - apic) * 1000L * tsc_khz /
928 (tsc - tsc_start);
929 }
930
931 local_irq_enable();
932
933 printk(KERN_DEBUG "APIC timer calibration result %d\n", result);
934
935 printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n",
936 result / 1000 / 1000, result / 1000 % 1000);
937
938 /* Calculate the scaled math multiplication factor */
939 lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC, 32);
940 lapic_clockevent.max_delta_ns =
941 clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
942 lapic_clockevent.min_delta_ns =
943 clockevent_delta2ns(0xF, &lapic_clockevent);
944
945 calibration_result = result / HZ;
946}
947
948void __init setup_boot_APIC_clock (void)
949{
950 /*
951 * The local apic timer can be disabled via the kernel commandline.
952 * Register the lapic timer as a dummy clock event source on SMP
953 * systems, so the broadcast mechanism is used. On UP systems simply
954 * ignore it.
955 */
956 if (disable_apic_timer) {
957 printk(KERN_INFO "Disabling APIC timer\n");
958 /* No broadcast on UP ! */
959 if (num_possible_cpus() > 1)
960 setup_APIC_timer();
961 return;
962 }
963
964 printk(KERN_INFO "Using local APIC timer interrupts.\n");
965 calibrate_APIC_clock();
966
967 /*
968 * If nmi_watchdog is set to IO_APIC, we need the
969 * PIT/HPET going. Otherwise register lapic as a dummy
970 * device.
971 */
972 if (nmi_watchdog != NMI_IO_APIC)
973 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
974 else
975 printk(KERN_WARNING "APIC timer registered as dummy,"
976 " due to nmi_watchdog=1!\n");
977
978 setup_APIC_timer();
979}
980
981/*
982 * AMD C1E enabled CPUs have a real nasty problem: Some BIOSes set the
983 * C1E flag only in the secondary CPU, so when we detect the wreckage
984 * we already have enabled the boot CPU local apic timer. Check, if
985 * disable_apic_timer is set and the DUMMY flag is cleared. If yes,
986 * set the DUMMY flag again and force the broadcast mode in the
987 * clockevents layer.
988 */
989void __cpuinit check_boot_apic_timer_broadcast(void)
990{
991 if (!disable_apic_timer ||
992 (lapic_clockevent.features & CLOCK_EVT_FEAT_DUMMY))
993 return;
994
995 printk(KERN_INFO "AMD C1E detected late. Force timer broadcast.\n");
996 lapic_clockevent.features |= CLOCK_EVT_FEAT_DUMMY;
997
998 local_irq_enable();
999 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, &boot_cpu_id);
1000 local_irq_disable();
1001}
1002
1003void __cpuinit setup_secondary_APIC_clock(void)
1004{
1005 check_boot_apic_timer_broadcast();
1006 setup_APIC_timer();
1007}
1008
1009int setup_profiling_timer(unsigned int multiplier)
1010{
1011 return -EINVAL;
1012}
1013
1014void setup_APIC_extended_lvt(unsigned char lvt_off, unsigned char vector,
1015 unsigned char msg_type, unsigned char mask)
1016{
1017 unsigned long reg = (lvt_off << 4) + K8_APIC_EXT_LVT_BASE;
1018 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
1019 apic_write(reg, v);
1020}
1021
1022/*
1023 * Local timer interrupt handler. It does both profiling and
1024 * process statistics/rescheduling.
1025 *
1026 * We do profiling in every local tick, statistics/rescheduling
1027 * happen only every 'profiling multiplier' ticks. The default
1028 * multiplier is 1 and it can be changed by writing the new multiplier
1029 * value into /proc/profile.
1030 */
1031
1032void smp_local_timer_interrupt(void)
1033{
1034 int cpu = smp_processor_id();
1035 struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
1036
1037 /*
1038 * Normally we should not be here till LAPIC has been initialized but
1039 * in some cases like kdump, its possible that there is a pending LAPIC
1040 * timer interrupt from previous kernel's context and is delivered in
1041 * new kernel the moment interrupts are enabled.
1042 *
1043 * Interrupts are enabled early and LAPIC is setup much later, hence
1044 * its possible that when we get here evt->event_handler is NULL.
1045 * Check for event_handler being NULL and discard the interrupt as
1046 * spurious.
1047 */
1048 if (!evt->event_handler) {
1049 printk(KERN_WARNING
1050 "Spurious LAPIC timer interrupt on cpu %d\n", cpu);
1051 /* Switch it off */
1052 lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
1053 return;
1054 }
1055
1056 /*
1057 * the NMI deadlock-detector uses this.
1058 */
1059 add_pda(apic_timer_irqs, 1);
1060
1061 evt->event_handler(evt);
1062}
1063
1064/*
1065 * Local APIC timer interrupt. This is the most natural way for doing
1066 * local interrupts, but local timer interrupts can be emulated by
1067 * broadcast interrupts too. [in case the hw doesn't support APIC timers]
1068 *
1069 * [ if a single-CPU system runs an SMP kernel then we call the local
1070 * interrupt as well. Thus we cannot inline the local irq ... ]
1071 */
1072void smp_apic_timer_interrupt(struct pt_regs *regs)
1073{
1074 struct pt_regs *old_regs = set_irq_regs(regs);
1075
1076 /*
1077 * NOTE! We'd better ACK the irq immediately,
1078 * because timer handling can be slow.
1079 */
1080 ack_APIC_irq();
1081 /*
1082 * update_process_times() expects us to have done irq_enter().
1083 * Besides, if we don't timer interrupts ignore the global
1084 * interrupt lock, which is the WrongThing (tm) to do.
1085 */
1086 exit_idle();
1087 irq_enter();
1088 smp_local_timer_interrupt();
1089 irq_exit();
1090 set_irq_regs(old_regs);
1091}
1092
1093/* 1170/*
1094 * apic_is_clustered_box() -- Check if we can expect good TSC 1171 * apic_is_clustered_box() -- Check if we can expect good TSC
1095 * 1172 *
@@ -1103,21 +1180,34 @@ __cpuinit int apic_is_clustered_box(void)
1103{ 1180{
1104 int i, clusters, zeros; 1181 int i, clusters, zeros;
1105 unsigned id; 1182 unsigned id;
1183 u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
1106 DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); 1184 DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
1107 1185
1108 bitmap_zero(clustermap, NUM_APIC_CLUSTERS); 1186 bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
1109 1187
1110 for (i = 0; i < NR_CPUS; i++) { 1188 for (i = 0; i < NR_CPUS; i++) {
1111 id = bios_cpu_apicid[i]; 1189 /* are we being called early in kernel startup? */
1190 if (bios_cpu_apicid) {
1191 id = bios_cpu_apicid[i];
1192 }
1193 else if (i < nr_cpu_ids) {
1194 if (cpu_present(i))
1195 id = per_cpu(x86_bios_cpu_apicid, i);
1196 else
1197 continue;
1198 }
1199 else
1200 break;
1201
1112 if (id != BAD_APICID) 1202 if (id != BAD_APICID)
1113 __set_bit(APIC_CLUSTERID(id), clustermap); 1203 __set_bit(APIC_CLUSTERID(id), clustermap);
1114 } 1204 }
1115 1205
1116 /* Problem: Partially populated chassis may not have CPUs in some of 1206 /* Problem: Partially populated chassis may not have CPUs in some of
1117 * the APIC clusters they have been allocated. Only present CPUs have 1207 * the APIC clusters they have been allocated. Only present CPUs have
1118 * bios_cpu_apicid entries, thus causing zeroes in the bitmap. Since 1208 * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap.
1119 * clusters are allocated sequentially, count zeros only if they are 1209 * Since clusters are allocated sequentially, count zeros only if
1120 * bounded by ones. 1210 * they are bounded by ones.
1121 */ 1211 */
1122 clusters = 0; 1212 clusters = 0;
1123 zeros = 0; 1213 zeros = 0;
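The loop above records which APIC clusters are populated. Assuming APIC_CLUSTERID() takes the upper nibble of the 8-bit APIC ID (which is what the NUM_APIC_CLUSTERS-sized bitmap suggests), the bookkeeping reduces to setting one bit per observed cluster; the IDs below are invented examples:

        #include <stdio.h>

        int main(void)
        {
                unsigned long clustermap = 0;   /* stand-in for the DECLARE_BITMAP() above */
                const unsigned char ids[] = { 0x00, 0x01, 0x10, 0x11, 0x21 };
                unsigned int i;

                for (i = 0; i < sizeof(ids); i++)
                        clustermap |= 1UL << (ids[i] >> 4);     /* APIC_CLUSTERID(id) */

                printf("cluster bitmap: 0x%lx\n", clustermap);  /* 0x7 -> clusters 0, 1, 2 */
                return 0;
        }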
@@ -1138,96 +1228,33 @@ __cpuinit int apic_is_clustered_box(void)
1138} 1228}
1139 1229
1140/* 1230/*
1141 * This interrupt should _never_ happen with our APIC/SMP architecture 1231 * APIC command line parameters
1142 */
1143asmlinkage void smp_spurious_interrupt(void)
1144{
1145 unsigned int v;
1146 exit_idle();
1147 irq_enter();
1148 /*
1149 * Check if this really is a spurious interrupt and ACK it
1150 * if it is a vectored one. Just in case...
1151 * Spurious interrupts should not be ACKed.
1152 */
1153 v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
1154 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
1155 ack_APIC_irq();
1156
1157 add_pda(irq_spurious_count, 1);
1158 irq_exit();
1159}
1160
1161/*
1162 * This interrupt should never happen with our APIC/SMP architecture
1163 */ 1232 */
1164 1233static int __init apic_set_verbosity(char *str)
1165asmlinkage void smp_error_interrupt(void)
1166{
1167 unsigned int v, v1;
1168
1169 exit_idle();
1170 irq_enter();
1171 /* First tickle the hardware, only then report what went on. -- REW */
1172 v = apic_read(APIC_ESR);
1173 apic_write(APIC_ESR, 0);
1174 v1 = apic_read(APIC_ESR);
1175 ack_APIC_irq();
1176 atomic_inc(&irq_err_count);
1177
1178 /* Here is what the APIC error bits mean:
1179 0: Send CS error
1180 1: Receive CS error
1181 2: Send accept error
1182 3: Receive accept error
1183 4: Reserved
1184 5: Send illegal vector
1185 6: Received illegal vector
1186 7: Illegal register address
1187 */
1188 printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
1189 smp_processor_id(), v , v1);
1190 irq_exit();
1191}
1192
1193int disable_apic;
1194
1195/*
1196 * This initializes the IO-APIC and APIC hardware if this is
1197 * a UP kernel.
1198 */
1199int __init APIC_init_uniprocessor (void)
1200{ 1234{
1201 if (disable_apic) { 1235 if (str == NULL) {
1202 printk(KERN_INFO "Apic disabled\n"); 1236 skip_ioapic_setup = 0;
1203 return -1; 1237 ioapic_force = 1;
1238 return 0;
1204 } 1239 }
1205 if (!cpu_has_apic) { 1240 if (strcmp("debug", str) == 0)
1206 disable_apic = 1; 1241 apic_verbosity = APIC_DEBUG;
1207 printk(KERN_INFO "Apic disabled by BIOS\n"); 1242 else if (strcmp("verbose", str) == 0)
1208 return -1; 1243 apic_verbosity = APIC_VERBOSE;
1244 else {
1245 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
1246 " use apic=verbose or apic=debug\n", str);
1247 return -EINVAL;
1209 } 1248 }
1210 1249
1211 verify_local_APIC();
1212
1213 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
1214 apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id));
1215
1216 setup_local_APIC();
1217
1218 if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
1219 setup_IO_APIC();
1220 else
1221 nr_ioapics = 0;
1222 setup_boot_APIC_clock();
1223 check_nmi_watchdog();
1224 return 0; 1250 return 0;
1225} 1251}
1252early_param("apic", apic_set_verbosity);
1226 1253
1227static __init int setup_disableapic(char *str) 1254static __init int setup_disableapic(char *str)
1228{ 1255{
1229 disable_apic = 1; 1256 disable_apic = 1;
1230 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); 1257 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1231 return 0; 1258 return 0;
1232} 1259}
1233early_param("disableapic", setup_disableapic); 1260early_param("disableapic", setup_disableapic);
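Taken together, the parameter handling collected in this block gives three boot-time switches: with no argument (str == NULL, which corresponds to passing "apic" with no value) the handler clears skip_ioapic_setup and sets ioapic_force, "apic=verbose" and "apic=debug" raise apic_verbosity (anything else only warns), and "disableapic" turns the local APIC off and clears the CPU feature bit. For example, a command line such as

        console=ttyS0 apic=verbose

(the console= part is an unrelated, hypothetical option) selects the verbose APIC log level.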
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index af045ca0f653..d4438ef296d8 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -227,6 +227,7 @@
227#include <linux/dmi.h> 227#include <linux/dmi.h>
228#include <linux/suspend.h> 228#include <linux/suspend.h>
229#include <linux/kthread.h> 229#include <linux/kthread.h>
230#include <linux/jiffies.h>
230 231
231#include <asm/system.h> 232#include <asm/system.h>
232#include <asm/uaccess.h> 233#include <asm/uaccess.h>
@@ -235,8 +236,6 @@
235#include <asm/paravirt.h> 236#include <asm/paravirt.h>
236#include <asm/reboot.h> 237#include <asm/reboot.h>
237 238
238#include "io_ports.h"
239
240#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) 239#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
241extern int (*console_blank_hook)(int); 240extern int (*console_blank_hook)(int);
242#endif 241#endif
@@ -324,7 +323,7 @@ extern int (*console_blank_hook)(int);
324/* 323/*
325 * Ignore suspend events for this amount of time after a resume 324 * Ignore suspend events for this amount of time after a resume
326 */ 325 */
327#define DEFAULT_BOUNCE_INTERVAL (3 * HZ) 326#define DEFAULT_BOUNCE_INTERVAL (3 * HZ)
328 327
329/* 328/*
330 * Maximum number of events stored 329 * Maximum number of events stored
@@ -336,7 +335,7 @@ extern int (*console_blank_hook)(int);
336 */ 335 */
337struct apm_user { 336struct apm_user {
338 int magic; 337 int magic;
339 struct apm_user * next; 338 struct apm_user *next;
340 unsigned int suser: 1; 339 unsigned int suser: 1;
341 unsigned int writer: 1; 340 unsigned int writer: 1;
342 unsigned int reader: 1; 341 unsigned int reader: 1;
@@ -372,44 +371,44 @@ struct apm_user {
372static struct { 371static struct {
373 unsigned long offset; 372 unsigned long offset;
374 unsigned short segment; 373 unsigned short segment;
375} apm_bios_entry; 374} apm_bios_entry;
376static int clock_slowed; 375static int clock_slowed;
377static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD; 376static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD;
378static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD; 377static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD;
379static int set_pm_idle; 378static int set_pm_idle;
380static int suspends_pending; 379static int suspends_pending;
381static int standbys_pending; 380static int standbys_pending;
382static int ignore_sys_suspend; 381static int ignore_sys_suspend;
383static int ignore_normal_resume; 382static int ignore_normal_resume;
384static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL; 383static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL;
385 384
386static int debug __read_mostly; 385static int debug __read_mostly;
387static int smp __read_mostly; 386static int smp __read_mostly;
388static int apm_disabled = -1; 387static int apm_disabled = -1;
389#ifdef CONFIG_SMP 388#ifdef CONFIG_SMP
390static int power_off; 389static int power_off;
391#else 390#else
392static int power_off = 1; 391static int power_off = 1;
393#endif 392#endif
394#ifdef CONFIG_APM_REAL_MODE_POWER_OFF 393#ifdef CONFIG_APM_REAL_MODE_POWER_OFF
395static int realmode_power_off = 1; 394static int realmode_power_off = 1;
396#else 395#else
397static int realmode_power_off; 396static int realmode_power_off;
398#endif 397#endif
399#ifdef CONFIG_APM_ALLOW_INTS 398#ifdef CONFIG_APM_ALLOW_INTS
400static int allow_ints = 1; 399static int allow_ints = 1;
401#else 400#else
402static int allow_ints; 401static int allow_ints;
403#endif 402#endif
404static int broken_psr; 403static int broken_psr;
405 404
406static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); 405static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
407static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); 406static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
408static struct apm_user * user_list; 407static struct apm_user *user_list;
409static DEFINE_SPINLOCK(user_list_lock); 408static DEFINE_SPINLOCK(user_list_lock);
410static const struct desc_struct bad_bios_desc = { 0, 0x00409200 }; 409static const struct desc_struct bad_bios_desc = { { { 0, 0x00409200 } } };
411 410
412static const char driver_version[] = "1.16ac"; /* no spaces */ 411static const char driver_version[] = "1.16ac"; /* no spaces */
413 412
414static struct task_struct *kapmd_task; 413static struct task_struct *kapmd_task;
415 414
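Apart from the pointer-style cleanups, the notable change in this hunk is the extra braces on the bad_bios_desc initializer. That presumably tracks the unified desc_struct, which wraps its two 32-bit words in an anonymous union of structs; a sketch of the assumed layout (the real definition lives in the x86 headers and is not part of this diff):

        /* assumed shape of the unified descriptor type; illustration only */
        struct desc_struct {
                union {
                        struct {
                                unsigned int a;
                                unsigned int b;
                        };
                        /* bit-field view of base/limit/type omitted */
                };
        };

        static const struct desc_struct bad_bios_desc = { { { 0, 0x00409200 } } };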
@@ -417,7 +416,7 @@ static struct task_struct *kapmd_task;
417 * APM event names taken from the APM 1.2 specification. These are 416 * APM event names taken from the APM 1.2 specification. These are
418 * the message codes that the BIOS uses to tell us about events 417 * the message codes that the BIOS uses to tell us about events
419 */ 418 */
420static const char * const apm_event_name[] = { 419static const char * const apm_event_name[] = {
421 "system standby", 420 "system standby",
422 "system suspend", 421 "system suspend",
423 "normal resume", 422 "normal resume",
@@ -435,14 +434,14 @@ static const char * const apm_event_name[] = {
435 434
436typedef struct lookup_t { 435typedef struct lookup_t {
437 int key; 436 int key;
438 char * msg; 437 char *msg;
439} lookup_t; 438} lookup_t;
440 439
441/* 440/*
442 * The BIOS returns a set of standard error codes in AX when the 441 * The BIOS returns a set of standard error codes in AX when the
443 * carry flag is set. 442 * carry flag is set.
444 */ 443 */
445 444
446static const lookup_t error_table[] = { 445static const lookup_t error_table[] = {
447/* N/A { APM_SUCCESS, "Operation succeeded" }, */ 446/* N/A { APM_SUCCESS, "Operation succeeded" }, */
448 { APM_DISABLED, "Power management disabled" }, 447 { APM_DISABLED, "Power management disabled" },
@@ -472,24 +471,25 @@ static const lookup_t error_table[] = {
472 * Write a meaningful log entry to the kernel log in the event of 471 * Write a meaningful log entry to the kernel log in the event of
473 * an APM error. 472 * an APM error.
474 */ 473 */
475 474
476static void apm_error(char *str, int err) 475static void apm_error(char *str, int err)
477{ 476{
478 int i; 477 int i;
479 478
480 for (i = 0; i < ERROR_COUNT; i++) 479 for (i = 0; i < ERROR_COUNT; i++)
481 if (error_table[i].key == err) break; 480 if (error_table[i].key == err)
481 break;
482 if (i < ERROR_COUNT) 482 if (i < ERROR_COUNT)
483 printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg); 483 printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg);
484 else 484 else
485 printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n", 485 printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n",
486 str, err); 486 str, err);
487} 487}
488 488
489/* 489/*
490 * Lock APM functionality to physical CPU 0 490 * Lock APM functionality to physical CPU 0
491 */ 491 */
492 492
493#ifdef CONFIG_SMP 493#ifdef CONFIG_SMP
494 494
495static cpumask_t apm_save_cpus(void) 495static cpumask_t apm_save_cpus(void)
@@ -511,7 +511,7 @@ static inline void apm_restore_cpus(cpumask_t mask)
511/* 511/*
512 * No CPU lockdown needed on a uniprocessor 512 * No CPU lockdown needed on a uniprocessor
513 */ 513 */
514 514
515#define apm_save_cpus() (current->cpus_allowed) 515#define apm_save_cpus() (current->cpus_allowed)
516#define apm_restore_cpus(x) (void)(x) 516#define apm_restore_cpus(x) (void)(x)
517 517
@@ -590,7 +590,7 @@ static inline void apm_irq_restore(unsigned long flags)
590 * code is returned in AH (bits 8-15 of eax) and this function 590 * code is returned in AH (bits 8-15 of eax) and this function
591 * returns non-zero. 591 * returns non-zero.
592 */ 592 */
593 593
594static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in, 594static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in,
595 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, u32 *esi) 595 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, u32 *esi)
596{ 596{
@@ -602,7 +602,7 @@ static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in,
602 struct desc_struct *gdt; 602 struct desc_struct *gdt;
603 603
604 cpus = apm_save_cpus(); 604 cpus = apm_save_cpus();
605 605
606 cpu = get_cpu(); 606 cpu = get_cpu();
607 gdt = get_cpu_gdt_table(cpu); 607 gdt = get_cpu_gdt_table(cpu);
608 save_desc_40 = gdt[0x40 / 8]; 608 save_desc_40 = gdt[0x40 / 8];
@@ -616,7 +616,7 @@ static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in,
616 gdt[0x40 / 8] = save_desc_40; 616 gdt[0x40 / 8] = save_desc_40;
617 put_cpu(); 617 put_cpu();
618 apm_restore_cpus(cpus); 618 apm_restore_cpus(cpus);
619 619
620 return *eax & 0xff; 620 return *eax & 0xff;
621} 621}
622 622
@@ -645,7 +645,7 @@ static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax)
645 struct desc_struct *gdt; 645 struct desc_struct *gdt;
646 646
647 cpus = apm_save_cpus(); 647 cpus = apm_save_cpus();
648 648
649 cpu = get_cpu(); 649 cpu = get_cpu();
650 gdt = get_cpu_gdt_table(cpu); 650 gdt = get_cpu_gdt_table(cpu);
651 save_desc_40 = gdt[0x40 / 8]; 651 save_desc_40 = gdt[0x40 / 8];
@@ -680,7 +680,7 @@ static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax)
680 680
681static int apm_driver_version(u_short *val) 681static int apm_driver_version(u_short *val)
682{ 682{
683 u32 eax; 683 u32 eax;
684 684
685 if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax)) 685 if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax))
686 return (eax >> 8) & 0xff; 686 return (eax >> 8) & 0xff;
@@ -704,16 +704,16 @@ static int apm_driver_version(u_short *val)
704 * that APM 1.2 is in use. If no messges are pending the value 0x80 704 * that APM 1.2 is in use. If no messges are pending the value 0x80
705 * is returned (No power management events pending). 705 * is returned (No power management events pending).
706 */ 706 */
707 707
708static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info) 708static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info)
709{ 709{
710 u32 eax; 710 u32 eax;
711 u32 ebx; 711 u32 ebx;
712 u32 ecx; 712 u32 ecx;
713 u32 dummy; 713 u32 dummy;
714 714
715 if (apm_bios_call(APM_FUNC_GET_EVENT, 0, 0, &eax, &ebx, &ecx, 715 if (apm_bios_call(APM_FUNC_GET_EVENT, 0, 0, &eax, &ebx, &ecx,
716 &dummy, &dummy)) 716 &dummy, &dummy))
717 return (eax >> 8) & 0xff; 717 return (eax >> 8) & 0xff;
718 *event = ebx; 718 *event = ebx;
719 if (apm_info.connection_version < 0x0102) 719 if (apm_info.connection_version < 0x0102)
@@ -736,10 +736,10 @@ static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info)
736 * The state holds the state to transition to, which may in fact 736 * The state holds the state to transition to, which may in fact
737 * be an acceptance of a BIOS requested state change. 737 * be an acceptance of a BIOS requested state change.
738 */ 738 */
739 739
740static int set_power_state(u_short what, u_short state) 740static int set_power_state(u_short what, u_short state)
741{ 741{
742 u32 eax; 742 u32 eax;
743 743
744 if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax)) 744 if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax))
745 return (eax >> 8) & 0xff; 745 return (eax >> 8) & 0xff;
@@ -752,7 +752,7 @@ static int set_power_state(u_short what, u_short state)
752 * 752 *
753 * Transition the entire system into a new APM power state. 753 * Transition the entire system into a new APM power state.
754 */ 754 */
755 755
756static int set_system_power_state(u_short state) 756static int set_system_power_state(u_short state)
757{ 757{
758 return set_power_state(APM_DEVICE_ALL, state); 758 return set_power_state(APM_DEVICE_ALL, state);
@@ -766,13 +766,13 @@ static int set_system_power_state(u_short state)
766 * to handle the idle request. On a success the function returns 1 766 * to handle the idle request. On a success the function returns 1
767 * if the BIOS did clock slowing or 0 otherwise. 767 * if the BIOS did clock slowing or 0 otherwise.
768 */ 768 */
769 769
770static int apm_do_idle(void) 770static int apm_do_idle(void)
771{ 771{
772 u32 eax; 772 u32 eax;
773 u8 ret = 0; 773 u8 ret = 0;
774 int idled = 0; 774 int idled = 0;
775 int polling; 775 int polling;
776 776
777 polling = !!(current_thread_info()->status & TS_POLLING); 777 polling = !!(current_thread_info()->status & TS_POLLING);
778 if (polling) { 778 if (polling) {
@@ -799,10 +799,9 @@ static int apm_do_idle(void)
799 /* This always fails on some SMP boards running UP kernels. 799 /* This always fails on some SMP boards running UP kernels.
800 * Only report the failure the first 5 times. 800 * Only report the failure the first 5 times.
801 */ 801 */
802 if (++t < 5) 802 if (++t < 5) {
803 {
804 printk(KERN_DEBUG "apm_do_idle failed (%d)\n", 803 printk(KERN_DEBUG "apm_do_idle failed (%d)\n",
805 (eax >> 8) & 0xff); 804 (eax >> 8) & 0xff);
806 t = jiffies; 805 t = jiffies;
807 } 806 }
808 return -1; 807 return -1;
@@ -814,15 +813,15 @@ static int apm_do_idle(void)
814/** 813/**
815 * apm_do_busy - inform the BIOS the CPU is busy 814 * apm_do_busy - inform the BIOS the CPU is busy
816 * 815 *
817 * Request that the BIOS brings the CPU back to full performance. 816 * Request that the BIOS brings the CPU back to full performance.
818 */ 817 */
819 818
820static void apm_do_busy(void) 819static void apm_do_busy(void)
821{ 820{
822 u32 dummy; 821 u32 dummy;
823 822
824 if (clock_slowed || ALWAYS_CALL_BUSY) { 823 if (clock_slowed || ALWAYS_CALL_BUSY) {
825 (void) apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy); 824 (void)apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy);
826 clock_slowed = 0; 825 clock_slowed = 0;
827 } 826 }
828} 827}
@@ -833,15 +832,15 @@ static void apm_do_busy(void)
833 * power management - we probably want 832 * power management - we probably want
834 * to conserve power. 833 * to conserve power.
835 */ 834 */
836#define IDLE_CALC_LIMIT (HZ * 100) 835#define IDLE_CALC_LIMIT (HZ * 100)
837#define IDLE_LEAKY_MAX 16 836#define IDLE_LEAKY_MAX 16
838 837
839static void (*original_pm_idle)(void) __read_mostly; 838static void (*original_pm_idle)(void) __read_mostly;
840 839
841/** 840/**
842 * apm_cpu_idle - cpu idling for APM capable Linux 841 * apm_cpu_idle - cpu idling for APM capable Linux
843 * 842 *
844 * This is the idling function the kernel executes when APM is available. It 843 * This is the idling function the kernel executes when APM is available. It
845 * tries to do BIOS powermanagement based on the average system idle time. 844 * tries to do BIOS powermanagement based on the average system idle time.
846 * Furthermore it calls the system default idle routine. 845 * Furthermore it calls the system default idle routine.
847 */ 846 */
@@ -882,7 +881,8 @@ recalc:
882 881
883 t = jiffies; 882 t = jiffies;
884 switch (apm_do_idle()) { 883 switch (apm_do_idle()) {
885 case 0: apm_idle_done = 1; 884 case 0:
885 apm_idle_done = 1;
886 if (t != jiffies) { 886 if (t != jiffies) {
887 if (bucket) { 887 if (bucket) {
888 bucket = IDLE_LEAKY_MAX; 888 bucket = IDLE_LEAKY_MAX;
@@ -893,7 +893,8 @@ recalc:
893 continue; 893 continue;
894 } 894 }
895 break; 895 break;
896 case 1: apm_idle_done = 1; 896 case 1:
897 apm_idle_done = 1;
897 break; 898 break;
898 default: /* BIOS refused */ 899 default: /* BIOS refused */
899 break; 900 break;
@@ -921,10 +922,10 @@ recalc:
921 * the SMP call on CPU0 as some systems will only honour this call 922 * the SMP call on CPU0 as some systems will only honour this call
922 * on their first cpu. 923 * on their first cpu.
923 */ 924 */
924 925
925static void apm_power_off(void) 926static void apm_power_off(void)
926{ 927{
927 unsigned char po_bios_call[] = { 928 unsigned char po_bios_call[] = {
928 0xb8, 0x00, 0x10, /* movw $0x1000,ax */ 929 0xb8, 0x00, 0x10, /* movw $0x1000,ax */
929 0x8e, 0xd0, /* movw ax,ss */ 930 0x8e, 0xd0, /* movw ax,ss */
930 0xbc, 0x00, 0xf0, /* movw $0xf000,sp */ 931 0xbc, 0x00, 0xf0, /* movw $0xf000,sp */
@@ -935,13 +936,12 @@ static void apm_power_off(void)
935 }; 936 };
936 937
937 /* Some bioses don't like being called from CPU != 0 */ 938 /* Some bioses don't like being called from CPU != 0 */
938 if (apm_info.realmode_power_off) 939 if (apm_info.realmode_power_off) {
939 {
940 (void)apm_save_cpus(); 940 (void)apm_save_cpus();
941 machine_real_restart(po_bios_call, sizeof(po_bios_call)); 941 machine_real_restart(po_bios_call, sizeof(po_bios_call));
942 } else {
943 (void)set_system_power_state(APM_STATE_OFF);
942 } 944 }
943 else
944 (void) set_system_power_state(APM_STATE_OFF);
945} 945}
946 946
947#ifdef CONFIG_APM_DO_ENABLE 947#ifdef CONFIG_APM_DO_ENABLE
@@ -950,17 +950,17 @@ static void apm_power_off(void)
950 * apm_enable_power_management - enable BIOS APM power management 950 * apm_enable_power_management - enable BIOS APM power management
951 * @enable: enable yes/no 951 * @enable: enable yes/no
952 * 952 *
953 * Enable or disable the APM BIOS power services. 953 * Enable or disable the APM BIOS power services.
954 */ 954 */
955 955
956static int apm_enable_power_management(int enable) 956static int apm_enable_power_management(int enable)
957{ 957{
958 u32 eax; 958 u32 eax;
959 959
960 if ((enable == 0) && (apm_info.bios.flags & APM_BIOS_DISENGAGED)) 960 if ((enable == 0) && (apm_info.bios.flags & APM_BIOS_DISENGAGED))
961 return APM_NOT_ENGAGED; 961 return APM_NOT_ENGAGED;
962 if (apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL, 962 if (apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL,
963 enable, &eax)) 963 enable, &eax))
964 return (eax >> 8) & 0xff; 964 return (eax >> 8) & 0xff;
965 if (enable) 965 if (enable)
966 apm_info.bios.flags &= ~APM_BIOS_DISABLED; 966 apm_info.bios.flags &= ~APM_BIOS_DISABLED;
@@ -983,19 +983,19 @@ static int apm_enable_power_management(int enable)
983 * if reported is a lifetime in secodnds/minutes at current powwer 983 * if reported is a lifetime in secodnds/minutes at current powwer
984 * consumption. 984 * consumption.
985 */ 985 */
986 986
987static int apm_get_power_status(u_short *status, u_short *bat, u_short *life) 987static int apm_get_power_status(u_short *status, u_short *bat, u_short *life)
988{ 988{
989 u32 eax; 989 u32 eax;
990 u32 ebx; 990 u32 ebx;
991 u32 ecx; 991 u32 ecx;
992 u32 edx; 992 u32 edx;
993 u32 dummy; 993 u32 dummy;
994 994
995 if (apm_info.get_power_status_broken) 995 if (apm_info.get_power_status_broken)
996 return APM_32_UNSUPPORTED; 996 return APM_32_UNSUPPORTED;
997 if (apm_bios_call(APM_FUNC_GET_STATUS, APM_DEVICE_ALL, 0, 997 if (apm_bios_call(APM_FUNC_GET_STATUS, APM_DEVICE_ALL, 0,
998 &eax, &ebx, &ecx, &edx, &dummy)) 998 &eax, &ebx, &ecx, &edx, &dummy))
999 return (eax >> 8) & 0xff; 999 return (eax >> 8) & 0xff;
1000 *status = ebx; 1000 *status = ebx;
1001 *bat = ecx; 1001 *bat = ecx;
@@ -1011,11 +1011,11 @@ static int apm_get_power_status(u_short *status, u_short *bat, u_short *life)
1011static int apm_get_battery_status(u_short which, u_short *status, 1011static int apm_get_battery_status(u_short which, u_short *status,
1012 u_short *bat, u_short *life, u_short *nbat) 1012 u_short *bat, u_short *life, u_short *nbat)
1013{ 1013{
1014 u32 eax; 1014 u32 eax;
1015 u32 ebx; 1015 u32 ebx;
1016 u32 ecx; 1016 u32 ecx;
1017 u32 edx; 1017 u32 edx;
1018 u32 esi; 1018 u32 esi;
1019 1019
1020 if (apm_info.connection_version < 0x0102) { 1020 if (apm_info.connection_version < 0x0102) {
1021 /* pretend we only have one battery. */ 1021 /* pretend we only have one battery. */
@@ -1026,7 +1026,7 @@ static int apm_get_battery_status(u_short which, u_short *status,
1026 } 1026 }
1027 1027
1028 if (apm_bios_call(APM_FUNC_GET_STATUS, (0x8000 | (which)), 0, &eax, 1028 if (apm_bios_call(APM_FUNC_GET_STATUS, (0x8000 | (which)), 0, &eax,
1029 &ebx, &ecx, &edx, &esi)) 1029 &ebx, &ecx, &edx, &esi))
1030 return (eax >> 8) & 0xff; 1030 return (eax >> 8) & 0xff;
1031 *status = ebx; 1031 *status = ebx;
1032 *bat = ecx; 1032 *bat = ecx;
@@ -1044,10 +1044,10 @@ static int apm_get_battery_status(u_short which, u_short *status,
1044 * Activate or deactive power management on either a specific device 1044 * Activate or deactive power management on either a specific device
1045 * or the entire system (%APM_DEVICE_ALL). 1045 * or the entire system (%APM_DEVICE_ALL).
1046 */ 1046 */
1047 1047
1048static int apm_engage_power_management(u_short device, int enable) 1048static int apm_engage_power_management(u_short device, int enable)
1049{ 1049{
1050 u32 eax; 1050 u32 eax;
1051 1051
1052 if ((enable == 0) && (device == APM_DEVICE_ALL) 1052 if ((enable == 0) && (device == APM_DEVICE_ALL)
1053 && (apm_info.bios.flags & APM_BIOS_DISABLED)) 1053 && (apm_info.bios.flags & APM_BIOS_DISABLED))
@@ -1074,7 +1074,7 @@ static int apm_engage_power_management(u_short device, int enable)
1074 * all video devices. Typically the BIOS will do laptop backlight and 1074 * all video devices. Typically the BIOS will do laptop backlight and
1075 * monitor powerdown for us. 1075 * monitor powerdown for us.
1076 */ 1076 */
1077 1077
1078static int apm_console_blank(int blank) 1078static int apm_console_blank(int blank)
1079{ 1079{
1080 int error = APM_NOT_ENGAGED; /* silence gcc */ 1080 int error = APM_NOT_ENGAGED; /* silence gcc */
@@ -1126,7 +1126,7 @@ static apm_event_t get_queued_event(struct apm_user *as)
1126 1126
1127static void queue_event(apm_event_t event, struct apm_user *sender) 1127static void queue_event(apm_event_t event, struct apm_user *sender)
1128{ 1128{
1129 struct apm_user * as; 1129 struct apm_user *as;
1130 1130
1131 spin_lock(&user_list_lock); 1131 spin_lock(&user_list_lock);
1132 if (user_list == NULL) 1132 if (user_list == NULL)
@@ -1174,11 +1174,11 @@ static void reinit_timer(void)
1174 1174
1175 spin_lock_irqsave(&i8253_lock, flags); 1175 spin_lock_irqsave(&i8253_lock, flags);
1176 /* set the clock to HZ */ 1176 /* set the clock to HZ */
1177 outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ 1177 outb_pit(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
1178 udelay(10); 1178 udelay(10);
1179 outb_p(LATCH & 0xff, PIT_CH0); /* LSB */ 1179 outb_pit(LATCH & 0xff, PIT_CH0); /* LSB */
1180 udelay(10); 1180 udelay(10);
1181 outb(LATCH >> 8, PIT_CH0); /* MSB */ 1181 outb_pit(LATCH >> 8, PIT_CH0); /* MSB */
1182 udelay(10); 1182 udelay(10);
1183 spin_unlock_irqrestore(&i8253_lock, flags); 1183 spin_unlock_irqrestore(&i8253_lock, flags);
1184#endif 1184#endif
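reinit_timer() above reprograms PIT channel 0 back to the kernel tick after an APM resume; only the port accessors change (outb_p/outb become outb_pit), the written values stay the same. Control byte 0x34 selects channel 0, LSB-then-MSB access, mode 2 (rate generator), binary counting, and LATCH is the PIT input clock divided by HZ. A worked example, not part of the patch; the 1.193182 MHz input clock is the standard i8253 rate, HZ = 100 is just an example:

        #include <stdio.h>

        int main(void)
        {
                const unsigned long pit_hz = 1193182;           /* i8253 input clock, Hz */
                const unsigned int  hz     = 100;               /* example CONFIG_HZ     */
                unsigned int latch = (pit_hz + hz / 2) / hz;    /* rounded divisor       */

                printf("control byte 0x34, LATCH = %u -> LSB 0x%02x, MSB 0x%02x\n",
                       latch, latch & 0xff, latch >> 8);        /* 11932, 0x9c, 0x2e */
                return 0;
        }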
@@ -1186,7 +1186,7 @@ static void reinit_timer(void)
1186 1186
1187static int suspend(int vetoable) 1187static int suspend(int vetoable)
1188{ 1188{
1189 int err; 1189 int err;
1190 struct apm_user *as; 1190 struct apm_user *as;
1191 1191
1192 if (pm_send_all(PM_SUSPEND, (void *)3)) { 1192 if (pm_send_all(PM_SUSPEND, (void *)3)) {
@@ -1239,7 +1239,7 @@ static int suspend(int vetoable)
1239 1239
1240static void standby(void) 1240static void standby(void)
1241{ 1241{
1242 int err; 1242 int err;
1243 1243
1244 local_irq_disable(); 1244 local_irq_disable();
1245 device_power_down(PMSG_SUSPEND); 1245 device_power_down(PMSG_SUSPEND);
@@ -1256,8 +1256,8 @@ static void standby(void)
1256 1256
1257static apm_event_t get_event(void) 1257static apm_event_t get_event(void)
1258{ 1258{
1259 int error; 1259 int error;
1260 apm_event_t event = APM_NO_EVENTS; /* silence gcc */ 1260 apm_event_t event = APM_NO_EVENTS; /* silence gcc */
1261 apm_eventinfo_t info; 1261 apm_eventinfo_t info;
1262 1262
1263 static int notified; 1263 static int notified;
@@ -1275,9 +1275,9 @@ static apm_event_t get_event(void)
1275 1275
1276static void check_events(void) 1276static void check_events(void)
1277{ 1277{
1278 apm_event_t event; 1278 apm_event_t event;
1279 static unsigned long last_resume; 1279 static unsigned long last_resume;
1280 static int ignore_bounce; 1280 static int ignore_bounce;
1281 1281
1282 while ((event = get_event()) != 0) { 1282 while ((event = get_event()) != 0) {
1283 if (debug) { 1283 if (debug) {
@@ -1289,7 +1289,7 @@ static void check_events(void)
1289 "event 0x%02x\n", event); 1289 "event 0x%02x\n", event);
1290 } 1290 }
1291 if (ignore_bounce 1291 if (ignore_bounce
1292 && ((jiffies - last_resume) > bounce_interval)) 1292 && (time_after(jiffies, last_resume + bounce_interval)))
1293 ignore_bounce = 0; 1293 ignore_bounce = 0;
1294 1294
1295 switch (event) { 1295 switch (event) {
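The bounce-interval test in check_events() above now goes through time_after(), the standard jiffies comparison helper, instead of open-coded subtraction; the ordering is the same, but it is expressed in the idiom that stays obviously correct across jiffies wraparound. A small demonstration, with the macro body mirroring linux/jiffies.h (typecheck() omitted) and the tick values invented:

        #include <stdio.h>

        #define time_after(a, b)        ((long)((b) - (a)) < 0)

        int main(void)
        {
                unsigned long last_resume     = (unsigned long)-100;  /* just before wraparound   */
                unsigned long bounce_interval = 3 * 100;              /* 3 * HZ with HZ = 100     */
                unsigned long jiffies_now     = 250;                  /* shortly after wraparound */

                /* 250 is 350 ticks past last_resume, so the bounce window is over: prints 1 */
                printf("%d\n", time_after(jiffies_now, last_resume + bounce_interval));
                return 0;
        }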
@@ -1357,7 +1357,7 @@ static void check_events(void)
1357 /* 1357 /*
1358 * We are not allowed to reject a critical suspend. 1358 * We are not allowed to reject a critical suspend.
1359 */ 1359 */
1360 (void) suspend(0); 1360 (void)suspend(0);
1361 break; 1361 break;
1362 } 1362 }
1363 } 1363 }
@@ -1365,12 +1365,12 @@ static void check_events(void)
1365 1365
1366static void apm_event_handler(void) 1366static void apm_event_handler(void)
1367{ 1367{
1368 static int pending_count = 4; 1368 static int pending_count = 4;
1369 int err; 1369 int err;
1370 1370
1371 if ((standbys_pending > 0) || (suspends_pending > 0)) { 1371 if ((standbys_pending > 0) || (suspends_pending > 0)) {
1372 if ((apm_info.connection_version > 0x100) && 1372 if ((apm_info.connection_version > 0x100) &&
1373 (pending_count-- <= 0)) { 1373 (pending_count-- <= 0)) {
1374 pending_count = 4; 1374 pending_count = 4;
1375 if (debug) 1375 if (debug)
1376 printk(KERN_DEBUG "apm: setting state busy\n"); 1376 printk(KERN_DEBUG "apm: setting state busy\n");
@@ -1418,9 +1418,9 @@ static int check_apm_user(struct apm_user *as, const char *func)
1418 1418
1419static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos) 1419static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos)
1420{ 1420{
1421 struct apm_user * as; 1421 struct apm_user *as;
1422 int i; 1422 int i;
1423 apm_event_t event; 1423 apm_event_t event;
1424 1424
1425 as = fp->private_data; 1425 as = fp->private_data;
1426 if (check_apm_user(as, "read")) 1426 if (check_apm_user(as, "read"))
@@ -1459,9 +1459,9 @@ static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *
1459 return 0; 1459 return 0;
1460} 1460}
1461 1461
1462static unsigned int do_poll(struct file *fp, poll_table * wait) 1462static unsigned int do_poll(struct file *fp, poll_table *wait)
1463{ 1463{
1464 struct apm_user * as; 1464 struct apm_user *as;
1465 1465
1466 as = fp->private_data; 1466 as = fp->private_data;
1467 if (check_apm_user(as, "poll")) 1467 if (check_apm_user(as, "poll"))
@@ -1472,10 +1472,10 @@ static unsigned int do_poll(struct file *fp, poll_table * wait)
1472 return 0; 1472 return 0;
1473} 1473}
1474 1474
1475static int do_ioctl(struct inode * inode, struct file *filp, 1475static int do_ioctl(struct inode *inode, struct file *filp,
1476 u_int cmd, u_long arg) 1476 u_int cmd, u_long arg)
1477{ 1477{
1478 struct apm_user * as; 1478 struct apm_user *as;
1479 1479
1480 as = filp->private_data; 1480 as = filp->private_data;
1481 if (check_apm_user(as, "ioctl")) 1481 if (check_apm_user(as, "ioctl"))
@@ -1515,9 +1515,9 @@ static int do_ioctl(struct inode * inode, struct file *filp,
1515 return 0; 1515 return 0;
1516} 1516}
1517 1517
1518static int do_release(struct inode * inode, struct file * filp) 1518static int do_release(struct inode *inode, struct file *filp)
1519{ 1519{
1520 struct apm_user * as; 1520 struct apm_user *as;
1521 1521
1522 as = filp->private_data; 1522 as = filp->private_data;
1523 if (check_apm_user(as, "release")) 1523 if (check_apm_user(as, "release"))
@@ -1533,11 +1533,11 @@ static int do_release(struct inode * inode, struct file * filp)
1533 if (suspends_pending <= 0) 1533 if (suspends_pending <= 0)
1534 (void) suspend(1); 1534 (void) suspend(1);
1535 } 1535 }
1536 spin_lock(&user_list_lock); 1536 spin_lock(&user_list_lock);
1537 if (user_list == as) 1537 if (user_list == as)
1538 user_list = as->next; 1538 user_list = as->next;
1539 else { 1539 else {
1540 struct apm_user * as1; 1540 struct apm_user *as1;
1541 1541
1542 for (as1 = user_list; 1542 for (as1 = user_list;
1543 (as1 != NULL) && (as1->next != as); 1543 (as1 != NULL) && (as1->next != as);
@@ -1553,9 +1553,9 @@ static int do_release(struct inode * inode, struct file * filp)
1553 return 0; 1553 return 0;
1554} 1554}
1555 1555
1556static int do_open(struct inode * inode, struct file * filp) 1556static int do_open(struct inode *inode, struct file *filp)
1557{ 1557{
1558 struct apm_user * as; 1558 struct apm_user *as;
1559 1559
1560 as = kmalloc(sizeof(*as), GFP_KERNEL); 1560 as = kmalloc(sizeof(*as), GFP_KERNEL);
1561 if (as == NULL) { 1561 if (as == NULL) {
@@ -1569,7 +1569,7 @@ static int do_open(struct inode * inode, struct file * filp)
1569 as->suspends_read = as->standbys_read = 0; 1569 as->suspends_read = as->standbys_read = 0;
1570 /* 1570 /*
1571 * XXX - this is a tiny bit broken, when we consider BSD 1571 * XXX - this is a tiny bit broken, when we consider BSD
1572 * process accounting. If the device is opened by root, we 1572 * process accounting. If the device is opened by root, we
1573 * instantly flag that we used superuser privs. Who knows, 1573 * instantly flag that we used superuser privs. Who knows,
1574 * we might close the device immediately without doing a 1574 * we might close the device immediately without doing a
1575 * privileged operation -- cevans 1575 * privileged operation -- cevans
@@ -1652,16 +1652,16 @@ static int proc_apm_show(struct seq_file *m, void *v)
1652 8) min = minutes; sec = seconds */ 1652 8) min = minutes; sec = seconds */
1653 1653
1654 seq_printf(m, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n", 1654 seq_printf(m, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n",
1655 driver_version, 1655 driver_version,
1656 (apm_info.bios.version >> 8) & 0xff, 1656 (apm_info.bios.version >> 8) & 0xff,
1657 apm_info.bios.version & 0xff, 1657 apm_info.bios.version & 0xff,
1658 apm_info.bios.flags, 1658 apm_info.bios.flags,
1659 ac_line_status, 1659 ac_line_status,
1660 battery_status, 1660 battery_status,
1661 battery_flag, 1661 battery_flag,
1662 percentage, 1662 percentage,
1663 time_units, 1663 time_units,
1664 units); 1664 units);
1665 return 0; 1665 return 0;
1666} 1666}
1667 1667
@@ -1684,8 +1684,8 @@ static int apm(void *unused)
1684 unsigned short cx; 1684 unsigned short cx;
1685 unsigned short dx; 1685 unsigned short dx;
1686 int error; 1686 int error;
1687 char * power_stat; 1687 char *power_stat;
1688 char * bat_stat; 1688 char *bat_stat;
1689 1689
1690#ifdef CONFIG_SMP 1690#ifdef CONFIG_SMP
1691 /* 2002/08/01 - WT 1691 /* 2002/08/01 - WT
@@ -1744,23 +1744,41 @@ static int apm(void *unused)
1744 } 1744 }
1745 } 1745 }
1746 1746
1747 if (debug && (num_online_cpus() == 1 || smp )) { 1747 if (debug && (num_online_cpus() == 1 || smp)) {
1748 error = apm_get_power_status(&bx, &cx, &dx); 1748 error = apm_get_power_status(&bx, &cx, &dx);
1749 if (error) 1749 if (error)
1750 printk(KERN_INFO "apm: power status not available\n"); 1750 printk(KERN_INFO "apm: power status not available\n");
1751 else { 1751 else {
1752 switch ((bx >> 8) & 0xff) { 1752 switch ((bx >> 8) & 0xff) {
1753 case 0: power_stat = "off line"; break; 1753 case 0:
1754 case 1: power_stat = "on line"; break; 1754 power_stat = "off line";
1755 case 2: power_stat = "on backup power"; break; 1755 break;
1756 default: power_stat = "unknown"; break; 1756 case 1:
1757 power_stat = "on line";
1758 break;
1759 case 2:
1760 power_stat = "on backup power";
1761 break;
1762 default:
1763 power_stat = "unknown";
1764 break;
1757 } 1765 }
1758 switch (bx & 0xff) { 1766 switch (bx & 0xff) {
1759 case 0: bat_stat = "high"; break; 1767 case 0:
1760 case 1: bat_stat = "low"; break; 1768 bat_stat = "high";
1761 case 2: bat_stat = "critical"; break; 1769 break;
1762 case 3: bat_stat = "charging"; break; 1770 case 1:
1763 default: bat_stat = "unknown"; break; 1771 bat_stat = "low";
1772 break;
1773 case 2:
1774 bat_stat = "critical";
1775 break;
1776 case 3:
1777 bat_stat = "charging";
1778 break;
1779 default:
1780 bat_stat = "unknown";
1781 break;
1764 } 1782 }
1765 printk(KERN_INFO 1783 printk(KERN_INFO
1766 "apm: AC %s, battery status %s, battery life ", 1784 "apm: AC %s, battery status %s, battery life ",
@@ -1777,8 +1795,8 @@ static int apm(void *unused)
1777 printk("unknown\n"); 1795 printk("unknown\n");
1778 else 1796 else
1779 printk("%d %s\n", dx & 0x7fff, 1797 printk("%d %s\n", dx & 0x7fff,
1780 (dx & 0x8000) ? 1798 (dx & 0x8000) ?
1781 "minutes" : "seconds"); 1799 "minutes" : "seconds");
1782 } 1800 }
1783 } 1801 }
1784 } 1802 }
@@ -1803,7 +1821,7 @@ static int apm(void *unused)
1803#ifndef MODULE 1821#ifndef MODULE
1804static int __init apm_setup(char *str) 1822static int __init apm_setup(char *str)
1805{ 1823{
1806 int invert; 1824 int invert;
1807 1825
1808 while ((str != NULL) && (*str != '\0')) { 1826 while ((str != NULL) && (*str != '\0')) {
1809 if (strncmp(str, "off", 3) == 0) 1827 if (strncmp(str, "off", 3) == 0)
@@ -1828,14 +1846,13 @@ static int __init apm_setup(char *str)
1828 if ((strncmp(str, "power-off", 9) == 0) || 1846 if ((strncmp(str, "power-off", 9) == 0) ||
1829 (strncmp(str, "power_off", 9) == 0)) 1847 (strncmp(str, "power_off", 9) == 0))
1830 power_off = !invert; 1848 power_off = !invert;
1831 if (strncmp(str, "smp", 3) == 0) 1849 if (strncmp(str, "smp", 3) == 0) {
1832 {
1833 smp = !invert; 1850 smp = !invert;
1834 idle_threshold = 100; 1851 idle_threshold = 100;
1835 } 1852 }
1836 if ((strncmp(str, "allow-ints", 10) == 0) || 1853 if ((strncmp(str, "allow-ints", 10) == 0) ||
1837 (strncmp(str, "allow_ints", 10) == 0)) 1854 (strncmp(str, "allow_ints", 10) == 0))
1838 apm_info.allow_ints = !invert; 1855 apm_info.allow_ints = !invert;
1839 if ((strncmp(str, "broken-psr", 10) == 0) || 1856 if ((strncmp(str, "broken-psr", 10) == 0) ||
1840 (strncmp(str, "broken_psr", 10) == 0)) 1857 (strncmp(str, "broken_psr", 10) == 0))
1841 apm_info.get_power_status_broken = !invert; 1858 apm_info.get_power_status_broken = !invert;
@@ -1881,7 +1898,8 @@ static int __init print_if_true(const struct dmi_system_id *d)
1881 */ 1898 */
1882static int __init broken_ps2_resume(const struct dmi_system_id *d) 1899static int __init broken_ps2_resume(const struct dmi_system_id *d)
1883{ 1900{
1884 printk(KERN_INFO "%s machine detected. Mousepad Resume Bug workaround hopefully not needed.\n", d->ident); 1901 printk(KERN_INFO "%s machine detected. Mousepad Resume Bug "
1902 "workaround hopefully not needed.\n", d->ident);
1885 return 0; 1903 return 0;
1886} 1904}
1887 1905
@@ -1890,7 +1908,8 @@ static int __init set_realmode_power_off(const struct dmi_system_id *d)
1890{ 1908{
1891 if (apm_info.realmode_power_off == 0) { 1909 if (apm_info.realmode_power_off == 0) {
1892 apm_info.realmode_power_off = 1; 1910 apm_info.realmode_power_off = 1;
1893 printk(KERN_INFO "%s bios detected. Using realmode poweroff only.\n", d->ident); 1911 printk(KERN_INFO "%s bios detected. "
1912 "Using realmode poweroff only.\n", d->ident);
1894 } 1913 }
1895 return 0; 1914 return 0;
1896} 1915}
@@ -1900,7 +1919,8 @@ static int __init set_apm_ints(const struct dmi_system_id *d)
1900{ 1919{
1901 if (apm_info.allow_ints == 0) { 1920 if (apm_info.allow_ints == 0) {
1902 apm_info.allow_ints = 1; 1921 apm_info.allow_ints = 1;
1903 printk(KERN_INFO "%s machine detected. Enabling interrupts during APM calls.\n", d->ident); 1922 printk(KERN_INFO "%s machine detected. "
1923 "Enabling interrupts during APM calls.\n", d->ident);
1904 } 1924 }
1905 return 0; 1925 return 0;
1906} 1926}
@@ -1910,7 +1930,8 @@ static int __init apm_is_horked(const struct dmi_system_id *d)
1910{ 1930{
1911 if (apm_info.disabled == 0) { 1931 if (apm_info.disabled == 0) {
1912 apm_info.disabled = 1; 1932 apm_info.disabled = 1;
1913 printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident); 1933 printk(KERN_INFO "%s machine detected. "
1934 "Disabling APM.\n", d->ident);
1914 } 1935 }
1915 return 0; 1936 return 0;
1916} 1937}
@@ -1919,7 +1940,8 @@ static int __init apm_is_horked_d850md(const struct dmi_system_id *d)
1919{ 1940{
1920 if (apm_info.disabled == 0) { 1941 if (apm_info.disabled == 0) {
1921 apm_info.disabled = 1; 1942 apm_info.disabled = 1;
1922 printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident); 1943 printk(KERN_INFO "%s machine detected. "
1944 "Disabling APM.\n", d->ident);
1923 printk(KERN_INFO "This bug is fixed in bios P15 which is available for \n"); 1945 printk(KERN_INFO "This bug is fixed in bios P15 which is available for \n");
1924 printk(KERN_INFO "download from support.intel.com \n"); 1946 printk(KERN_INFO "download from support.intel.com \n");
1925 } 1947 }
@@ -1931,7 +1953,8 @@ static int __init apm_likes_to_melt(const struct dmi_system_id *d)
1931{ 1953{
1932 if (apm_info.forbid_idle == 0) { 1954 if (apm_info.forbid_idle == 0) {
1933 apm_info.forbid_idle = 1; 1955 apm_info.forbid_idle = 1;
1934 printk(KERN_INFO "%s machine detected. Disabling APM idle calls.\n", d->ident); 1956 printk(KERN_INFO "%s machine detected. "
1957 "Disabling APM idle calls.\n", d->ident);
1935 } 1958 }
1936 return 0; 1959 return 0;
1937} 1960}
@@ -1954,7 +1977,8 @@ static int __init apm_likes_to_melt(const struct dmi_system_id *d)
1954static int __init broken_apm_power(const struct dmi_system_id *d) 1977static int __init broken_apm_power(const struct dmi_system_id *d)
1955{ 1978{
1956 apm_info.get_power_status_broken = 1; 1979 apm_info.get_power_status_broken = 1;
1957 printk(KERN_WARNING "BIOS strings suggest APM bugs, disabling power status reporting.\n"); 1980 printk(KERN_WARNING "BIOS strings suggest APM bugs, "
1981 "disabling power status reporting.\n");
1958 return 0; 1982 return 0;
1959} 1983}
1960 1984
@@ -1965,7 +1989,8 @@ static int __init broken_apm_power(const struct dmi_system_id *d)
1965static int __init swab_apm_power_in_minutes(const struct dmi_system_id *d) 1989static int __init swab_apm_power_in_minutes(const struct dmi_system_id *d)
1966{ 1990{
1967 apm_info.get_power_status_swabinminutes = 1; 1991 apm_info.get_power_status_swabinminutes = 1;
1968 printk(KERN_WARNING "BIOS strings suggest APM reports battery life in minutes and wrong byte order.\n"); 1992 printk(KERN_WARNING "BIOS strings suggest APM reports battery life "
1993 "in minutes and wrong byte order.\n");
1969 return 0; 1994 return 0;
1970} 1995}
1971 1996
@@ -1990,8 +2015,8 @@ static struct dmi_system_id __initdata apm_dmi_table[] = {
1990 apm_is_horked, "Dell Inspiron 2500", 2015 apm_is_horked, "Dell Inspiron 2500",
1991 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), 2016 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
1992 DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"), 2017 DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"),
1993 DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"), 2018 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
1994 DMI_MATCH(DMI_BIOS_VERSION,"A11"), }, 2019 DMI_MATCH(DMI_BIOS_VERSION, "A11"), },
1995 }, 2020 },
1996 { /* Allow interrupts during suspend on Dell Inspiron laptops*/ 2021 { /* Allow interrupts during suspend on Dell Inspiron laptops*/
1997 set_apm_ints, "Dell Inspiron", { 2022 set_apm_ints, "Dell Inspiron", {
@@ -2014,15 +2039,15 @@ static struct dmi_system_id __initdata apm_dmi_table[] = {
2014 apm_is_horked, "Dell Dimension 4100", 2039 apm_is_horked, "Dell Dimension 4100",
2015 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), 2040 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
2016 DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"), 2041 DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"),
2017 DMI_MATCH(DMI_BIOS_VENDOR,"Intel Corp."), 2042 DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
2018 DMI_MATCH(DMI_BIOS_VERSION,"A11"), }, 2043 DMI_MATCH(DMI_BIOS_VERSION, "A11"), },
2019 }, 2044 },
2020 { /* Allow interrupts during suspend on Compaq Laptops*/ 2045 { /* Allow interrupts during suspend on Compaq Laptops*/
2021 set_apm_ints, "Compaq 12XL125", 2046 set_apm_ints, "Compaq 12XL125",
2022 { DMI_MATCH(DMI_SYS_VENDOR, "Compaq"), 2047 { DMI_MATCH(DMI_SYS_VENDOR, "Compaq"),
2023 DMI_MATCH(DMI_PRODUCT_NAME, "Compaq PC"), 2048 DMI_MATCH(DMI_PRODUCT_NAME, "Compaq PC"),
2024 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), 2049 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2025 DMI_MATCH(DMI_BIOS_VERSION,"4.06"), }, 2050 DMI_MATCH(DMI_BIOS_VERSION, "4.06"), },
2026 }, 2051 },
2027 { /* Allow interrupts during APM or the clock goes slow */ 2052 { /* Allow interrupts during APM or the clock goes slow */
2028 set_apm_ints, "ASUSTeK", 2053 set_apm_ints, "ASUSTeK",
@@ -2064,15 +2089,15 @@ static struct dmi_system_id __initdata apm_dmi_table[] = {
2064 apm_is_horked, "Sharp PC-PJ/AX", 2089 apm_is_horked, "Sharp PC-PJ/AX",
2065 { DMI_MATCH(DMI_SYS_VENDOR, "SHARP"), 2090 { DMI_MATCH(DMI_SYS_VENDOR, "SHARP"),
2066 DMI_MATCH(DMI_PRODUCT_NAME, "PC-PJ/AX"), 2091 DMI_MATCH(DMI_PRODUCT_NAME, "PC-PJ/AX"),
2067 DMI_MATCH(DMI_BIOS_VENDOR,"SystemSoft"), 2092 DMI_MATCH(DMI_BIOS_VENDOR, "SystemSoft"),
2068 DMI_MATCH(DMI_BIOS_VERSION,"Version R2.08"), }, 2093 DMI_MATCH(DMI_BIOS_VERSION, "Version R2.08"), },
2069 }, 2094 },
2070 { /* APM crashes */ 2095 { /* APM crashes */
2071 apm_is_horked, "Dell Inspiron 2500", 2096 apm_is_horked, "Dell Inspiron 2500",
2072 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), 2097 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
2073 DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"), 2098 DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"),
2074 DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"), 2099 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2075 DMI_MATCH(DMI_BIOS_VERSION,"A11"), }, 2100 DMI_MATCH(DMI_BIOS_VERSION, "A11"), },
2076 }, 2101 },
2077 { /* APM idle hangs */ 2102 { /* APM idle hangs */
2078 apm_likes_to_melt, "Jabil AMD", 2103 apm_likes_to_melt, "Jabil AMD",
@@ -2203,11 +2228,11 @@ static int __init apm_init(void)
2203 return -ENODEV; 2228 return -ENODEV;
2204 } 2229 }
2205 printk(KERN_INFO 2230 printk(KERN_INFO
2206 "apm: BIOS version %d.%d Flags 0x%02x (Driver version %s)\n", 2231 "apm: BIOS version %d.%d Flags 0x%02x (Driver version %s)\n",
2207 ((apm_info.bios.version >> 8) & 0xff), 2232 ((apm_info.bios.version >> 8) & 0xff),
2208 (apm_info.bios.version & 0xff), 2233 (apm_info.bios.version & 0xff),
2209 apm_info.bios.flags, 2234 apm_info.bios.flags,
2210 driver_version); 2235 driver_version);
2211 if ((apm_info.bios.flags & APM_32_BIT_SUPPORT) == 0) { 2236 if ((apm_info.bios.flags & APM_32_BIT_SUPPORT) == 0) {
2212 printk(KERN_INFO "apm: no 32 bit BIOS support\n"); 2237 printk(KERN_INFO "apm: no 32 bit BIOS support\n");
2213 return -ENODEV; 2238 return -ENODEV;
@@ -2312,9 +2337,9 @@ static int __init apm_init(void)
2312 } 2337 }
2313 wake_up_process(kapmd_task); 2338 wake_up_process(kapmd_task);
2314 2339
2315 if (num_online_cpus() > 1 && !smp ) { 2340 if (num_online_cpus() > 1 && !smp) {
2316 printk(KERN_NOTICE 2341 printk(KERN_NOTICE
2317 "apm: disabled - APM is not SMP safe (power off active).\n"); 2342 "apm: disabled - APM is not SMP safe (power off active).\n");
2318 return 0; 2343 return 0;
2319 } 2344 }
2320 2345
@@ -2339,7 +2364,7 @@ static int __init apm_init(void)
2339 2364
2340static void __exit apm_exit(void) 2365static void __exit apm_exit(void)
2341{ 2366{
2342 int error; 2367 int error;
2343 2368
2344 if (set_pm_idle) { 2369 if (set_pm_idle) {
2345 pm_idle = original_pm_idle; 2370 pm_idle = original_pm_idle;
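
The apm_32.c hunks above swap the raw outb_p() PIT pokes for the outb_pit() helper and replace the open-coded jiffies comparison with time_after(), the kernel's wraparound-aware way to compare jiffies values. A minimal sketch of that pattern follows; the function names and the one-second window are illustrative assumptions, not taken from this patch.

	/* Sketch only: wraparound-aware deadline check (hypothetical names). */
	#include <linux/jiffies.h>

	static unsigned long example_deadline;

	static void example_arm(void)
	{
		example_deadline = jiffies + HZ;	/* roughly one second from now */
	}

	static int example_expired(void)
	{
		/* time_after() compares through signed arithmetic, so it stays
		 * correct when the jiffies counter wraps. */
		return time_after(jiffies, example_deadline);
	}
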
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 0e45981b2dd7..afd84463b712 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -38,15 +38,15 @@ void foo(void);
38 38
39void foo(void) 39void foo(void)
40{ 40{
41 OFFSET(SIGCONTEXT_eax, sigcontext, eax); 41 OFFSET(IA32_SIGCONTEXT_ax, sigcontext, ax);
42 OFFSET(SIGCONTEXT_ebx, sigcontext, ebx); 42 OFFSET(IA32_SIGCONTEXT_bx, sigcontext, bx);
43 OFFSET(SIGCONTEXT_ecx, sigcontext, ecx); 43 OFFSET(IA32_SIGCONTEXT_cx, sigcontext, cx);
44 OFFSET(SIGCONTEXT_edx, sigcontext, edx); 44 OFFSET(IA32_SIGCONTEXT_dx, sigcontext, dx);
45 OFFSET(SIGCONTEXT_esi, sigcontext, esi); 45 OFFSET(IA32_SIGCONTEXT_si, sigcontext, si);
46 OFFSET(SIGCONTEXT_edi, sigcontext, edi); 46 OFFSET(IA32_SIGCONTEXT_di, sigcontext, di);
47 OFFSET(SIGCONTEXT_ebp, sigcontext, ebp); 47 OFFSET(IA32_SIGCONTEXT_bp, sigcontext, bp);
48 OFFSET(SIGCONTEXT_esp, sigcontext, esp); 48 OFFSET(IA32_SIGCONTEXT_sp, sigcontext, sp);
49 OFFSET(SIGCONTEXT_eip, sigcontext, eip); 49 OFFSET(IA32_SIGCONTEXT_ip, sigcontext, ip);
50 BLANK(); 50 BLANK();
51 51
52 OFFSET(CPUINFO_x86, cpuinfo_x86, x86); 52 OFFSET(CPUINFO_x86, cpuinfo_x86, x86);
@@ -70,39 +70,38 @@ void foo(void)
70 OFFSET(TI_cpu, thread_info, cpu); 70 OFFSET(TI_cpu, thread_info, cpu);
71 BLANK(); 71 BLANK();
72 72
73 OFFSET(GDS_size, Xgt_desc_struct, size); 73 OFFSET(GDS_size, desc_ptr, size);
74 OFFSET(GDS_address, Xgt_desc_struct, address); 74 OFFSET(GDS_address, desc_ptr, address);
75 OFFSET(GDS_pad, Xgt_desc_struct, pad);
76 BLANK(); 75 BLANK();
77 76
78 OFFSET(PT_EBX, pt_regs, ebx); 77 OFFSET(PT_EBX, pt_regs, bx);
79 OFFSET(PT_ECX, pt_regs, ecx); 78 OFFSET(PT_ECX, pt_regs, cx);
80 OFFSET(PT_EDX, pt_regs, edx); 79 OFFSET(PT_EDX, pt_regs, dx);
81 OFFSET(PT_ESI, pt_regs, esi); 80 OFFSET(PT_ESI, pt_regs, si);
82 OFFSET(PT_EDI, pt_regs, edi); 81 OFFSET(PT_EDI, pt_regs, di);
83 OFFSET(PT_EBP, pt_regs, ebp); 82 OFFSET(PT_EBP, pt_regs, bp);
84 OFFSET(PT_EAX, pt_regs, eax); 83 OFFSET(PT_EAX, pt_regs, ax);
85 OFFSET(PT_DS, pt_regs, xds); 84 OFFSET(PT_DS, pt_regs, ds);
86 OFFSET(PT_ES, pt_regs, xes); 85 OFFSET(PT_ES, pt_regs, es);
87 OFFSET(PT_FS, pt_regs, xfs); 86 OFFSET(PT_FS, pt_regs, fs);
88 OFFSET(PT_ORIG_EAX, pt_regs, orig_eax); 87 OFFSET(PT_ORIG_EAX, pt_regs, orig_ax);
89 OFFSET(PT_EIP, pt_regs, eip); 88 OFFSET(PT_EIP, pt_regs, ip);
90 OFFSET(PT_CS, pt_regs, xcs); 89 OFFSET(PT_CS, pt_regs, cs);
91 OFFSET(PT_EFLAGS, pt_regs, eflags); 90 OFFSET(PT_EFLAGS, pt_regs, flags);
92 OFFSET(PT_OLDESP, pt_regs, esp); 91 OFFSET(PT_OLDESP, pt_regs, sp);
93 OFFSET(PT_OLDSS, pt_regs, xss); 92 OFFSET(PT_OLDSS, pt_regs, ss);
94 BLANK(); 93 BLANK();
95 94
96 OFFSET(EXEC_DOMAIN_handler, exec_domain, handler); 95 OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
97 OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); 96 OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
98 BLANK(); 97 BLANK();
99 98
100 OFFSET(pbe_address, pbe, address); 99 OFFSET(pbe_address, pbe, address);
101 OFFSET(pbe_orig_address, pbe, orig_address); 100 OFFSET(pbe_orig_address, pbe, orig_address);
102 OFFSET(pbe_next, pbe, next); 101 OFFSET(pbe_next, pbe, next);
103 102
104 /* Offset from the sysenter stack to tss.esp0 */ 103 /* Offset from the sysenter stack to tss.sp0 */
105 DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, x86_tss.esp0) - 104 DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
106 sizeof(struct tss_struct)); 105 sizeof(struct tss_struct));
107 106
108 DEFINE(PAGE_SIZE_asm, PAGE_SIZE); 107 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
@@ -111,8 +110,6 @@ void foo(void)
111 DEFINE(PTRS_PER_PMD, PTRS_PER_PMD); 110 DEFINE(PTRS_PER_PMD, PTRS_PER_PMD);
112 DEFINE(PTRS_PER_PGD, PTRS_PER_PGD); 111 DEFINE(PTRS_PER_PGD, PTRS_PER_PGD);
113 112
114 DEFINE(VDSO_PRELINK_asm, VDSO_PRELINK);
115
116 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); 113 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
117 114
118#ifdef CONFIG_PARAVIRT 115#ifdef CONFIG_PARAVIRT
@@ -123,7 +120,7 @@ void foo(void)
123 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); 120 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
124 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); 121 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
125 OFFSET(PV_CPU_iret, pv_cpu_ops, iret); 122 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
126 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); 123 OFFSET(PV_CPU_irq_enable_syscall_ret, pv_cpu_ops, irq_enable_syscall_ret);
127 OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); 124 OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
128#endif 125#endif
129 126
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index d1b6ed98774e..494e1e096ee6 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -38,7 +38,6 @@ int main(void)
38#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry)) 38#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry))
39 ENTRY(state); 39 ENTRY(state);
40 ENTRY(flags); 40 ENTRY(flags);
41 ENTRY(thread);
42 ENTRY(pid); 41 ENTRY(pid);
43 BLANK(); 42 BLANK();
44#undef ENTRY 43#undef ENTRY
@@ -47,6 +46,9 @@ int main(void)
47 ENTRY(addr_limit); 46 ENTRY(addr_limit);
48 ENTRY(preempt_count); 47 ENTRY(preempt_count);
49 ENTRY(status); 48 ENTRY(status);
49#ifdef CONFIG_IA32_EMULATION
50 ENTRY(sysenter_return);
51#endif
50 BLANK(); 52 BLANK();
51#undef ENTRY 53#undef ENTRY
52#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry)) 54#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
@@ -59,17 +61,31 @@ int main(void)
59 ENTRY(data_offset); 61 ENTRY(data_offset);
60 BLANK(); 62 BLANK();
61#undef ENTRY 63#undef ENTRY
64#ifdef CONFIG_PARAVIRT
65 BLANK();
66 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
67 OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
68 OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
69 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
70 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
71 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
72 OFFSET(PV_CPU_irq_enable_syscall_ret, pv_cpu_ops, irq_enable_syscall_ret);
73 OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
74 OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
75#endif
76
77
62#ifdef CONFIG_IA32_EMULATION 78#ifdef CONFIG_IA32_EMULATION
63#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry)) 79#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry))
64 ENTRY(eax); 80 ENTRY(ax);
65 ENTRY(ebx); 81 ENTRY(bx);
66 ENTRY(ecx); 82 ENTRY(cx);
67 ENTRY(edx); 83 ENTRY(dx);
68 ENTRY(esi); 84 ENTRY(si);
69 ENTRY(edi); 85 ENTRY(di);
70 ENTRY(ebp); 86 ENTRY(bp);
71 ENTRY(esp); 87 ENTRY(sp);
72 ENTRY(eip); 88 ENTRY(ip);
73 BLANK(); 89 BLANK();
74#undef ENTRY 90#undef ENTRY
75 DEFINE(IA32_RT_SIGFRAME_sigcontext, 91 DEFINE(IA32_RT_SIGFRAME_sigcontext,
@@ -81,14 +97,14 @@ int main(void)
81 DEFINE(pbe_next, offsetof(struct pbe, next)); 97 DEFINE(pbe_next, offsetof(struct pbe, next));
82 BLANK(); 98 BLANK();
83#define ENTRY(entry) DEFINE(pt_regs_ ## entry, offsetof(struct pt_regs, entry)) 99#define ENTRY(entry) DEFINE(pt_regs_ ## entry, offsetof(struct pt_regs, entry))
84 ENTRY(rbx); 100 ENTRY(bx);
85 ENTRY(rbx); 101 ENTRY(bx);
86 ENTRY(rcx); 102 ENTRY(cx);
87 ENTRY(rdx); 103 ENTRY(dx);
88 ENTRY(rsp); 104 ENTRY(sp);
89 ENTRY(rbp); 105 ENTRY(bp);
90 ENTRY(rsi); 106 ENTRY(si);
91 ENTRY(rdi); 107 ENTRY(di);
92 ENTRY(r8); 108 ENTRY(r8);
93 ENTRY(r9); 109 ENTRY(r9);
94 ENTRY(r10); 110 ENTRY(r10);
@@ -97,7 +113,7 @@ int main(void)
97 ENTRY(r13); 113 ENTRY(r13);
98 ENTRY(r14); 114 ENTRY(r14);
99 ENTRY(r15); 115 ENTRY(r15);
100 ENTRY(eflags); 116 ENTRY(flags);
101 BLANK(); 117 BLANK();
102#undef ENTRY 118#undef ENTRY
103#define ENTRY(entry) DEFINE(saved_context_ ## entry, offsetof(struct saved_context, entry)) 119#define ENTRY(entry) DEFINE(saved_context_ ## entry, offsetof(struct saved_context, entry))
@@ -108,7 +124,7 @@ int main(void)
108 ENTRY(cr8); 124 ENTRY(cr8);
109 BLANK(); 125 BLANK();
110#undef ENTRY 126#undef ENTRY
111 DEFINE(TSS_ist, offsetof(struct tss_struct, ist)); 127 DEFINE(TSS_ist, offsetof(struct tss_struct, x86_tss.ist));
112 BLANK(); 128 BLANK();
113 DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx)); 129 DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx));
114 BLANK(); 130 BLANK();
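
Both asm-offsets files above only rename the OFFSET()/DEFINE() entries to the unified register names; the constants they emit are consumed by the assembly entry paths. A simplified sketch of the mechanism behind those macros is below. The exact macro bodies are an assumption here (later trees keep them in include/linux/kbuild.h), so treat this as illustrative, not as part of the patch.

	/* Sketch: OFFSET()/DEFINE() turn C struct layout into asm constants.
	 * The "->" marker lines in the compiler's .s output are post-processed
	 * by the build into asm-offsets.h, one #define per entry. */
	#include <linux/stddef.h>

	#define DEFINE(sym, val) \
		asm volatile("\n->" #sym " %0 " #val : : "i" (val))
	#define OFFSET(sym, str, mem) \
		DEFINE(sym, offsetof(struct str, mem))
	#define BLANK() \
		asm volatile("\n->" : :)
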
diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c
index 0b9860530a6b..30f25a75fe28 100644
--- a/arch/x86/kernel/bootflag.c
+++ b/arch/x86/kernel/bootflag.c
@@ -1,8 +1,6 @@
1/* 1/*
2 * Implement 'Simple Boot Flag Specification 2.0' 2 * Implement 'Simple Boot Flag Specification 2.0'
3 */ 3 */
4
5
6#include <linux/types.h> 4#include <linux/types.h>
7#include <linux/kernel.h> 5#include <linux/kernel.h>
8#include <linux/init.h> 6#include <linux/init.h>
@@ -14,40 +12,38 @@
14 12
15#include <linux/mc146818rtc.h> 13#include <linux/mc146818rtc.h>
16 14
17
18#define SBF_RESERVED (0x78) 15#define SBF_RESERVED (0x78)
19#define SBF_PNPOS (1<<0) 16#define SBF_PNPOS (1<<0)
20#define SBF_BOOTING (1<<1) 17#define SBF_BOOTING (1<<1)
21#define SBF_DIAG (1<<2) 18#define SBF_DIAG (1<<2)
22#define SBF_PARITY (1<<7) 19#define SBF_PARITY (1<<7)
23 20
24
25int sbf_port __initdata = -1; /* set via acpi_boot_init() */ 21int sbf_port __initdata = -1; /* set via acpi_boot_init() */
26 22
27
28static int __init parity(u8 v) 23static int __init parity(u8 v)
29{ 24{
30 int x = 0; 25 int x = 0;
31 int i; 26 int i;
32 27
33 for(i=0;i<8;i++) 28 for (i = 0; i < 8; i++) {
34 { 29 x ^= (v & 1);
35 x^=(v&1); 30 v >>= 1;
36 v>>=1;
37 } 31 }
32
38 return x; 33 return x;
39} 34}
40 35
41static void __init sbf_write(u8 v) 36static void __init sbf_write(u8 v)
42{ 37{
43 unsigned long flags; 38 unsigned long flags;
44 if(sbf_port != -1) 39
45 { 40 if (sbf_port != -1) {
46 v &= ~SBF_PARITY; 41 v &= ~SBF_PARITY;
47 if(!parity(v)) 42 if (!parity(v))
48 v|=SBF_PARITY; 43 v |= SBF_PARITY;
49 44
50 printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n", sbf_port, v); 45 printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n",
46 sbf_port, v);
51 47
52 spin_lock_irqsave(&rtc_lock, flags); 48 spin_lock_irqsave(&rtc_lock, flags);
53 CMOS_WRITE(v, sbf_port); 49 CMOS_WRITE(v, sbf_port);
@@ -57,33 +53,41 @@ static void __init sbf_write(u8 v)
57 53
58static u8 __init sbf_read(void) 54static u8 __init sbf_read(void)
59{ 55{
60 u8 v;
61 unsigned long flags; 56 unsigned long flags;
62 if(sbf_port == -1) 57 u8 v;
58
59 if (sbf_port == -1)
63 return 0; 60 return 0;
61
64 spin_lock_irqsave(&rtc_lock, flags); 62 spin_lock_irqsave(&rtc_lock, flags);
65 v = CMOS_READ(sbf_port); 63 v = CMOS_READ(sbf_port);
66 spin_unlock_irqrestore(&rtc_lock, flags); 64 spin_unlock_irqrestore(&rtc_lock, flags);
65
67 return v; 66 return v;
68} 67}
69 68
70static int __init sbf_value_valid(u8 v) 69static int __init sbf_value_valid(u8 v)
71{ 70{
72 if(v&SBF_RESERVED) /* Reserved bits */ 71 if (v & SBF_RESERVED) /* Reserved bits */
73 return 0; 72 return 0;
74 if(!parity(v)) 73 if (!parity(v))
75 return 0; 74 return 0;
75
76 return 1; 76 return 1;
77} 77}
78 78
79static int __init sbf_init(void) 79static int __init sbf_init(void)
80{ 80{
81 u8 v; 81 u8 v;
82 if(sbf_port == -1) 82
83 if (sbf_port == -1)
83 return 0; 84 return 0;
85
84 v = sbf_read(); 86 v = sbf_read();
85 if(!sbf_value_valid(v)) 87 if (!sbf_value_valid(v)) {
86 printk(KERN_WARNING "Simple Boot Flag value 0x%x read from CMOS RAM was invalid\n",v); 88 printk(KERN_WARNING "Simple Boot Flag value 0x%x read from "
89 "CMOS RAM was invalid\n", v);
90 }
87 91
88 v &= ~SBF_RESERVED; 92 v &= ~SBF_RESERVED;
89 v &= ~SBF_BOOTING; 93 v &= ~SBF_BOOTING;
@@ -92,7 +96,7 @@ static int __init sbf_init(void)
92 v |= SBF_PNPOS; 96 v |= SBF_PNPOS;
93#endif 97#endif
94 sbf_write(v); 98 sbf_write(v);
99
95 return 0; 100 return 0;
96} 101}
97
98module_init(sbf_init); 102module_init(sbf_init);
diff --git a/arch/x86/kernel/bugs_64.c b/arch/x86/kernel/bugs_64.c
index 9a189cef6404..8f520f93ffd4 100644
--- a/arch/x86/kernel/bugs_64.c
+++ b/arch/x86/kernel/bugs_64.c
@@ -13,7 +13,6 @@
13void __init check_bugs(void) 13void __init check_bugs(void)
14{ 14{
15 identify_cpu(&boot_cpu_data); 15 identify_cpu(&boot_cpu_data);
16 mtrr_bp_init();
17#if !defined(CONFIG_SMP) 16#if !defined(CONFIG_SMP)
18 printk("CPU: "); 17 printk("CPU: ");
19 print_cpu_info(&boot_cpu_data); 18 print_cpu_info(&boot_cpu_data);
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 3e91d3ee26ec..238468ae1993 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -45,6 +45,6 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
45 &regs[CR_ECX], &regs[CR_EDX]); 45 &regs[CR_ECX], &regs[CR_EDX]);
46 46
47 if (regs[cb->reg] & (1 << cb->bit)) 47 if (regs[cb->reg] & (1 << cb->bit))
48 set_bit(cb->feature, c->x86_capability); 48 set_cpu_cap(c, cb->feature);
49 } 49 }
50} 50}
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 1ff88c7f45cf..06fa159232fd 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -63,6 +63,15 @@ static __cpuinit int amd_apic_timer_broken(void)
63 63
64int force_mwait __cpuinitdata; 64int force_mwait __cpuinitdata;
65 65
66void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
67{
68 if (cpuid_eax(0x80000000) >= 0x80000007) {
69 c->x86_power = cpuid_edx(0x80000007);
70 if (c->x86_power & (1<<8))
71 set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
72 }
73}
74
66static void __cpuinit init_amd(struct cpuinfo_x86 *c) 75static void __cpuinit init_amd(struct cpuinfo_x86 *c)
67{ 76{
68 u32 l, h; 77 u32 l, h;
@@ -85,6 +94,8 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
85 } 94 }
86#endif 95#endif
87 96
97 early_init_amd(c);
98
88 /* 99 /*
89 * FIXME: We should handle the K5 here. Set up the write 100 * FIXME: We should handle the K5 here. Set up the write
90 * range and also turn on MSR 83 bits 4 and 31 (write alloc, 101 * range and also turn on MSR 83 bits 4 and 31 (write alloc,
@@ -257,12 +268,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
257 c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; 268 c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
258 } 269 }
259 270
260 if (cpuid_eax(0x80000000) >= 0x80000007) {
261 c->x86_power = cpuid_edx(0x80000007);
262 if (c->x86_power & (1<<8))
263 set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
264 }
265
266#ifdef CONFIG_X86_HT 271#ifdef CONFIG_X86_HT
267 /* 272 /*
268 * On a AMD multi core setup the lower bits of the APIC id 273 * On a AMD multi core setup the lower bits of the APIC id
@@ -295,12 +300,12 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
295 local_apic_timer_disabled = 1; 300 local_apic_timer_disabled = 1;
296#endif 301#endif
297 302
298 if (c->x86 == 0x10 && !force_mwait)
299 clear_bit(X86_FEATURE_MWAIT, c->x86_capability);
300
301 /* K6s reports MCEs but don't actually have all the MSRs */ 303 /* K6s reports MCEs but don't actually have all the MSRs */
302 if (c->x86 < 6) 304 if (c->x86 < 6)
303 clear_bit(X86_FEATURE_MCE, c->x86_capability); 305 clear_bit(X86_FEATURE_MCE, c->x86_capability);
306
307 if (cpu_has_xmm)
308 set_bit(X86_FEATURE_MFENCE_RDTSC, c->x86_capability);
304} 309}
305 310
306static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 * c, unsigned int size) 311static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 * c, unsigned int size)
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 205fd5ba57f7..9b95edcfc6ae 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -11,6 +11,7 @@
11#include <linux/utsname.h> 11#include <linux/utsname.h>
12#include <asm/bugs.h> 12#include <asm/bugs.h>
13#include <asm/processor.h> 13#include <asm/processor.h>
14#include <asm/processor-flags.h>
14#include <asm/i387.h> 15#include <asm/i387.h>
15#include <asm/msr.h> 16#include <asm/msr.h>
16#include <asm/paravirt.h> 17#include <asm/paravirt.h>
@@ -35,7 +36,7 @@ __setup("mca-pentium", mca_pentium);
35static int __init no_387(char *s) 36static int __init no_387(char *s)
36{ 37{
37 boot_cpu_data.hard_math = 0; 38 boot_cpu_data.hard_math = 0;
38 write_cr0(0xE | read_cr0()); 39 write_cr0(X86_CR0_TS | X86_CR0_EM | X86_CR0_MP | read_cr0());
39 return 1; 40 return 1;
40} 41}
41 42
@@ -153,7 +154,7 @@ static void __init check_config(void)
153 * If we configured ourselves for a TSC, we'd better have one! 154 * If we configured ourselves for a TSC, we'd better have one!
154 */ 155 */
155#ifdef CONFIG_X86_TSC 156#ifdef CONFIG_X86_TSC
156 if (!cpu_has_tsc && !tsc_disable) 157 if (!cpu_has_tsc)
157 panic("Kernel compiled for Pentium+, requires TSC feature!"); 158 panic("Kernel compiled for Pentium+, requires TSC feature!");
158#endif 159#endif
159 160
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index e2fcf2051bdb..db28aa9e2f69 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -22,43 +22,48 @@
22#include "cpu.h" 22#include "cpu.h"
23 23
24DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { 24DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
25 [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 }, 25 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
26 [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 }, 26 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
27 [GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 }, 27 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
28 [GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 }, 28 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
29 /* 29 /*
30 * Segments used for calling PnP BIOS have byte granularity. 30 * Segments used for calling PnP BIOS have byte granularity.
31 * They code segments and data segments have fixed 64k limits, 31 * They code segments and data segments have fixed 64k limits,
32 * the transfer segment sizes are set at run time. 32 * the transfer segment sizes are set at run time.
33 */ 33 */
34 [GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */ 34 /* 32-bit code */
35 [GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */ 35 [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
36 [GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */ 36 /* 16-bit code */
37 [GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */ 37 [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
38 [GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */ 38 /* 16-bit data */
39 [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
40 /* 16-bit data */
41 [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
42 /* 16-bit data */
43 [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
39 /* 44 /*
40 * The APM segments have byte granularity and their bases 45 * The APM segments have byte granularity and their bases
41 * are set at run time. All have 64k limits. 46 * are set at run time. All have 64k limits.
42 */ 47 */
43 [GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */ 48 /* 32-bit code */
49 [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
44 /* 16-bit code */ 50 /* 16-bit code */
45 [GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 }, 51 [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
46 [GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */ 52 /* data */
53 [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
47 54
48 [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 }, 55 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
49 [GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 }, 56 [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
50} }; 57} };
51EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); 58EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
52 59
60__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
61
53static int cachesize_override __cpuinitdata = -1; 62static int cachesize_override __cpuinitdata = -1;
54static int disable_x86_fxsr __cpuinitdata;
55static int disable_x86_serial_nr __cpuinitdata = 1; 63static int disable_x86_serial_nr __cpuinitdata = 1;
56static int disable_x86_sep __cpuinitdata;
57 64
58struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {}; 65struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
59 66
60extern int disable_pse;
61
62static void __cpuinit default_init(struct cpuinfo_x86 * c) 67static void __cpuinit default_init(struct cpuinfo_x86 * c)
63{ 68{
64 /* Not much we can do here... */ 69 /* Not much we can do here... */
@@ -207,16 +212,8 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early)
207 212
208static int __init x86_fxsr_setup(char * s) 213static int __init x86_fxsr_setup(char * s)
209{ 214{
210 /* Tell all the other CPUs to not use it... */ 215 setup_clear_cpu_cap(X86_FEATURE_FXSR);
211 disable_x86_fxsr = 1; 216 setup_clear_cpu_cap(X86_FEATURE_XMM);
212
213 /*
214 * ... and clear the bits early in the boot_cpu_data
215 * so that the bootup process doesn't try to do this
216 * either.
217 */
218 clear_bit(X86_FEATURE_FXSR, boot_cpu_data.x86_capability);
219 clear_bit(X86_FEATURE_XMM, boot_cpu_data.x86_capability);
220 return 1; 217 return 1;
221} 218}
222__setup("nofxsr", x86_fxsr_setup); 219__setup("nofxsr", x86_fxsr_setup);
@@ -224,7 +221,7 @@ __setup("nofxsr", x86_fxsr_setup);
224 221
225static int __init x86_sep_setup(char * s) 222static int __init x86_sep_setup(char * s)
226{ 223{
227 disable_x86_sep = 1; 224 setup_clear_cpu_cap(X86_FEATURE_SEP);
228 return 1; 225 return 1;
229} 226}
230__setup("nosep", x86_sep_setup); 227__setup("nosep", x86_sep_setup);
@@ -281,6 +278,33 @@ void __init cpu_detect(struct cpuinfo_x86 *c)
281 c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8; 278 c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
282 } 279 }
283} 280}
281static void __cpuinit early_get_cap(struct cpuinfo_x86 *c)
282{
283 u32 tfms, xlvl;
284 int ebx;
285
286 memset(&c->x86_capability, 0, sizeof c->x86_capability);
287 if (have_cpuid_p()) {
288 /* Intel-defined flags: level 0x00000001 */
289 if (c->cpuid_level >= 0x00000001) {
290 u32 capability, excap;
291 cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
292 c->x86_capability[0] = capability;
293 c->x86_capability[4] = excap;
294 }
295
296 /* AMD-defined flags: level 0x80000001 */
297 xlvl = cpuid_eax(0x80000000);
298 if ((xlvl & 0xffff0000) == 0x80000000) {
299 if (xlvl >= 0x80000001) {
300 c->x86_capability[1] = cpuid_edx(0x80000001);
301 c->x86_capability[6] = cpuid_ecx(0x80000001);
302 }
303 }
304
305 }
306
307}
284 308
285/* Do minimum CPU detection early. 309/* Do minimum CPU detection early.
286 Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment. 310 Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
@@ -300,6 +324,17 @@ static void __init early_cpu_detect(void)
300 cpu_detect(c); 324 cpu_detect(c);
301 325
302 get_cpu_vendor(c, 1); 326 get_cpu_vendor(c, 1);
327
328 switch (c->x86_vendor) {
329 case X86_VENDOR_AMD:
330 early_init_amd(c);
331 break;
332 case X86_VENDOR_INTEL:
333 early_init_intel(c);
334 break;
335 }
336
337 early_get_cap(c);
303} 338}
304 339
305static void __cpuinit generic_identify(struct cpuinfo_x86 * c) 340static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
@@ -357,8 +392,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
357 init_scattered_cpuid_features(c); 392 init_scattered_cpuid_features(c);
358 } 393 }
359 394
360 early_intel_workaround(c);
361
362#ifdef CONFIG_X86_HT 395#ifdef CONFIG_X86_HT
363 c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff; 396 c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
364#endif 397#endif
@@ -392,7 +425,7 @@ __setup("serialnumber", x86_serial_nr_setup);
392/* 425/*
393 * This does the hard work of actually picking apart the CPU stuff... 426 * This does the hard work of actually picking apart the CPU stuff...
394 */ 427 */
395static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) 428void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
396{ 429{
397 int i; 430 int i;
398 431
@@ -418,20 +451,9 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
418 451
419 generic_identify(c); 452 generic_identify(c);
420 453
421 printk(KERN_DEBUG "CPU: After generic identify, caps:"); 454 if (this_cpu->c_identify)
422 for (i = 0; i < NCAPINTS; i++)
423 printk(" %08lx", c->x86_capability[i]);
424 printk("\n");
425
426 if (this_cpu->c_identify) {
427 this_cpu->c_identify(c); 455 this_cpu->c_identify(c);
428 456
429 printk(KERN_DEBUG "CPU: After vendor identify, caps:");
430 for (i = 0; i < NCAPINTS; i++)
431 printk(" %08lx", c->x86_capability[i]);
432 printk("\n");
433 }
434
435 /* 457 /*
436 * Vendor-specific initialization. In this section we 458 * Vendor-specific initialization. In this section we
437 * canonicalize the feature flags, meaning if there are 459 * canonicalize the feature flags, meaning if there are
@@ -453,23 +475,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
453 * we do "generic changes." 475 * we do "generic changes."
454 */ 476 */
455 477
456 /* TSC disabled? */
457 if ( tsc_disable )
458 clear_bit(X86_FEATURE_TSC, c->x86_capability);
459
460 /* FXSR disabled? */
461 if (disable_x86_fxsr) {
462 clear_bit(X86_FEATURE_FXSR, c->x86_capability);
463 clear_bit(X86_FEATURE_XMM, c->x86_capability);
464 }
465
466 /* SEP disabled? */
467 if (disable_x86_sep)
468 clear_bit(X86_FEATURE_SEP, c->x86_capability);
469
470 if (disable_pse)
471 clear_bit(X86_FEATURE_PSE, c->x86_capability);
472
473 /* If the model name is still unset, do table lookup. */ 478 /* If the model name is still unset, do table lookup. */
474 if ( !c->x86_model_id[0] ) { 479 if ( !c->x86_model_id[0] ) {
475 char *p; 480 char *p;
@@ -482,13 +487,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
482 c->x86, c->x86_model); 487 c->x86, c->x86_model);
483 } 488 }
484 489
485 /* Now the feature flags better reflect actual CPU features! */
486
487 printk(KERN_DEBUG "CPU: After all inits, caps:");
488 for (i = 0; i < NCAPINTS; i++)
489 printk(" %08lx", c->x86_capability[i]);
490 printk("\n");
491
492 /* 490 /*
493 * On SMP, boot_cpu_data holds the common feature set between 491 * On SMP, boot_cpu_data holds the common feature set between
494 * all CPUs; so make sure that we indicate which features are 492 * all CPUs; so make sure that we indicate which features are
@@ -501,8 +499,14 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
501 boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; 499 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
502 } 500 }
503 501
502 /* Clear all flags overriden by options */
503 for (i = 0; i < NCAPINTS; i++)
504 c->x86_capability[i] ^= cleared_cpu_caps[i];
505
504 /* Init Machine Check Exception if available. */ 506 /* Init Machine Check Exception if available. */
505 mcheck_init(c); 507 mcheck_init(c);
508
509 select_idle_routine(c);
506} 510}
507 511
508void __init identify_boot_cpu(void) 512void __init identify_boot_cpu(void)
@@ -510,7 +514,6 @@ void __init identify_boot_cpu(void)
510 identify_cpu(&boot_cpu_data); 514 identify_cpu(&boot_cpu_data);
511 sysenter_setup(); 515 sysenter_setup();
512 enable_sep_cpu(); 516 enable_sep_cpu();
513 mtrr_bp_init();
514} 517}
515 518
516void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 519void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
@@ -567,6 +570,13 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
567} 570}
568#endif 571#endif
569 572
573static __init int setup_noclflush(char *arg)
574{
575 setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
576 return 1;
577}
578__setup("noclflush", setup_noclflush);
579
570void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) 580void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
571{ 581{
572 char *vendor = NULL; 582 char *vendor = NULL;
@@ -590,6 +600,17 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
590 printk("\n"); 600 printk("\n");
591} 601}
592 602
603static __init int setup_disablecpuid(char *arg)
604{
605 int bit;
606 if (get_option(&arg, &bit) && bit < NCAPINTS*32)
607 setup_clear_cpu_cap(bit);
608 else
609 return 0;
610 return 1;
611}
612__setup("clearcpuid=", setup_disablecpuid);
613
593cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; 614cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
594 615
595/* This is hacky. :) 616/* This is hacky. :)
@@ -620,21 +641,13 @@ void __init early_cpu_init(void)
620 nexgen_init_cpu(); 641 nexgen_init_cpu();
621 umc_init_cpu(); 642 umc_init_cpu();
622 early_cpu_detect(); 643 early_cpu_detect();
623
624#ifdef CONFIG_DEBUG_PAGEALLOC
625 /* pse is not compatible with on-the-fly unmapping,
626 * disable it even if the cpus claim to support it.
627 */
628 clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
629 disable_pse = 1;
630#endif
631} 644}
632 645
633/* Make sure %fs is initialized properly in idle threads */ 646/* Make sure %fs is initialized properly in idle threads */
634struct pt_regs * __devinit idle_regs(struct pt_regs *regs) 647struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
635{ 648{
636 memset(regs, 0, sizeof(struct pt_regs)); 649 memset(regs, 0, sizeof(struct pt_regs));
637 regs->xfs = __KERNEL_PERCPU; 650 regs->fs = __KERNEL_PERCPU;
638 return regs; 651 return regs;
639} 652}
640 653
@@ -642,7 +655,7 @@ struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
642 * it's on the real one. */ 655 * it's on the real one. */
643void switch_to_new_gdt(void) 656void switch_to_new_gdt(void)
644{ 657{
645 struct Xgt_desc_struct gdt_descr; 658 struct desc_ptr gdt_descr;
646 659
647 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); 660 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
648 gdt_descr.size = GDT_SIZE - 1; 661 gdt_descr.size = GDT_SIZE - 1;
@@ -672,12 +685,6 @@ void __cpuinit cpu_init(void)
672 685
673 if (cpu_has_vme || cpu_has_tsc || cpu_has_de) 686 if (cpu_has_vme || cpu_has_tsc || cpu_has_de)
674 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); 687 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
675 if (tsc_disable && cpu_has_tsc) {
676 printk(KERN_NOTICE "Disabling TSC...\n");
677 /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
678 clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
679 set_in_cr4(X86_CR4_TSD);
680 }
681 688
682 load_idt(&idt_descr); 689 load_idt(&idt_descr);
683 switch_to_new_gdt(); 690 switch_to_new_gdt();
@@ -691,7 +698,7 @@ void __cpuinit cpu_init(void)
691 BUG(); 698 BUG();
692 enter_lazy_tlb(&init_mm, curr); 699 enter_lazy_tlb(&init_mm, curr);
693 700
694 load_esp0(t, thread); 701 load_sp0(t, thread);
695 set_tss_desc(cpu,t); 702 set_tss_desc(cpu,t);
696 load_TR_desc(); 703 load_TR_desc();
697 load_LDT(&init_mm.context); 704 load_LDT(&init_mm.context);
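
The common.c hunks route the old ad-hoc disable flags (disable_x86_fxsr, disable_x86_sep, tsc_disable, disable_pse) through setup_clear_cpu_cap() and the new cleared_cpu_caps[] mask, which identify_cpu() applies after the capability words are filled in. setup_clear_cpu_cap() itself is defined outside these hunks; the sketch below is an assumed shape of that helper, shown only to make the flow readable, and may not match the real header.

	/* Assumed shape of setup_clear_cpu_cap() -- not part of this patch.
	 * It clears the bit for the boot CPU and records it so identify_cpu()
	 * can mask it again for every CPU via cleared_cpu_caps[]. */
	#define setup_clear_cpu_cap(bit) do {					\
		clear_bit(bit, boot_cpu_data.x86_capability);			\
		set_bit(bit, (unsigned long *)cleared_cpu_caps);		\
	} while (0)
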
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 2f6432cef6ff..ad6527a5beb1 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -24,5 +24,6 @@ extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM];
24extern int get_model_name(struct cpuinfo_x86 *c); 24extern int get_model_name(struct cpuinfo_x86 *c);
25extern void display_cacheinfo(struct cpuinfo_x86 *c); 25extern void display_cacheinfo(struct cpuinfo_x86 *c);
26 26
27extern void early_intel_workaround(struct cpuinfo_x86 *c); 27extern void early_init_intel(struct cpuinfo_x86 *c);
28extern void early_init_amd(struct cpuinfo_x86 *c);
28 29
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index fea0af0476b9..a962dcb9c408 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -67,7 +67,8 @@ struct acpi_cpufreq_data {
67 unsigned int cpu_feature; 67 unsigned int cpu_feature;
68}; 68};
69 69
70static struct acpi_cpufreq_data *drv_data[NR_CPUS]; 70static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data);
71
71/* acpi_perf_data is a pointer to percpu data. */ 72/* acpi_perf_data is a pointer to percpu data. */
72static struct acpi_processor_performance *acpi_perf_data; 73static struct acpi_processor_performance *acpi_perf_data;
73 74
@@ -218,14 +219,14 @@ static u32 get_cur_val(cpumask_t mask)
218 if (unlikely(cpus_empty(mask))) 219 if (unlikely(cpus_empty(mask)))
219 return 0; 220 return 0;
220 221
221 switch (drv_data[first_cpu(mask)]->cpu_feature) { 222 switch (per_cpu(drv_data, first_cpu(mask))->cpu_feature) {
222 case SYSTEM_INTEL_MSR_CAPABLE: 223 case SYSTEM_INTEL_MSR_CAPABLE:
223 cmd.type = SYSTEM_INTEL_MSR_CAPABLE; 224 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
224 cmd.addr.msr.reg = MSR_IA32_PERF_STATUS; 225 cmd.addr.msr.reg = MSR_IA32_PERF_STATUS;
225 break; 226 break;
226 case SYSTEM_IO_CAPABLE: 227 case SYSTEM_IO_CAPABLE:
227 cmd.type = SYSTEM_IO_CAPABLE; 228 cmd.type = SYSTEM_IO_CAPABLE;
228 perf = drv_data[first_cpu(mask)]->acpi_data; 229 perf = per_cpu(drv_data, first_cpu(mask))->acpi_data;
229 cmd.addr.io.port = perf->control_register.address; 230 cmd.addr.io.port = perf->control_register.address;
230 cmd.addr.io.bit_width = perf->control_register.bit_width; 231 cmd.addr.io.bit_width = perf->control_register.bit_width;
231 break; 232 break;
@@ -325,7 +326,7 @@ static unsigned int get_measured_perf(unsigned int cpu)
325 326
326#endif 327#endif
327 328
328 retval = drv_data[cpu]->max_freq * perf_percent / 100; 329 retval = per_cpu(drv_data, cpu)->max_freq * perf_percent / 100;
329 330
330 put_cpu(); 331 put_cpu();
331 set_cpus_allowed(current, saved_mask); 332 set_cpus_allowed(current, saved_mask);
@@ -336,7 +337,7 @@ static unsigned int get_measured_perf(unsigned int cpu)
336 337
337static unsigned int get_cur_freq_on_cpu(unsigned int cpu) 338static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
338{ 339{
339 struct acpi_cpufreq_data *data = drv_data[cpu]; 340 struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu);
340 unsigned int freq; 341 unsigned int freq;
341 342
342 dprintk("get_cur_freq_on_cpu (%d)\n", cpu); 343 dprintk("get_cur_freq_on_cpu (%d)\n", cpu);
@@ -370,7 +371,7 @@ static unsigned int check_freqs(cpumask_t mask, unsigned int freq,
370static int acpi_cpufreq_target(struct cpufreq_policy *policy, 371static int acpi_cpufreq_target(struct cpufreq_policy *policy,
371 unsigned int target_freq, unsigned int relation) 372 unsigned int target_freq, unsigned int relation)
372{ 373{
373 struct acpi_cpufreq_data *data = drv_data[policy->cpu]; 374 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu);
374 struct acpi_processor_performance *perf; 375 struct acpi_processor_performance *perf;
375 struct cpufreq_freqs freqs; 376 struct cpufreq_freqs freqs;
376 cpumask_t online_policy_cpus; 377 cpumask_t online_policy_cpus;
@@ -466,7 +467,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
466 467
467static int acpi_cpufreq_verify(struct cpufreq_policy *policy) 468static int acpi_cpufreq_verify(struct cpufreq_policy *policy)
468{ 469{
469 struct acpi_cpufreq_data *data = drv_data[policy->cpu]; 470 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu);
470 471
471 dprintk("acpi_cpufreq_verify\n"); 472 dprintk("acpi_cpufreq_verify\n");
472 473
@@ -570,7 +571,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
570 return -ENOMEM; 571 return -ENOMEM;
571 572
572 data->acpi_data = percpu_ptr(acpi_perf_data, cpu); 573 data->acpi_data = percpu_ptr(acpi_perf_data, cpu);
573 drv_data[cpu] = data; 574 per_cpu(drv_data, cpu) = data;
574 575
575 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) 576 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
576 acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS; 577 acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS;
@@ -714,20 +715,20 @@ err_unreg:
714 acpi_processor_unregister_performance(perf, cpu); 715 acpi_processor_unregister_performance(perf, cpu);
715err_free: 716err_free:
716 kfree(data); 717 kfree(data);
717 drv_data[cpu] = NULL; 718 per_cpu(drv_data, cpu) = NULL;
718 719
719 return result; 720 return result;
720} 721}
721 722
722static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) 723static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
723{ 724{
724 struct acpi_cpufreq_data *data = drv_data[policy->cpu]; 725 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu);
725 726
726 dprintk("acpi_cpufreq_cpu_exit\n"); 727 dprintk("acpi_cpufreq_cpu_exit\n");
727 728
728 if (data) { 729 if (data) {
729 cpufreq_frequency_table_put_attr(policy->cpu); 730 cpufreq_frequency_table_put_attr(policy->cpu);
730 drv_data[policy->cpu] = NULL; 731 per_cpu(drv_data, policy->cpu) = NULL;
731 acpi_processor_unregister_performance(data->acpi_data, 732 acpi_processor_unregister_performance(data->acpi_data,
732 policy->cpu); 733 policy->cpu);
733 kfree(data); 734 kfree(data);
@@ -738,7 +739,7 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
738 739
739static int acpi_cpufreq_resume(struct cpufreq_policy *policy) 740static int acpi_cpufreq_resume(struct cpufreq_policy *policy)
740{ 741{
741 struct acpi_cpufreq_data *data = drv_data[policy->cpu]; 742 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu);
742 743
743 dprintk("acpi_cpufreq_resume\n"); 744 dprintk("acpi_cpufreq_resume\n");
744 745
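
acpi-cpufreq.c here (and powernow-k8.c further down) drop the NR_CPUS-sized pointer arrays in favour of DEFINE_PER_CPU pointers accessed with per_cpu(). A small sketch of that pattern, with purely illustrative type and variable names rather than the drivers' own:

	/* Sketch: per-CPU pointer replacing a static array indexed by cpu.
	 * struct/variable names are hypothetical, not from the drivers. */
	#include <linux/percpu.h>

	struct example_state {
		unsigned int max_freq;
	};

	static DEFINE_PER_CPU(struct example_state *, example_state_ptr);

	static void example_attach(unsigned int cpu, struct example_state *s)
	{
		per_cpu(example_state_ptr, cpu) = s;	/* was: example_array[cpu] = s */
	}

	static struct example_state *example_lookup(unsigned int cpu)
	{
		return per_cpu(example_state_ptr, cpu);
	}
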
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index 749d00cb2ebd..06fcce516d51 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -694,7 +694,7 @@ static acpi_status longhaul_walk_callback(acpi_handle obj_handle,
694 if ( acpi_bus_get_device(obj_handle, &d) ) { 694 if ( acpi_bus_get_device(obj_handle, &d) ) {
695 return 0; 695 return 0;
696 } 696 }
697 *return_value = (void *)acpi_driver_data(d); 697 *return_value = acpi_driver_data(d);
698 return 1; 698 return 1;
699} 699}
700 700
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 99e1ef9939be..a0522735dd9d 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -52,7 +52,7 @@
52/* serialize freq changes */ 52/* serialize freq changes */
53static DEFINE_MUTEX(fidvid_mutex); 53static DEFINE_MUTEX(fidvid_mutex);
54 54
55static struct powernow_k8_data *powernow_data[NR_CPUS]; 55static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
56 56
57static int cpu_family = CPU_OPTERON; 57static int cpu_family = CPU_OPTERON;
58 58
@@ -1018,7 +1018,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
1018static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation) 1018static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation)
1019{ 1019{
1020 cpumask_t oldmask = CPU_MASK_ALL; 1020 cpumask_t oldmask = CPU_MASK_ALL;
1021 struct powernow_k8_data *data = powernow_data[pol->cpu]; 1021 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1022 u32 checkfid; 1022 u32 checkfid;
1023 u32 checkvid; 1023 u32 checkvid;
1024 unsigned int newstate; 1024 unsigned int newstate;
@@ -1094,7 +1094,7 @@ err_out:
1094/* Driver entry point to verify the policy and range of frequencies */ 1094/* Driver entry point to verify the policy and range of frequencies */
1095static int powernowk8_verify(struct cpufreq_policy *pol) 1095static int powernowk8_verify(struct cpufreq_policy *pol)
1096{ 1096{
1097 struct powernow_k8_data *data = powernow_data[pol->cpu]; 1097 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1098 1098
1099 if (!data) 1099 if (!data)
1100 return -EINVAL; 1100 return -EINVAL;
@@ -1202,7 +1202,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1202 dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n", 1202 dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n",
1203 data->currfid, data->currvid); 1203 data->currfid, data->currvid);
1204 1204
1205 powernow_data[pol->cpu] = data; 1205 per_cpu(powernow_data, pol->cpu) = data;
1206 1206
1207 return 0; 1207 return 0;
1208 1208
@@ -1216,7 +1216,7 @@ err_out:
1216 1216
1217static int __devexit powernowk8_cpu_exit (struct cpufreq_policy *pol) 1217static int __devexit powernowk8_cpu_exit (struct cpufreq_policy *pol)
1218{ 1218{
1219 struct powernow_k8_data *data = powernow_data[pol->cpu]; 1219 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1220 1220
1221 if (!data) 1221 if (!data)
1222 return -EINVAL; 1222 return -EINVAL;
@@ -1237,7 +1237,7 @@ static unsigned int powernowk8_get (unsigned int cpu)
1237 cpumask_t oldmask = current->cpus_allowed; 1237 cpumask_t oldmask = current->cpus_allowed;
1238 unsigned int khz = 0; 1238 unsigned int khz = 0;
1239 1239
1240 data = powernow_data[first_cpu(per_cpu(cpu_core_map, cpu))]; 1240 data = per_cpu(powernow_data, first_cpu(per_cpu(cpu_core_map, cpu)));
1241 1241
1242 if (!data) 1242 if (!data)
1243 return -EINVAL; 1243 return -EINVAL;
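The acpi-cpufreq and powernow-k8 hunks above replace NR_CPUS-sized pointer arrays with DEFINE_PER_CPU() variables, which are only instantiated for possible CPUs and are accessed through per_cpu(). A minimal sketch of the pattern, with illustrative names:

#include <linux/percpu.h>

struct drv_data { unsigned int max_freq; };

/* Before: static struct drv_data *drv_data[NR_CPUS]; */
static DEFINE_PER_CPU(struct drv_data *, drv_data_ptr);

static void register_cpu_data(unsigned int cpu, struct drv_data *d)
{
        per_cpu(drv_data_ptr, cpu) = d;         /* was: drv_data[cpu] = d; */
}

static struct drv_data *lookup_cpu_data(unsigned int cpu)
{
        return per_cpu(drv_data_ptr, cpu);      /* was: drv_data[cpu] */
}
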
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 88d66fb8411d..404a6a2d4016 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -5,6 +5,7 @@
5#include <asm/dma.h> 5#include <asm/dma.h>
6#include <asm/io.h> 6#include <asm/io.h>
7#include <asm/processor-cyrix.h> 7#include <asm/processor-cyrix.h>
8#include <asm/processor-flags.h>
8#include <asm/timer.h> 9#include <asm/timer.h>
9#include <asm/pci-direct.h> 10#include <asm/pci-direct.h>
10#include <asm/tsc.h> 11#include <asm/tsc.h>
@@ -126,15 +127,12 @@ static void __cpuinit set_cx86_reorder(void)
126 127
127static void __cpuinit set_cx86_memwb(void) 128static void __cpuinit set_cx86_memwb(void)
128{ 129{
129 u32 cr0;
130
131 printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); 130 printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
132 131
133 /* CCR2 bit 2: unlock NW bit */ 132 /* CCR2 bit 2: unlock NW bit */
134 setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04); 133 setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04);
135 /* set 'Not Write-through' */ 134 /* set 'Not Write-through' */
136 cr0 = 0x20000000; 135 write_cr0(read_cr0() | X86_CR0_NW);
137 write_cr0(read_cr0() | cr0);
138 /* CCR2 bit 2: lock NW bit and set WT1 */ 136 /* CCR2 bit 2: lock NW bit and set WT1 */
139 setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14 ); 137 setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14 );
140} 138}
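The cyrix.c change above (and the similar MTRR hunks further down) swaps raw CR0 bit masks for the named constants from <asm/processor-flags.h>: CR0.NW is bit 29 (0x20000000) and CR0.CD is bit 30 (0x40000000). A hedged sketch of the substitution; read_cr0()/write_cr0() come from the arch headers, exactly as in the hunk above:

#include <asm/processor-flags.h>        /* X86_CR0_NW, X86_CR0_CD, ... */

static void set_no_writethrough(void)
{
        /* Before: write_cr0(read_cr0() | 0x20000000); */
        write_cr0(read_cr0() | X86_CR0_NW);     /* same bit, named constant */
}
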
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index cc8c501b9f39..d1c372b018db 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -11,6 +11,8 @@
11#include <asm/pgtable.h> 11#include <asm/pgtable.h>
12#include <asm/msr.h> 12#include <asm/msr.h>
13#include <asm/uaccess.h> 13#include <asm/uaccess.h>
14#include <asm/ptrace.h>
15#include <asm/ds.h>
14 16
15#include "cpu.h" 17#include "cpu.h"
16 18
@@ -27,13 +29,14 @@
27struct movsl_mask movsl_mask __read_mostly; 29struct movsl_mask movsl_mask __read_mostly;
28#endif 30#endif
29 31
30void __cpuinit early_intel_workaround(struct cpuinfo_x86 *c) 32void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
31{ 33{
32 if (c->x86_vendor != X86_VENDOR_INTEL)
33 return;
34 /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */ 34 /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
35 if (c->x86 == 15 && c->x86_cache_alignment == 64) 35 if (c->x86 == 15 && c->x86_cache_alignment == 64)
36 c->x86_cache_alignment = 128; 36 c->x86_cache_alignment = 128;
37 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
38 (c->x86 == 0x6 && c->x86_model >= 0x0e))
39 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
37} 40}
38 41
39/* 42/*
@@ -113,6 +116,8 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
113 unsigned int l2 = 0; 116 unsigned int l2 = 0;
114 char *p = NULL; 117 char *p = NULL;
115 118
119 early_init_intel(c);
120
116#ifdef CONFIG_X86_F00F_BUG 121#ifdef CONFIG_X86_F00F_BUG
117 /* 122 /*
118 * All current models of Pentium and Pentium with MMX technology CPUs 123 * All current models of Pentium and Pentium with MMX technology CPUs
@@ -132,7 +137,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
132 } 137 }
133#endif 138#endif
134 139
135 select_idle_routine(c);
136 l2 = init_intel_cacheinfo(c); 140 l2 = init_intel_cacheinfo(c);
137 if (c->cpuid_level > 9 ) { 141 if (c->cpuid_level > 9 ) {
138 unsigned eax = cpuid_eax(10); 142 unsigned eax = cpuid_eax(10);
@@ -201,16 +205,13 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
201 } 205 }
202#endif 206#endif
203 207
208 if (cpu_has_xmm2)
209 set_bit(X86_FEATURE_LFENCE_RDTSC, c->x86_capability);
204 if (c->x86 == 15) { 210 if (c->x86 == 15) {
205 set_bit(X86_FEATURE_P4, c->x86_capability); 211 set_bit(X86_FEATURE_P4, c->x86_capability);
206 set_bit(X86_FEATURE_SYNC_RDTSC, c->x86_capability);
207 } 212 }
208 if (c->x86 == 6) 213 if (c->x86 == 6)
209 set_bit(X86_FEATURE_P3, c->x86_capability); 214 set_bit(X86_FEATURE_P3, c->x86_capability);
210 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
211 (c->x86 == 0x6 && c->x86_model >= 0x0e))
212 set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
213
214 if (cpu_has_ds) { 215 if (cpu_has_ds) {
215 unsigned int l1; 216 unsigned int l1;
216 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); 217 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
@@ -219,6 +220,9 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
219 if (!(l1 & (1<<12))) 220 if (!(l1 & (1<<12)))
220 set_bit(X86_FEATURE_PEBS, c->x86_capability); 221 set_bit(X86_FEATURE_PEBS, c->x86_capability);
221 } 222 }
223
224 if (cpu_has_bts)
225 ds_init_intel(c);
222} 226}
223 227
224static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size) 228static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size)
@@ -342,5 +346,22 @@ unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
342EXPORT_SYMBOL(cmpxchg_386_u32); 346EXPORT_SYMBOL(cmpxchg_386_u32);
343#endif 347#endif
344 348
349#ifndef CONFIG_X86_CMPXCHG64
350unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
351{
352 u64 prev;
353 unsigned long flags;
354
355 /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
356 local_irq_save(flags);
357 prev = *(u64 *)ptr;
358 if (prev == old)
359 *(u64 *)ptr = new;
360 local_irq_restore(flags);
361 return prev;
362}
363EXPORT_SYMBOL(cmpxchg_486_u64);
364#endif
365
345// arch_initcall(intel_cpu_init); 366// arch_initcall(intel_cpu_init);
346 367
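The new cmpxchg_486_u64() above emulates a 64-bit compare-and-exchange on CPUs without cmpxchg8b by briefly disabling local interrupts (uniprocessor only, as its comment notes). Callers normally reach it through the generic 64-bit cmpxchg wrapper; the direct call below is purely illustrative of the "returns the previous value" convention:

static u64 shared_counter;

static void add_to_counter(u64 delta)
{
        u64 old, prev;

        do {
                old  = shared_counter;
                prev = cmpxchg_486_u64(&shared_counter, old, old + delta);
        } while (prev != old);          /* lost the race, retry with the new value */
}
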
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
index eef63e3630c2..e633c9c2b764 100644
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ b/arch/x86/kernel/cpu/mcheck/k7.c
@@ -16,7 +16,7 @@
16#include "mce.h" 16#include "mce.h"
17 17
18/* Machine Check Handler For AMD Athlon/Duron */ 18/* Machine Check Handler For AMD Athlon/Duron */
19static fastcall void k7_machine_check(struct pt_regs * regs, long error_code) 19static void k7_machine_check(struct pt_regs * regs, long error_code)
20{ 20{
21 int recover=1; 21 int recover=1;
22 u32 alow, ahigh, high, low; 22 u32 alow, ahigh, high, low;
@@ -27,29 +27,32 @@ static fastcall void k7_machine_check(struct pt_regs * regs, long error_code)
27 if (mcgstl & (1<<0)) /* Recoverable ? */ 27 if (mcgstl & (1<<0)) /* Recoverable ? */
28 recover=0; 28 recover=0;
29 29
30 printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", 30 printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
31 smp_processor_id(), mcgsth, mcgstl); 31 smp_processor_id(), mcgsth, mcgstl);
32 32
33 for (i=1; i<nr_mce_banks; i++) { 33 for (i = 1; i < nr_mce_banks; i++) {
34 rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high); 34 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
35 if (high&(1<<31)) { 35 if (high&(1<<31)) {
36 char misc[20];
37 char addr[24];
38 misc[0] = addr[0] = '\0';
36 if (high & (1<<29)) 39 if (high & (1<<29))
37 recover |= 1; 40 recover |= 1;
38 if (high & (1<<25)) 41 if (high & (1<<25))
39 recover |= 2; 42 recover |= 2;
40 printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
41 high &= ~(1<<31); 43 high &= ~(1<<31);
42 if (high & (1<<27)) { 44 if (high & (1<<27)) {
43 rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh); 45 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
44 printk ("[%08x%08x]", ahigh, alow); 46 snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
45 } 47 }
46 if (high & (1<<26)) { 48 if (high & (1<<26)) {
47 rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh); 49 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
48 printk (" at %08x%08x", ahigh, alow); 50 snprintf(addr, 24, " at %08x%08x", ahigh, alow);
49 } 51 }
50 printk ("\n"); 52 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
53 smp_processor_id(), i, high, low, misc, addr);
51 /* Clear it */ 54 /* Clear it */
52 wrmsr (MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); 55 wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
53 /* Serialize */ 56 /* Serialize */
54 wmb(); 57 wmb();
55 add_taint(TAINT_MACHINE_CHECK); 58 add_taint(TAINT_MACHINE_CHECK);
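The k7.c rework above (mirrored later in the p4.c and p6.c hunks) collects the optional MISC and ADDR parts into small stack buffers so each bank is reported with a single KERN_EMERG printk instead of several partial lines. A sketch of the pattern; every parameter name below is an illustrative placeholder:

#include <linux/kernel.h>

static void report_bank(int cpu, int bank, u32 status_hi, u32 status_lo,
                        int have_misc, u32 misc_hi, u32 misc_lo,
                        int have_addr, u32 addr_hi, u32 addr_lo)
{
        char misc[20], addr[24];

        misc[0] = addr[0] = '\0';               /* stay empty unless present */
        if (have_misc)
                snprintf(misc, sizeof(misc), "[%08x%08x]", misc_hi, misc_lo);
        if (have_addr)
                snprintf(addr, sizeof(addr), " at %08x%08x", addr_hi, addr_lo);
        printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
               cpu, bank, status_hi, status_lo, misc, addr);
}
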
diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h
index 81fb6e2d35f3..ae9f628838f1 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.h
+++ b/arch/x86/kernel/cpu/mcheck/mce.h
@@ -8,7 +8,7 @@ void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
8void winchip_mcheck_init(struct cpuinfo_x86 *c); 8void winchip_mcheck_init(struct cpuinfo_x86 *c);
9 9
10/* Call the installed machine check handler for this CPU setup. */ 10/* Call the installed machine check handler for this CPU setup. */
11extern fastcall void (*machine_check_vector)(struct pt_regs *, long error_code); 11extern void (*machine_check_vector)(struct pt_regs *, long error_code);
12 12
13extern int nr_mce_banks; 13extern int nr_mce_banks;
14 14
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index 34c781eddee4..a5182dcd94ae 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -22,13 +22,13 @@ int nr_mce_banks;
22EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ 22EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
23 23
24/* Handle unconfigured int18 (should never happen) */ 24/* Handle unconfigured int18 (should never happen) */
25static fastcall void unexpected_machine_check(struct pt_regs * regs, long error_code) 25static void unexpected_machine_check(struct pt_regs * regs, long error_code)
26{ 26{
27 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id()); 27 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id());
28} 28}
29 29
30/* Call the installed machine check handler for this CPU setup. */ 30/* Call the installed machine check handler for this CPU setup. */
31void fastcall (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; 31void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check;
32 32
33/* This has to be run for each processor */ 33/* This has to be run for each processor */
34void mcheck_init(struct cpuinfo_x86 *c) 34void mcheck_init(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 242e8668dbeb..9a699ed03598 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -63,7 +63,7 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
63 * separate MCEs from kernel messages to avoid bogus bug reports. 63 * separate MCEs from kernel messages to avoid bogus bug reports.
64 */ 64 */
65 65
66struct mce_log mcelog = { 66static struct mce_log mcelog = {
67 MCE_LOG_SIGNATURE, 67 MCE_LOG_SIGNATURE,
68 MCE_LOG_LEN, 68 MCE_LOG_LEN,
69}; 69};
@@ -80,7 +80,7 @@ void mce_log(struct mce *mce)
80 /* When the buffer fills up discard new entries. Assume 80 /* When the buffer fills up discard new entries. Assume
81 that the earlier errors are the more interesting. */ 81 that the earlier errors are the more interesting. */
82 if (entry >= MCE_LOG_LEN) { 82 if (entry >= MCE_LOG_LEN) {
83 set_bit(MCE_OVERFLOW, &mcelog.flags); 83 set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
84 return; 84 return;
85 } 85 }
86 /* Old left over entry. Skip. */ 86 /* Old left over entry. Skip. */
@@ -110,12 +110,12 @@ static void print_mce(struct mce *m)
110 KERN_EMERG 110 KERN_EMERG
111 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", 111 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
112 m->cpu, m->mcgstatus, m->bank, m->status); 112 m->cpu, m->mcgstatus, m->bank, m->status);
113 if (m->rip) { 113 if (m->ip) {
114 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", 114 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
115 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", 115 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
116 m->cs, m->rip); 116 m->cs, m->ip);
117 if (m->cs == __KERNEL_CS) 117 if (m->cs == __KERNEL_CS)
118 print_symbol("{%s}", m->rip); 118 print_symbol("{%s}", m->ip);
119 printk("\n"); 119 printk("\n");
120 } 120 }
121 printk(KERN_EMERG "TSC %Lx ", m->tsc); 121 printk(KERN_EMERG "TSC %Lx ", m->tsc);
@@ -156,16 +156,16 @@ static int mce_available(struct cpuinfo_x86 *c)
156static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) 156static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
157{ 157{
158 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) { 158 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
159 m->rip = regs->rip; 159 m->ip = regs->ip;
160 m->cs = regs->cs; 160 m->cs = regs->cs;
161 } else { 161 } else {
162 m->rip = 0; 162 m->ip = 0;
163 m->cs = 0; 163 m->cs = 0;
164 } 164 }
165 if (rip_msr) { 165 if (rip_msr) {
166 /* Assume the RIP in the MSR is exact. Is this true? */ 166 /* Assume the RIP in the MSR is exact. Is this true? */
167 m->mcgstatus |= MCG_STATUS_EIPV; 167 m->mcgstatus |= MCG_STATUS_EIPV;
168 rdmsrl(rip_msr, m->rip); 168 rdmsrl(rip_msr, m->ip);
169 m->cs = 0; 169 m->cs = 0;
170 } 170 }
171} 171}
@@ -192,10 +192,10 @@ void do_machine_check(struct pt_regs * regs, long error_code)
192 192
193 atomic_inc(&mce_entry); 193 atomic_inc(&mce_entry);
194 194
195 if (regs) 195 if ((regs
196 notify_die(DIE_NMI, "machine check", regs, error_code, 18, 196 && notify_die(DIE_NMI, "machine check", regs, error_code,
197 SIGKILL); 197 18, SIGKILL) == NOTIFY_STOP)
198 if (!banks) 198 || !banks)
199 goto out2; 199 goto out2;
200 200
201 memset(&m, 0, sizeof(struct mce)); 201 memset(&m, 0, sizeof(struct mce));
@@ -288,7 +288,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
288 * instruction which caused the MCE. 288 * instruction which caused the MCE.
289 */ 289 */
290 if (m.mcgstatus & MCG_STATUS_EIPV) 290 if (m.mcgstatus & MCG_STATUS_EIPV)
291 user_space = panicm.rip && (panicm.cs & 3); 291 user_space = panicm.ip && (panicm.cs & 3);
292 292
293 /* 293 /*
294 * If we know that the error was in user space, send a 294 * If we know that the error was in user space, send a
@@ -564,7 +564,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
564 loff_t *off) 564 loff_t *off)
565{ 565{
566 unsigned long *cpu_tsc; 566 unsigned long *cpu_tsc;
567 static DECLARE_MUTEX(mce_read_sem); 567 static DEFINE_MUTEX(mce_read_mutex);
568 unsigned next; 568 unsigned next;
569 char __user *buf = ubuf; 569 char __user *buf = ubuf;
570 int i, err; 570 int i, err;
@@ -573,12 +573,12 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
573 if (!cpu_tsc) 573 if (!cpu_tsc)
574 return -ENOMEM; 574 return -ENOMEM;
575 575
576 down(&mce_read_sem); 576 mutex_lock(&mce_read_mutex);
577 next = rcu_dereference(mcelog.next); 577 next = rcu_dereference(mcelog.next);
578 578
579 /* Only supports full reads right now */ 579 /* Only supports full reads right now */
580 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { 580 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
581 up(&mce_read_sem); 581 mutex_unlock(&mce_read_mutex);
582 kfree(cpu_tsc); 582 kfree(cpu_tsc);
583 return -EINVAL; 583 return -EINVAL;
584 } 584 }
@@ -621,7 +621,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
621 memset(&mcelog.entry[i], 0, sizeof(struct mce)); 621 memset(&mcelog.entry[i], 0, sizeof(struct mce));
622 } 622 }
623 } 623 }
624 up(&mce_read_sem); 624 mutex_unlock(&mce_read_mutex);
625 kfree(cpu_tsc); 625 kfree(cpu_tsc);
626 return err ? -EFAULT : buf - ubuf; 626 return err ? -EFAULT : buf - ubuf;
627} 627}
@@ -634,8 +634,7 @@ static unsigned int mce_poll(struct file *file, poll_table *wait)
634 return 0; 634 return 0;
635} 635}
636 636
637static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, 637static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
638 unsigned long arg)
639{ 638{
640 int __user *p = (int __user *)arg; 639 int __user *p = (int __user *)arg;
641 640
@@ -664,7 +663,7 @@ static const struct file_operations mce_chrdev_ops = {
664 .release = mce_release, 663 .release = mce_release,
665 .read = mce_read, 664 .read = mce_read,
666 .poll = mce_poll, 665 .poll = mce_poll,
667 .ioctl = mce_ioctl, 666 .unlocked_ioctl = mce_ioctl,
668}; 667};
669 668
670static struct miscdevice mce_log_device = { 669static struct miscdevice mce_log_device = {
@@ -855,8 +854,8 @@ static void mce_remove_device(unsigned int cpu)
855} 854}
856 855
857/* Get notified when a cpu comes on/off. Be hotplug friendly. */ 856/* Get notified when a cpu comes on/off. Be hotplug friendly. */
858static int 857static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
859mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 858 unsigned long action, void *hcpu)
860{ 859{
861 unsigned int cpu = (unsigned long)hcpu; 860 unsigned int cpu = (unsigned long)hcpu;
862 861
@@ -873,7 +872,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
873 return NOTIFY_OK; 872 return NOTIFY_OK;
874} 873}
875 874
876static struct notifier_block mce_cpu_notifier = { 875static struct notifier_block mce_cpu_notifier __cpuinitdata = {
877 .notifier_call = mce_cpu_callback, 876 .notifier_call = mce_cpu_callback,
878}; 877};
879 878
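Two conversions in the mce_64.c hunks above are worth calling out: a semaphore used purely for mutual exclusion becomes a real mutex, and the BKL-taking .ioctl handler becomes .unlocked_ioctl with the matching prototype. A minimal sketch of both, with illustrative names:

#include <linux/mutex.h>
#include <linux/fs.h>

static DEFINE_MUTEX(example_mutex);             /* was: DECLARE_MUTEX(example_sem) */

static long example_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
        mutex_lock(&example_mutex);             /* was: down(&example_sem) */
        /* ... handle cmd ... */
        mutex_unlock(&example_mutex);           /* was: up(&example_sem) */
        return 0;
}

static const struct file_operations example_fops = {
        .unlocked_ioctl = example_ioctl,        /* was: .ioctl, which also took an inode */
};
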
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 753588755fee..32671da8184e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -118,6 +118,7 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
118{ 118{
119 unsigned int bank, block; 119 unsigned int bank, block;
120 unsigned int cpu = smp_processor_id(); 120 unsigned int cpu = smp_processor_id();
121 u8 lvt_off;
121 u32 low = 0, high = 0, address = 0; 122 u32 low = 0, high = 0, address = 0;
122 123
123 for (bank = 0; bank < NR_BANKS; ++bank) { 124 for (bank = 0; bank < NR_BANKS; ++bank) {
@@ -153,14 +154,13 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
153 if (shared_bank[bank] && c->cpu_core_id) 154 if (shared_bank[bank] && c->cpu_core_id)
154 break; 155 break;
155#endif 156#endif
157 lvt_off = setup_APIC_eilvt_mce(THRESHOLD_APIC_VECTOR,
158 APIC_EILVT_MSG_FIX, 0);
159
156 high &= ~MASK_LVTOFF_HI; 160 high &= ~MASK_LVTOFF_HI;
157 high |= K8_APIC_EXT_LVT_ENTRY_THRESHOLD << 20; 161 high |= lvt_off << 20;
158 wrmsr(address, low, high); 162 wrmsr(address, low, high);
159 163
160 setup_APIC_extended_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD,
161 THRESHOLD_APIC_VECTOR,
162 K8_APIC_EXT_INT_MSG_FIX, 0);
163
164 threshold_defaults.address = address; 164 threshold_defaults.address = address;
165 threshold_restart_bank(&threshold_defaults, 0, 0); 165 threshold_restart_bank(&threshold_defaults, 0, 0);
166 } 166 }
@@ -450,7 +450,8 @@ recurse:
450 if (err) 450 if (err)
451 goto out_free; 451 goto out_free;
452 452
453 kobject_uevent(&b->kobj, KOBJ_ADD); 453 if (b)
454 kobject_uevent(&b->kobj, KOBJ_ADD);
454 455
455 return err; 456 return err;
456 457
@@ -554,7 +555,7 @@ static __cpuinit int threshold_create_device(unsigned int cpu)
554 int err = 0; 555 int err = 0;
555 556
556 for (bank = 0; bank < NR_BANKS; ++bank) { 557 for (bank = 0; bank < NR_BANKS; ++bank) {
557 if (!(per_cpu(bank_map, cpu) & 1 << bank)) 558 if (!(per_cpu(bank_map, cpu) & (1 << bank)))
558 continue; 559 continue;
559 err = threshold_create_bank(cpu, bank); 560 err = threshold_create_bank(cpu, bank);
560 if (err) 561 if (err)
@@ -637,14 +638,14 @@ static void threshold_remove_device(unsigned int cpu)
637 unsigned int bank; 638 unsigned int bank;
638 639
639 for (bank = 0; bank < NR_BANKS; ++bank) { 640 for (bank = 0; bank < NR_BANKS; ++bank) {
640 if (!(per_cpu(bank_map, cpu) & 1 << bank)) 641 if (!(per_cpu(bank_map, cpu) & (1 << bank)))
641 continue; 642 continue;
642 threshold_remove_bank(cpu, bank); 643 threshold_remove_bank(cpu, bank);
643 } 644 }
644} 645}
645 646
646/* get notified when a cpu comes on/off */ 647/* get notified when a cpu comes on/off */
647static int threshold_cpu_callback(struct notifier_block *nfb, 648static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb,
648 unsigned long action, void *hcpu) 649 unsigned long action, void *hcpu)
649{ 650{
650 /* cpu was unsigned int to begin with */ 651 /* cpu was unsigned int to begin with */
@@ -669,7 +670,7 @@ static int threshold_cpu_callback(struct notifier_block *nfb,
669 return NOTIFY_OK; 670 return NOTIFY_OK;
670} 671}
671 672
672static struct notifier_block threshold_cpu_notifier = { 673static struct notifier_block threshold_cpu_notifier __cpuinitdata = {
673 .notifier_call = threshold_cpu_callback, 674 .notifier_call = threshold_cpu_callback,
674}; 675};
675 676
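The bank_map test touched in the mce_amd_64.c hunks is behaviourally unchanged: '<<' binds tighter than '&' in C, so the added parentheses only make the intent explicit. A tiny illustration:

static int bank_enabled(unsigned long map, unsigned int bank)
{
        return (map & (1 << bank)) != 0;        /* same result without the parentheses */
}
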
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index be4dabfee1f5..cb03345554a5 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -57,7 +57,7 @@ static void intel_thermal_interrupt(struct pt_regs *regs)
57/* Thermal interrupt handler for this CPU setup */ 57/* Thermal interrupt handler for this CPU setup */
58static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt; 58static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt;
59 59
60fastcall void smp_thermal_interrupt(struct pt_regs *regs) 60void smp_thermal_interrupt(struct pt_regs *regs)
61{ 61{
62 irq_enter(); 62 irq_enter();
63 vendor_thermal_interrupt(regs); 63 vendor_thermal_interrupt(regs);
@@ -141,7 +141,7 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
141 rdmsr (MSR_IA32_MCG_EIP, r->eip, h); 141 rdmsr (MSR_IA32_MCG_EIP, r->eip, h);
142} 142}
143 143
144static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) 144static void intel_machine_check(struct pt_regs * regs, long error_code)
145{ 145{
146 int recover=1; 146 int recover=1;
147 u32 alow, ahigh, high, low; 147 u32 alow, ahigh, high, low;
@@ -152,38 +152,41 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code)
152 if (mcgstl & (1<<0)) /* Recoverable ? */ 152 if (mcgstl & (1<<0)) /* Recoverable ? */
153 recover=0; 153 recover=0;
154 154
155 printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", 155 printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
156 smp_processor_id(), mcgsth, mcgstl); 156 smp_processor_id(), mcgsth, mcgstl);
157 157
158 if (mce_num_extended_msrs > 0) { 158 if (mce_num_extended_msrs > 0) {
159 struct intel_mce_extended_msrs dbg; 159 struct intel_mce_extended_msrs dbg;
160 intel_get_extended_msrs(&dbg); 160 intel_get_extended_msrs(&dbg);
161 printk (KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n", 161 printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n"
162 smp_processor_id(), dbg.eip, dbg.eflags); 162 "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n"
163 printk (KERN_DEBUG "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n", 163 "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n",
164 dbg.eax, dbg.ebx, dbg.ecx, dbg.edx); 164 smp_processor_id(), dbg.eip, dbg.eflags,
165 printk (KERN_DEBUG "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", 165 dbg.eax, dbg.ebx, dbg.ecx, dbg.edx,
166 dbg.esi, dbg.edi, dbg.ebp, dbg.esp); 166 dbg.esi, dbg.edi, dbg.ebp, dbg.esp);
167 } 167 }
168 168
169 for (i=0; i<nr_mce_banks; i++) { 169 for (i = 0; i < nr_mce_banks; i++) {
170 rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high); 170 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
171 if (high & (1<<31)) { 171 if (high & (1<<31)) {
172 char misc[20];
173 char addr[24];
174 misc[0] = addr[0] = '\0';
172 if (high & (1<<29)) 175 if (high & (1<<29))
173 recover |= 1; 176 recover |= 1;
174 if (high & (1<<25)) 177 if (high & (1<<25))
175 recover |= 2; 178 recover |= 2;
176 printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
177 high &= ~(1<<31); 179 high &= ~(1<<31);
178 if (high & (1<<27)) { 180 if (high & (1<<27)) {
179 rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh); 181 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
180 printk ("[%08x%08x]", ahigh, alow); 182 snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
181 } 183 }
182 if (high & (1<<26)) { 184 if (high & (1<<26)) {
183 rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh); 185 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
184 printk (" at %08x%08x", ahigh, alow); 186 snprintf(addr, 24, " at %08x%08x", ahigh, alow);
185 } 187 }
186 printk ("\n"); 188 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
189 smp_processor_id(), i, high, low, misc, addr);
187 } 190 }
188 } 191 }
189 192
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index 94bc43d950cf..a18310aaae0c 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -16,7 +16,7 @@
16#include "mce.h" 16#include "mce.h"
17 17
18/* Machine check handler for Pentium class Intel */ 18/* Machine check handler for Pentium class Intel */
19static fastcall void pentium_machine_check(struct pt_regs * regs, long error_code) 19static void pentium_machine_check(struct pt_regs * regs, long error_code)
20{ 20{
21 u32 loaddr, hi, lotype; 21 u32 loaddr, hi, lotype;
22 rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); 22 rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c
index deeae42ce199..74342604d30e 100644
--- a/arch/x86/kernel/cpu/mcheck/p6.c
+++ b/arch/x86/kernel/cpu/mcheck/p6.c
@@ -16,7 +16,7 @@
16#include "mce.h" 16#include "mce.h"
17 17
18/* Machine Check Handler For PII/PIII */ 18/* Machine Check Handler For PII/PIII */
19static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) 19static void intel_machine_check(struct pt_regs * regs, long error_code)
20{ 20{
21 int recover=1; 21 int recover=1;
22 u32 alow, ahigh, high, low; 22 u32 alow, ahigh, high, low;
@@ -27,27 +27,30 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code)
27 if (mcgstl & (1<<0)) /* Recoverable ? */ 27 if (mcgstl & (1<<0)) /* Recoverable ? */
28 recover=0; 28 recover=0;
29 29
30 printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", 30 printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
31 smp_processor_id(), mcgsth, mcgstl); 31 smp_processor_id(), mcgsth, mcgstl);
32 32
33 for (i=0; i<nr_mce_banks; i++) { 33 for (i = 0; i < nr_mce_banks; i++) {
34 rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high); 34 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
35 if (high & (1<<31)) { 35 if (high & (1<<31)) {
36 char misc[20];
37 char addr[24];
38 misc[0] = addr[0] = '\0';
36 if (high & (1<<29)) 39 if (high & (1<<29))
37 recover |= 1; 40 recover |= 1;
38 if (high & (1<<25)) 41 if (high & (1<<25))
39 recover |= 2; 42 recover |= 2;
40 printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
41 high &= ~(1<<31); 43 high &= ~(1<<31);
42 if (high & (1<<27)) { 44 if (high & (1<<27)) {
43 rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh); 45 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
44 printk ("[%08x%08x]", ahigh, alow); 46 snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
45 } 47 }
46 if (high & (1<<26)) { 48 if (high & (1<<26)) {
47 rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh); 49 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
48 printk (" at %08x%08x", ahigh, alow); 50 snprintf(addr, 24, " at %08x%08x", ahigh, alow);
49 } 51 }
50 printk ("\n"); 52 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
53 smp_processor_id(), i, high, low, misc, addr);
51 } 54 }
52 } 55 }
53 56
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 9e424b6c293d..3d428d5afc52 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -15,7 +15,7 @@
15#include "mce.h" 15#include "mce.h"
16 16
17/* Machine check handler for WinChip C6 */ 17/* Machine check handler for WinChip C6 */
18static fastcall void winchip_machine_check(struct pt_regs * regs, long error_code) 18static void winchip_machine_check(struct pt_regs * regs, long error_code)
19{ 19{
20 printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); 20 printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
21 add_taint(TAINT_MACHINE_CHECK); 21 add_taint(TAINT_MACHINE_CHECK);
diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c
index 0949cdbf848a..ee2331b0e58f 100644
--- a/arch/x86/kernel/cpu/mtrr/amd.c
+++ b/arch/x86/kernel/cpu/mtrr/amd.c
@@ -53,8 +53,6 @@ static void amd_set_mtrr(unsigned int reg, unsigned long base,
53 <base> The base address of the region. 53 <base> The base address of the region.
54 <size> The size of the region. If this is 0 the region is disabled. 54 <size> The size of the region. If this is 0 the region is disabled.
55 <type> The type of the region. 55 <type> The type of the region.
56 <do_safe> If TRUE, do the change safely. If FALSE, safety measures should
57 be done externally.
58 [RETURNS] Nothing. 56 [RETURNS] Nothing.
59*/ 57*/
60{ 58{
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
index 9964be3de2b7..8e139c70f888 100644
--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -4,6 +4,7 @@
4#include <asm/msr.h> 4#include <asm/msr.h>
5#include <asm/io.h> 5#include <asm/io.h>
6#include <asm/processor-cyrix.h> 6#include <asm/processor-cyrix.h>
7#include <asm/processor-flags.h>
7#include "mtrr.h" 8#include "mtrr.h"
8 9
9int arr3_protected; 10int arr3_protected;
@@ -142,7 +143,7 @@ static void prepare_set(void)
142 143
143 /* Disable and flush caches. Note that wbinvd flushes the TLBs as 144 /* Disable and flush caches. Note that wbinvd flushes the TLBs as
144 a side-effect */ 145 a side-effect */
145 cr0 = read_cr0() | 0x40000000; 146 cr0 = read_cr0() | X86_CR0_CD;
146 wbinvd(); 147 wbinvd();
147 write_cr0(cr0); 148 write_cr0(cr0);
148 wbinvd(); 149 wbinvd();
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 992f08dfbb6c..103d61a59b19 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -9,11 +9,12 @@
9#include <asm/msr.h> 9#include <asm/msr.h>
10#include <asm/system.h> 10#include <asm/system.h>
11#include <asm/cpufeature.h> 11#include <asm/cpufeature.h>
12#include <asm/processor-flags.h>
12#include <asm/tlbflush.h> 13#include <asm/tlbflush.h>
13#include "mtrr.h" 14#include "mtrr.h"
14 15
15struct mtrr_state { 16struct mtrr_state {
16 struct mtrr_var_range *var_ranges; 17 struct mtrr_var_range var_ranges[MAX_VAR_RANGES];
17 mtrr_type fixed_ranges[NUM_FIXED_RANGES]; 18 mtrr_type fixed_ranges[NUM_FIXED_RANGES];
18 unsigned char enabled; 19 unsigned char enabled;
19 unsigned char have_fixed; 20 unsigned char have_fixed;
@@ -85,12 +86,6 @@ void __init get_mtrr_state(void)
85 struct mtrr_var_range *vrs; 86 struct mtrr_var_range *vrs;
86 unsigned lo, dummy; 87 unsigned lo, dummy;
87 88
88 if (!mtrr_state.var_ranges) {
89 mtrr_state.var_ranges = kmalloc(num_var_ranges * sizeof (struct mtrr_var_range),
90 GFP_KERNEL);
91 if (!mtrr_state.var_ranges)
92 return;
93 }
94 vrs = mtrr_state.var_ranges; 89 vrs = mtrr_state.var_ranges;
95 90
96 rdmsr(MTRRcap_MSR, lo, dummy); 91 rdmsr(MTRRcap_MSR, lo, dummy);
@@ -188,7 +183,7 @@ static inline void k8_enable_fixed_iorrs(void)
188 * \param changed pointer which indicates whether the MTRR needed to be changed 183 * \param changed pointer which indicates whether the MTRR needed to be changed
189 * \param msrwords pointer to the MSR values which the MSR should have 184 * \param msrwords pointer to the MSR values which the MSR should have
190 */ 185 */
191static void set_fixed_range(int msr, int * changed, unsigned int * msrwords) 186static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
192{ 187{
193 unsigned lo, hi; 188 unsigned lo, hi;
194 189
@@ -200,7 +195,7 @@ static void set_fixed_range(int msr, int * changed, unsigned int * msrwords)
200 ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK)) 195 ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK))
201 k8_enable_fixed_iorrs(); 196 k8_enable_fixed_iorrs();
202 mtrr_wrmsr(msr, msrwords[0], msrwords[1]); 197 mtrr_wrmsr(msr, msrwords[0], msrwords[1]);
203 *changed = TRUE; 198 *changed = true;
204 } 199 }
205} 200}
206 201
@@ -260,7 +255,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
260static int set_fixed_ranges(mtrr_type * frs) 255static int set_fixed_ranges(mtrr_type * frs)
261{ 256{
262 unsigned long long *saved = (unsigned long long *) frs; 257 unsigned long long *saved = (unsigned long long *) frs;
263 int changed = FALSE; 258 bool changed = false;
264 int block=-1, range; 259 int block=-1, range;
265 260
266 while (fixed_range_blocks[++block].ranges) 261 while (fixed_range_blocks[++block].ranges)
@@ -273,17 +268,17 @@ static int set_fixed_ranges(mtrr_type * frs)
273 268
274/* Set the MSR pair relating to a var range. Returns TRUE if 269/* Set the MSR pair relating to a var range. Returns TRUE if
275 changes are made */ 270 changes are made */
276static int set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) 271static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
277{ 272{
278 unsigned int lo, hi; 273 unsigned int lo, hi;
279 int changed = FALSE; 274 bool changed = false;
280 275
281 rdmsr(MTRRphysBase_MSR(index), lo, hi); 276 rdmsr(MTRRphysBase_MSR(index), lo, hi);
282 if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL) 277 if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL)
283 || (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) != 278 || (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) !=
284 (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) { 279 (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) {
285 mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi); 280 mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi);
286 changed = TRUE; 281 changed = true;
287 } 282 }
288 283
289 rdmsr(MTRRphysMask_MSR(index), lo, hi); 284 rdmsr(MTRRphysMask_MSR(index), lo, hi);
@@ -292,7 +287,7 @@ static int set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
292 || (vr->mask_hi & (size_and_mask >> (32 - PAGE_SHIFT))) != 287 || (vr->mask_hi & (size_and_mask >> (32 - PAGE_SHIFT))) !=
293 (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) { 288 (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) {
294 mtrr_wrmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); 289 mtrr_wrmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
295 changed = TRUE; 290 changed = true;
296 } 291 }
297 return changed; 292 return changed;
298} 293}
@@ -350,7 +345,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
350 spin_lock(&set_atomicity_lock); 345 spin_lock(&set_atomicity_lock);
351 346
352 /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ 347 /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
353 cr0 = read_cr0() | 0x40000000; /* set CD flag */ 348 cr0 = read_cr0() | X86_CR0_CD;
354 write_cr0(cr0); 349 write_cr0(cr0);
355 wbinvd(); 350 wbinvd();
356 351
@@ -417,8 +412,6 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base,
417 <base> The base address of the region. 412 <base> The base address of the region.
418 <size> The size of the region. If this is 0 the region is disabled. 413 <size> The size of the region. If this is 0 the region is disabled.
419 <type> The type of the region. 414 <type> The type of the region.
420 <do_safe> If TRUE, do the change safely. If FALSE, safety measures should
421 be done externally.
422 [RETURNS] Nothing. 415 [RETURNS] Nothing.
423*/ 416*/
424{ 417{
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index c7d8f1756745..91e150acb46c 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -11,10 +11,6 @@
11#include <asm/mtrr.h> 11#include <asm/mtrr.h>
12#include "mtrr.h" 12#include "mtrr.h"
13 13
14/* RED-PEN: this is accessed without any locking */
15extern unsigned int *usage_table;
16
17
18#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private) 14#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private)
19 15
20static const char *const mtrr_strings[MTRR_NUM_TYPES] = 16static const char *const mtrr_strings[MTRR_NUM_TYPES] =
@@ -37,7 +33,7 @@ const char *mtrr_attrib_to_str(int x)
37 33
38static int 34static int
39mtrr_file_add(unsigned long base, unsigned long size, 35mtrr_file_add(unsigned long base, unsigned long size,
40 unsigned int type, char increment, struct file *file, int page) 36 unsigned int type, bool increment, struct file *file, int page)
41{ 37{
42 int reg, max; 38 int reg, max;
43 unsigned int *fcount = FILE_FCOUNT(file); 39 unsigned int *fcount = FILE_FCOUNT(file);
@@ -55,7 +51,7 @@ mtrr_file_add(unsigned long base, unsigned long size,
55 base >>= PAGE_SHIFT; 51 base >>= PAGE_SHIFT;
56 size >>= PAGE_SHIFT; 52 size >>= PAGE_SHIFT;
57 } 53 }
58 reg = mtrr_add_page(base, size, type, 1); 54 reg = mtrr_add_page(base, size, type, true);
59 if (reg >= 0) 55 if (reg >= 0)
60 ++fcount[reg]; 56 ++fcount[reg];
61 return reg; 57 return reg;
@@ -141,7 +137,7 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
141 size >>= PAGE_SHIFT; 137 size >>= PAGE_SHIFT;
142 err = 138 err =
143 mtrr_add_page((unsigned long) base, (unsigned long) size, i, 139 mtrr_add_page((unsigned long) base, (unsigned long) size, i,
144 1); 140 true);
145 if (err < 0) 141 if (err < 0)
146 return err; 142 return err;
147 return len; 143 return len;
@@ -217,7 +213,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
217 if (!capable(CAP_SYS_ADMIN)) 213 if (!capable(CAP_SYS_ADMIN))
218 return -EPERM; 214 return -EPERM;
219 err = 215 err =
220 mtrr_file_add(sentry.base, sentry.size, sentry.type, 1, 216 mtrr_file_add(sentry.base, sentry.size, sentry.type, true,
221 file, 0); 217 file, 0);
222 break; 218 break;
223 case MTRRIOC_SET_ENTRY: 219 case MTRRIOC_SET_ENTRY:
@@ -226,7 +222,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
226#endif 222#endif
227 if (!capable(CAP_SYS_ADMIN)) 223 if (!capable(CAP_SYS_ADMIN))
228 return -EPERM; 224 return -EPERM;
229 err = mtrr_add(sentry.base, sentry.size, sentry.type, 0); 225 err = mtrr_add(sentry.base, sentry.size, sentry.type, false);
230 break; 226 break;
231 case MTRRIOC_DEL_ENTRY: 227 case MTRRIOC_DEL_ENTRY:
232#ifdef CONFIG_COMPAT 228#ifdef CONFIG_COMPAT
@@ -270,7 +266,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
270 if (!capable(CAP_SYS_ADMIN)) 266 if (!capable(CAP_SYS_ADMIN))
271 return -EPERM; 267 return -EPERM;
272 err = 268 err =
273 mtrr_file_add(sentry.base, sentry.size, sentry.type, 1, 269 mtrr_file_add(sentry.base, sentry.size, sentry.type, true,
274 file, 1); 270 file, 1);
275 break; 271 break;
276 case MTRRIOC_SET_PAGE_ENTRY: 272 case MTRRIOC_SET_PAGE_ENTRY:
@@ -279,7 +275,8 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
279#endif 275#endif
280 if (!capable(CAP_SYS_ADMIN)) 276 if (!capable(CAP_SYS_ADMIN))
281 return -EPERM; 277 return -EPERM;
282 err = mtrr_add_page(sentry.base, sentry.size, sentry.type, 0); 278 err =
279 mtrr_add_page(sentry.base, sentry.size, sentry.type, false);
283 break; 280 break;
284 case MTRRIOC_DEL_PAGE_ENTRY: 281 case MTRRIOC_DEL_PAGE_ENTRY:
285#ifdef CONFIG_COMPAT 282#ifdef CONFIG_COMPAT
@@ -396,7 +393,7 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
396 for (i = 0; i < max; i++) { 393 for (i = 0; i < max; i++) {
397 mtrr_if->get(i, &base, &size, &type); 394 mtrr_if->get(i, &base, &size, &type);
398 if (size == 0) 395 if (size == 0)
399 usage_table[i] = 0; 396 mtrr_usage_table[i] = 0;
400 else { 397 else {
401 if (size < (0x100000 >> PAGE_SHIFT)) { 398 if (size < (0x100000 >> PAGE_SHIFT)) {
402 /* less than 1MB */ 399 /* less than 1MB */
@@ -410,7 +407,7 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
410 len += seq_printf(seq, 407 len += seq_printf(seq,
411 "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n", 408 "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n",
412 i, base, base >> (20 - PAGE_SHIFT), size, factor, 409 i, base, base >> (20 - PAGE_SHIFT), size, factor,
413 mtrr_attrib_to_str(type), usage_table[i]); 410 mtrr_attrib_to_str(type), mtrr_usage_table[i]);
414 } 411 }
415 } 412 }
416 return 0; 413 return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index beb45c9c0835..715919582657 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -38,8 +38,8 @@
38#include <linux/cpu.h> 38#include <linux/cpu.h>
39#include <linux/mutex.h> 39#include <linux/mutex.h>
40 40
41#include <asm/e820.h>
41#include <asm/mtrr.h> 42#include <asm/mtrr.h>
42
43#include <asm/uaccess.h> 43#include <asm/uaccess.h>
44#include <asm/processor.h> 44#include <asm/processor.h>
45#include <asm/msr.h> 45#include <asm/msr.h>
@@ -47,7 +47,7 @@
47 47
48u32 num_var_ranges = 0; 48u32 num_var_ranges = 0;
49 49
50unsigned int *usage_table; 50unsigned int mtrr_usage_table[MAX_VAR_RANGES];
51static DEFINE_MUTEX(mtrr_mutex); 51static DEFINE_MUTEX(mtrr_mutex);
52 52
53u64 size_or_mask, size_and_mask; 53u64 size_or_mask, size_and_mask;
@@ -121,13 +121,8 @@ static void __init init_table(void)
121 int i, max; 121 int i, max;
122 122
123 max = num_var_ranges; 123 max = num_var_ranges;
124 if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL))
125 == NULL) {
126 printk(KERN_ERR "mtrr: could not allocate\n");
127 return;
128 }
129 for (i = 0; i < max; i++) 124 for (i = 0; i < max; i++)
130 usage_table[i] = 1; 125 mtrr_usage_table[i] = 1;
131} 126}
132 127
133struct set_mtrr_data { 128struct set_mtrr_data {
@@ -311,7 +306,7 @@ static void set_mtrr(unsigned int reg, unsigned long base,
311 */ 306 */
312 307
313int mtrr_add_page(unsigned long base, unsigned long size, 308int mtrr_add_page(unsigned long base, unsigned long size,
314 unsigned int type, char increment) 309 unsigned int type, bool increment)
315{ 310{
316 int i, replace, error; 311 int i, replace, error;
317 mtrr_type ltype; 312 mtrr_type ltype;
@@ -383,7 +378,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
383 goto out; 378 goto out;
384 } 379 }
385 if (increment) 380 if (increment)
386 ++usage_table[i]; 381 ++mtrr_usage_table[i];
387 error = i; 382 error = i;
388 goto out; 383 goto out;
389 } 384 }
@@ -391,13 +386,15 @@ int mtrr_add_page(unsigned long base, unsigned long size,
391 i = mtrr_if->get_free_region(base, size, replace); 386 i = mtrr_if->get_free_region(base, size, replace);
392 if (i >= 0) { 387 if (i >= 0) {
393 set_mtrr(i, base, size, type); 388 set_mtrr(i, base, size, type);
394 if (likely(replace < 0)) 389 if (likely(replace < 0)) {
395 usage_table[i] = 1; 390 mtrr_usage_table[i] = 1;
396 else { 391 } else {
397 usage_table[i] = usage_table[replace] + !!increment; 392 mtrr_usage_table[i] = mtrr_usage_table[replace];
393 if (increment)
394 mtrr_usage_table[i]++;
398 if (unlikely(replace != i)) { 395 if (unlikely(replace != i)) {
399 set_mtrr(replace, 0, 0, 0); 396 set_mtrr(replace, 0, 0, 0);
400 usage_table[replace] = 0; 397 mtrr_usage_table[replace] = 0;
401 } 398 }
402 } 399 }
403 } else 400 } else
@@ -460,7 +457,7 @@ static int mtrr_check(unsigned long base, unsigned long size)
460 457
461int 458int
462mtrr_add(unsigned long base, unsigned long size, unsigned int type, 459mtrr_add(unsigned long base, unsigned long size, unsigned int type,
463 char increment) 460 bool increment)
464{ 461{
465 if (mtrr_check(base, size)) 462 if (mtrr_check(base, size))
466 return -EINVAL; 463 return -EINVAL;
@@ -527,11 +524,11 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
527 printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg); 524 printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg);
528 goto out; 525 goto out;
529 } 526 }
530 if (usage_table[reg] < 1) { 527 if (mtrr_usage_table[reg] < 1) {
531 printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg); 528 printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
532 goto out; 529 goto out;
533 } 530 }
534 if (--usage_table[reg] < 1) 531 if (--mtrr_usage_table[reg] < 1)
535 set_mtrr(reg, 0, 0, 0); 532 set_mtrr(reg, 0, 0, 0);
536 error = reg; 533 error = reg;
537 out: 534 out:
@@ -591,16 +588,11 @@ struct mtrr_value {
591 unsigned long lsize; 588 unsigned long lsize;
592}; 589};
593 590
594static struct mtrr_value * mtrr_state; 591static struct mtrr_value mtrr_state[MAX_VAR_RANGES];
595 592
596static int mtrr_save(struct sys_device * sysdev, pm_message_t state) 593static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
597{ 594{
598 int i; 595 int i;
599 int size = num_var_ranges * sizeof(struct mtrr_value);
600
601 mtrr_state = kzalloc(size,GFP_ATOMIC);
602 if (!mtrr_state)
603 return -ENOMEM;
604 596
605 for (i = 0; i < num_var_ranges; i++) { 597 for (i = 0; i < num_var_ranges; i++) {
606 mtrr_if->get(i, 598 mtrr_if->get(i,
@@ -622,7 +614,6 @@ static int mtrr_restore(struct sys_device * sysdev)
622 mtrr_state[i].lsize, 614 mtrr_state[i].lsize,
623 mtrr_state[i].ltype); 615 mtrr_state[i].ltype);
624 } 616 }
625 kfree(mtrr_state);
626 return 0; 617 return 0;
627} 618}
628 619
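The suspend/resume hunk above swaps a kzalloc()'d table for a fixed-size static array bounded by MAX_VAR_RANGES, removing the GFP_ATOMIC allocation and its failure path from the suspend hook. A sketch of the shape of that change, with illustrative names and a stand-in bound:

#define EX_MAX_RANGES 256                       /* stands in for MAX_VAR_RANGES */

struct ex_value { unsigned long base, size; };

static struct ex_value ex_saved[EX_MAX_RANGES]; /* was: kzalloc(num * sizeof(...), GFP_ATOMIC) */

static int ex_save(unsigned int num)
{
        unsigned int i;

        for (i = 0; i < num && i < EX_MAX_RANGES; i++)
                ex_saved[i].base = 0;           /* real code would read hardware state here */
        return 0;                               /* no -ENOMEM path any more */
}
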
@@ -633,6 +624,112 @@ static struct sysdev_driver mtrr_sysdev_driver = {
633 .resume = mtrr_restore, 624 .resume = mtrr_restore,
634}; 625};
635 626
627static int disable_mtrr_trim;
628
629static int __init disable_mtrr_trim_setup(char *str)
630{
631 disable_mtrr_trim = 1;
632 return 0;
633}
634early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
635
636/*
637 * Newer AMD K8s and later CPUs have a special magic MSR way to force WB
638 * for memory >4GB. Check for that here.
639 * Note this won't check if the MTRRs < 4GB where the magic bit doesn't
640 * apply to are wrong, but so far we don't know of any such case in the wild.
641 */
642#define Tom2Enabled (1U << 21)
643#define Tom2ForceMemTypeWB (1U << 22)
644
645static __init int amd_special_default_mtrr(void)
646{
647 u32 l, h;
648
649 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
650 return 0;
651 if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
652 return 0;
653 /* In case some hypervisor doesn't pass SYSCFG through */
654 if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
655 return 0;
656 /*
657 * Memory between 4GB and top of mem is forced WB by this magic bit.
658 * Reserved before K8RevF, but should be zero there.
659 */
660 if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
661 (Tom2Enabled | Tom2ForceMemTypeWB))
662 return 1;
663 return 0;
664}
665
666/**
667 * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
668 *
669 * Some buggy BIOSes don't setup the MTRRs properly for systems with certain
670 * memory configurations. This routine checks that the highest MTRR matches
671 * the end of memory, to make sure the MTRRs having a write back type cover
672 * all of the memory the kernel is intending to use. If not, it'll trim any
673 * memory off the end by adjusting end_pfn, removing it from the kernel's
674 * allocation pools, warning the user with an obnoxious message.
675 */
676int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
677{
678 unsigned long i, base, size, highest_addr = 0, def, dummy;
679 mtrr_type type;
680 u64 trim_start, trim_size;
681
682 /*
683 * Make sure we only trim uncachable memory on machines that
684 * support the Intel MTRR architecture:
685 */
686 if (!is_cpu(INTEL) || disable_mtrr_trim)
687 return 0;
688 rdmsr(MTRRdefType_MSR, def, dummy);
689 def &= 0xff;
690 if (def != MTRR_TYPE_UNCACHABLE)
691 return 0;
692
693 if (amd_special_default_mtrr())
694 return 0;
695
696 /* Find highest cached pfn */
697 for (i = 0; i < num_var_ranges; i++) {
698 mtrr_if->get(i, &base, &size, &type);
699 if (type != MTRR_TYPE_WRBACK)
700 continue;
701 base <<= PAGE_SHIFT;
702 size <<= PAGE_SHIFT;
703 if (highest_addr < base + size)
704 highest_addr = base + size;
705 }
706
707 /* kvm/qemu doesn't have mtrr set right, don't trim them all */
708 if (!highest_addr) {
709 printk(KERN_WARNING "WARNING: strange, CPU MTRRs all blank?\n");
710 WARN_ON(1);
711 return 0;
712 }
713
714 if ((highest_addr >> PAGE_SHIFT) < end_pfn) {
715 printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
716 " all of memory, losing %LdMB of RAM.\n",
717 (((u64)end_pfn << PAGE_SHIFT) - highest_addr) >> 20);
718
719 WARN_ON(1);
720
721 printk(KERN_INFO "update e820 for mtrr\n");
722 trim_start = highest_addr;
723 trim_size = end_pfn;
724 trim_size <<= PAGE_SHIFT;
725 trim_size -= trim_start;
726 add_memory_region(trim_start, trim_size, E820_RESERVED);
727 update_e820();
728 return 1;
729 }
730
731 return 0;
732}
636 733
637/** 734/**
638 * mtrr_bp_init - initialize mtrrs on the boot CPU 735 * mtrr_bp_init - initialize mtrrs on the boot CPU
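mtrr/main.c above also gains a "disable_mtrr_trim" kernel command-line switch registered with early_param(), whose handler runs during early boot parameter parsing, before ordinary __setup() handlers. The registration pattern, with illustrative names:

#include <linux/init.h>

static int example_feature_off;

static int __init example_off_setup(char *str)
{
        example_feature_off = 1;        /* triggered by "example_off" on the command line */
        return 0;                       /* 0 = parsed successfully */
}
early_param("example_off", example_off_setup);
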
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 289dfe6030e3..fb74a2c20814 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -2,10 +2,8 @@
2 * local mtrr defines. 2 * local mtrr defines.
3 */ 3 */
4 4
5#ifndef TRUE 5#include <linux/types.h>
6#define TRUE 1 6#include <linux/stddef.h>
7#define FALSE 0
8#endif
9 7
10#define MTRRcap_MSR 0x0fe 8#define MTRRcap_MSR 0x0fe
11#define MTRRdefType_MSR 0x2ff 9#define MTRRdefType_MSR 0x2ff
@@ -14,6 +12,7 @@
14#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1) 12#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
15 13
16#define NUM_FIXED_RANGES 88 14#define NUM_FIXED_RANGES 88
15#define MAX_VAR_RANGES 256
17#define MTRRfix64K_00000_MSR 0x250 16#define MTRRfix64K_00000_MSR 0x250
18#define MTRRfix16K_80000_MSR 0x258 17#define MTRRfix16K_80000_MSR 0x258
19#define MTRRfix16K_A0000_MSR 0x259 18#define MTRRfix16K_A0000_MSR 0x259
@@ -34,6 +33,8 @@
34 an 8 bit field: */ 33 an 8 bit field: */
35typedef u8 mtrr_type; 34typedef u8 mtrr_type;
36 35
36extern unsigned int mtrr_usage_table[MAX_VAR_RANGES];
37
37struct mtrr_ops { 38struct mtrr_ops {
38 u32 vendor; 39 u32 vendor;
39 u32 use_intel_if; 40 u32 use_intel_if;
diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c
index 49e20c2afcdf..9f8ba923d1c9 100644
--- a/arch/x86/kernel/cpu/mtrr/state.c
+++ b/arch/x86/kernel/cpu/mtrr/state.c
@@ -4,6 +4,7 @@
4#include <asm/mtrr.h> 4#include <asm/mtrr.h>
5#include <asm/msr.h> 5#include <asm/msr.h>
6#include <asm/processor-cyrix.h> 6#include <asm/processor-cyrix.h>
7#include <asm/processor-flags.h>
7#include "mtrr.h" 8#include "mtrr.h"
8 9
9 10
@@ -25,7 +26,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
25 26
26 /* Disable and flush caches. Note that wbinvd flushes the TLBs as 27 /* Disable and flush caches. Note that wbinvd flushes the TLBs as
27 a side-effect */ 28 a side-effect */
28 cr0 = read_cr0() | 0x40000000; 29 cr0 = read_cr0() | X86_CR0_CD;
29 wbinvd(); 30 wbinvd();
30 write_cr0(cr0); 31 write_cr0(cr0);
31 wbinvd(); 32 wbinvd();
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index c02541e6e653..9b838324b818 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -167,7 +167,6 @@ void release_evntsel_nmi(unsigned int msr)
167 clear_bit(counter, evntsel_nmi_owner); 167 clear_bit(counter, evntsel_nmi_owner);
168} 168}
169 169
170EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
171EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); 170EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
172EXPORT_SYMBOL(reserve_perfctr_nmi); 171EXPORT_SYMBOL(reserve_perfctr_nmi);
173EXPORT_SYMBOL(release_perfctr_nmi); 172EXPORT_SYMBOL(release_perfctr_nmi);
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 3900e46d66db..028213260148 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -188,7 +188,7 @@ static void *c_next(struct seq_file *m, void *v, loff_t *pos)
188static void c_stop(struct seq_file *m, void *v) 188static void c_stop(struct seq_file *m, void *v)
189{ 189{
190} 190}
191struct seq_operations cpuinfo_op = { 191const struct seq_operations cpuinfo_op = {
192 .start = c_start, 192 .start = c_start,
193 .next = c_next, 193 .next = c_next,
194 .stop = c_stop, 194 .stop = c_stop,
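Marking cpuinfo_op const, as in the proc.c hunk above, lets the ops table live in read-only data; nothing writes to it after build time. A self-contained sketch with the standard seq_file callback signatures:

#include <linux/seq_file.h>

static void *ex_start(struct seq_file *m, loff_t *pos) { return NULL; }
static void *ex_next(struct seq_file *m, void *v, loff_t *pos) { return NULL; }
static void ex_stop(struct seq_file *m, void *v) { }
static int ex_show(struct seq_file *m, void *v) { return 0; }

static const struct seq_operations example_seq_ops = {
        .start = ex_start,
        .next  = ex_next,
        .stop  = ex_stop,
        .show  = ex_show,
};
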
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index d387c770c518..dec66e452810 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -50,7 +50,7 @@ struct cpuid_command {
50 50
51static void cpuid_smp_cpuid(void *cmd_block) 51static void cpuid_smp_cpuid(void *cmd_block)
52{ 52{
53 struct cpuid_command *cmd = (struct cpuid_command *)cmd_block; 53 struct cpuid_command *cmd = cmd_block;
54 54
55 cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2], 55 cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2],
56 &cmd->data[3]); 56 &cmd->data[3]);
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
index 40978af630e7..a47798b59f07 100644
--- a/arch/x86/kernel/doublefault_32.c
+++ b/arch/x86/kernel/doublefault_32.c
@@ -17,7 +17,7 @@ static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
17 17
18static void doublefault_fn(void) 18static void doublefault_fn(void)
19{ 19{
20 struct Xgt_desc_struct gdt_desc = {0, 0}; 20 struct desc_ptr gdt_desc = {0, 0};
21 unsigned long gdt, tss; 21 unsigned long gdt, tss;
22 22
23 store_gdt(&gdt_desc); 23 store_gdt(&gdt_desc);
@@ -33,14 +33,15 @@ static void doublefault_fn(void)
33 printk(KERN_EMERG "double fault, tss at %08lx\n", tss); 33 printk(KERN_EMERG "double fault, tss at %08lx\n", tss);
34 34
35 if (ptr_ok(tss)) { 35 if (ptr_ok(tss)) {
36 struct i386_hw_tss *t = (struct i386_hw_tss *)tss; 36 struct x86_hw_tss *t = (struct x86_hw_tss *)tss;
37 37
38 printk(KERN_EMERG "eip = %08lx, esp = %08lx\n", t->eip, t->esp); 38 printk(KERN_EMERG "eip = %08lx, esp = %08lx\n",
39 t->ip, t->sp);
39 40
40 printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n", 41 printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
41 t->eax, t->ebx, t->ecx, t->edx); 42 t->ax, t->bx, t->cx, t->dx);
42 printk(KERN_EMERG "esi = %08lx, edi = %08lx\n", 43 printk(KERN_EMERG "esi = %08lx, edi = %08lx\n",
43 t->esi, t->edi); 44 t->si, t->di);
44 } 45 }
45 } 46 }
46 47
@@ -50,15 +51,15 @@ static void doublefault_fn(void)
50 51
51struct tss_struct doublefault_tss __cacheline_aligned = { 52struct tss_struct doublefault_tss __cacheline_aligned = {
52 .x86_tss = { 53 .x86_tss = {
53 .esp0 = STACK_START, 54 .sp0 = STACK_START,
54 .ss0 = __KERNEL_DS, 55 .ss0 = __KERNEL_DS,
55 .ldt = 0, 56 .ldt = 0,
56 .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, 57 .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
57 58
58 .eip = (unsigned long) doublefault_fn, 59 .ip = (unsigned long) doublefault_fn,
59 /* 0x2 bit is always set */ 60 /* 0x2 bit is always set */
60 .eflags = X86_EFLAGS_SF | 0x2, 61 .flags = X86_EFLAGS_SF | 0x2,
61 .esp = STACK_START, 62 .sp = STACK_START,
62 .es = __USER_DS, 63 .es = __USER_DS,
63 .cs = __KERNEL_CS, 64 .cs = __KERNEL_CS,
64 .ss = __KERNEL_DS, 65 .ss = __KERNEL_DS,
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
new file mode 100644
index 000000000000..1c5ca4d18787
--- /dev/null
+++ b/arch/x86/kernel/ds.c
@@ -0,0 +1,464 @@
1/*
2 * Debug Store support
3 *
4 * This provides a low-level interface to the hardware's Debug Store
5 * feature that is used for last branch recording (LBR) and
6 * precise-event based sampling (PEBS).
7 *
8 * Different architectures use a different DS layout/pointer size.
9 * The below functions therefore work on a void*.
10 *
11 *
12 * Since there is no user for PEBS yet, only LBR (or branch
13 * trace store, BTS) is supported.
14 *
15 *
16 * Copyright (C) 2007 Intel Corporation.
17 * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
18 */
19
20#include <asm/ds.h>
21
22#include <linux/errno.h>
23#include <linux/string.h>
24#include <linux/slab.h>
25
26
27/*
28 * Debug Store (DS) save area configuration (see Intel64 and IA32
29 * Architectures Software Developer's Manual, section 18.5)
30 *
31 * The DS configuration consists of the following fields; different
32 * architectures vary in the size of those fields.
33 * - double-word aligned base linear address of the BTS buffer
34 * - write pointer into the BTS buffer
35 * - end linear address of the BTS buffer (one byte beyond the end of
36 * the buffer)
37 * - interrupt pointer into BTS buffer
38 * (interrupt occurs when write pointer passes interrupt pointer)
39 * - double-word aligned base linear address of the PEBS buffer
40 * - write pointer into the PEBS buffer
41 * - end linear address of the PEBS buffer (one byte beyond the end of
42 * the buffer)
43 * - interrupt pointer into PEBS buffer
44 * (interrupt occurs when write pointer passes interrupt pointer)
45 * - value to which counter is reset following counter overflow
46 *
47 * On later architectures, the last branch recording hardware uses
48 * 64bit pointers even in 32bit mode.
49 *
50 *
51 * Branch Trace Store (BTS) records store information about control
52 * flow changes. They at least provide the following information:
53 * - source linear address
54 * - destination linear address
55 *
56 * Netburst supported a predicated bit that had been dropped in later
57 * architectures. We do not support it.
58 *
59 *
60 * In order to abstract from the actual DS and BTS layout, we describe
61 * the access to the relevant fields.
62 * Thanks to Andi Kleen for proposing this design.
63 *
64 * The implementation, however, is not as general as it might seem. In
65 * order to stay somewhat simple and efficient, we assume an
66 * underlying unsigned type (mostly a pointer type) and we expect the
67 * field to be at least as big as that type.
68 */
69
70/*
71 * A special from_ip address to indicate that the BTS record is an
72 * info record that needs to be interpreted or skipped.
73 */
74#define BTS_ESCAPE_ADDRESS (-1)
75
76/*
77 * A field access descriptor
78 */
79struct access_desc {
80 unsigned char offset;
81 unsigned char size;
82};
83
84/*
85 * The configuration for a particular DS/BTS hardware implementation.
86 */
87struct ds_configuration {
88 /* the DS configuration */
89 unsigned char sizeof_ds;
90 struct access_desc bts_buffer_base;
91 struct access_desc bts_index;
92 struct access_desc bts_absolute_maximum;
93 struct access_desc bts_interrupt_threshold;
94 /* the BTS configuration */
95 unsigned char sizeof_bts;
96 struct access_desc from_ip;
97 struct access_desc to_ip;
98 /* BTS variants used to store additional information like
99 timestamps */
100 struct access_desc info_type;
101 struct access_desc info_data;
102 unsigned long debugctl_mask;
103};
104
105/*
106 * The global configuration used by the below accessor functions
107 */
108static struct ds_configuration ds_cfg;
109
110/*
111 * Accessor functions for some DS and BTS fields using the above
112 * global ds_cfg.
113 */
114static inline unsigned long get_bts_buffer_base(char *base)
115{
116 return *(unsigned long *)(base + ds_cfg.bts_buffer_base.offset);
117}
118static inline void set_bts_buffer_base(char *base, unsigned long value)
119{
120 (*(unsigned long *)(base + ds_cfg.bts_buffer_base.offset)) = value;
121}
122static inline unsigned long get_bts_index(char *base)
123{
124 return *(unsigned long *)(base + ds_cfg.bts_index.offset);
125}
126static inline void set_bts_index(char *base, unsigned long value)
127{
128 (*(unsigned long *)(base + ds_cfg.bts_index.offset)) = value;
129}
130static inline unsigned long get_bts_absolute_maximum(char *base)
131{
132 return *(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset);
133}
134static inline void set_bts_absolute_maximum(char *base, unsigned long value)
135{
136 (*(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset)) = value;
137}
138static inline unsigned long get_bts_interrupt_threshold(char *base)
139{
140 return *(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset);
141}
142static inline void set_bts_interrupt_threshold(char *base, unsigned long value)
143{
144 (*(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset)) = value;
145}
146static inline unsigned long get_from_ip(char *base)
147{
148 return *(unsigned long *)(base + ds_cfg.from_ip.offset);
149}
150static inline void set_from_ip(char *base, unsigned long value)
151{
152 (*(unsigned long *)(base + ds_cfg.from_ip.offset)) = value;
153}
154static inline unsigned long get_to_ip(char *base)
155{
156 return *(unsigned long *)(base + ds_cfg.to_ip.offset);
157}
158static inline void set_to_ip(char *base, unsigned long value)
159{
160 (*(unsigned long *)(base + ds_cfg.to_ip.offset)) = value;
161}
162static inline unsigned char get_info_type(char *base)
163{
164 return *(unsigned char *)(base + ds_cfg.info_type.offset);
165}
166static inline void set_info_type(char *base, unsigned char value)
167{
168 (*(unsigned char *)(base + ds_cfg.info_type.offset)) = value;
169}
170static inline unsigned long get_info_data(char *base)
171{
172 return *(unsigned long *)(base + ds_cfg.info_data.offset);
173}
174static inline void set_info_data(char *base, unsigned long value)
175{
176 (*(unsigned long *)(base + ds_cfg.info_data.offset)) = value;
177}
178
179
180int ds_allocate(void **dsp, size_t bts_size_in_bytes)
181{
182 size_t bts_size_in_records;
183 unsigned long bts;
184 void *ds;
185
186 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
187 return -EOPNOTSUPP;
188
189 if (bts_size_in_bytes < 0)
190 return -EINVAL;
191
192 bts_size_in_records =
193 bts_size_in_bytes / ds_cfg.sizeof_bts;
194 bts_size_in_bytes =
195 bts_size_in_records * ds_cfg.sizeof_bts;
196
197 if (bts_size_in_bytes <= 0)
198 return -EINVAL;
199
200 bts = (unsigned long)kzalloc(bts_size_in_bytes, GFP_KERNEL);
201
202 if (!bts)
203 return -ENOMEM;
204
205 ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
206
207 if (!ds) {
208 kfree((void *)bts);
209 return -ENOMEM;
210 }
211
212 set_bts_buffer_base(ds, bts);
213 set_bts_index(ds, bts);
214 set_bts_absolute_maximum(ds, bts + bts_size_in_bytes);
215 set_bts_interrupt_threshold(ds, bts + bts_size_in_bytes + 1);
216
217 *dsp = ds;
218 return 0;
219}
220
221int ds_free(void **dsp)
222{
223 if (*dsp)
224 kfree((void *)get_bts_buffer_base(*dsp));
225 kfree(*dsp);
226 *dsp = 0;
227
228 return 0;
229}
230
231int ds_get_bts_size(void *ds)
232{
233 int size_in_bytes;
234
235 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
236 return -EOPNOTSUPP;
237
238 if (!ds)
239 return 0;
240
241 size_in_bytes =
242 get_bts_absolute_maximum(ds) -
243 get_bts_buffer_base(ds);
244 return size_in_bytes;
245}
246
247int ds_get_bts_end(void *ds)
248{
249 int size_in_bytes = ds_get_bts_size(ds);
250
251 if (size_in_bytes <= 0)
252 return size_in_bytes;
253
254 return size_in_bytes / ds_cfg.sizeof_bts;
255}
256
257int ds_get_bts_index(void *ds)
258{
259 int index_offset_in_bytes;
260
261 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
262 return -EOPNOTSUPP;
263
264 index_offset_in_bytes =
265 get_bts_index(ds) -
266 get_bts_buffer_base(ds);
267
268 return index_offset_in_bytes / ds_cfg.sizeof_bts;
269}
270
271int ds_set_overflow(void *ds, int method)
272{
273 switch (method) {
274 case DS_O_SIGNAL:
275 return -EOPNOTSUPP;
276 case DS_O_WRAP:
277 return 0;
278 default:
279 return -EINVAL;
280 }
281}
282
283int ds_get_overflow(void *ds)
284{
285 return DS_O_WRAP;
286}
287
288int ds_clear(void *ds)
289{
290 int bts_size = ds_get_bts_size(ds);
291 unsigned long bts_base;
292
293 if (bts_size <= 0)
294 return bts_size;
295
296 bts_base = get_bts_buffer_base(ds);
297 memset((void *)bts_base, 0, bts_size);
298
299 set_bts_index(ds, bts_base);
300 return 0;
301}
302
303int ds_read_bts(void *ds, int index, struct bts_struct *out)
304{
305 void *bts;
306
307 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
308 return -EOPNOTSUPP;
309
310 if (index < 0)
311 return -EINVAL;
312
313 if (index >= ds_get_bts_size(ds))
314 return -EINVAL;
315
316 bts = (void *)(get_bts_buffer_base(ds) + (index * ds_cfg.sizeof_bts));
317
318 memset(out, 0, sizeof(*out));
319 if (get_from_ip(bts) == BTS_ESCAPE_ADDRESS) {
320 out->qualifier = get_info_type(bts);
321 out->variant.jiffies = get_info_data(bts);
322 } else {
323 out->qualifier = BTS_BRANCH;
324 out->variant.lbr.from_ip = get_from_ip(bts);
325 out->variant.lbr.to_ip = get_to_ip(bts);
326 }
327
328	return sizeof(*out);
329}
330
331int ds_write_bts(void *ds, const struct bts_struct *in)
332{
333 unsigned long bts;
334
335 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
336 return -EOPNOTSUPP;
337
338 if (ds_get_bts_size(ds) <= 0)
339 return -ENXIO;
340
341 bts = get_bts_index(ds);
342
343 memset((void *)bts, 0, ds_cfg.sizeof_bts);
344 switch (in->qualifier) {
345 case BTS_INVALID:
346 break;
347
348 case BTS_BRANCH:
349 set_from_ip((void *)bts, in->variant.lbr.from_ip);
350 set_to_ip((void *)bts, in->variant.lbr.to_ip);
351 break;
352
353 case BTS_TASK_ARRIVES:
354 case BTS_TASK_DEPARTS:
355 set_from_ip((void *)bts, BTS_ESCAPE_ADDRESS);
356 set_info_type((void *)bts, in->qualifier);
357 set_info_data((void *)bts, in->variant.jiffies);
358 break;
359
360 default:
361 return -EINVAL;
362 }
363
364 bts = bts + ds_cfg.sizeof_bts;
365 if (bts >= get_bts_absolute_maximum(ds))
366 bts = get_bts_buffer_base(ds);
367 set_bts_index(ds, bts);
368
369 return ds_cfg.sizeof_bts;
370}
371
372unsigned long ds_debugctl_mask(void)
373{
374 return ds_cfg.debugctl_mask;
375}
376
377#ifdef __i386__
378static const struct ds_configuration ds_cfg_netburst = {
379 .sizeof_ds = 9 * 4,
380 .bts_buffer_base = { 0, 4 },
381 .bts_index = { 4, 4 },
382 .bts_absolute_maximum = { 8, 4 },
383 .bts_interrupt_threshold = { 12, 4 },
384 .sizeof_bts = 3 * 4,
385 .from_ip = { 0, 4 },
386 .to_ip = { 4, 4 },
387 .info_type = { 4, 1 },
388 .info_data = { 8, 4 },
389 .debugctl_mask = (1<<2)|(1<<3)
390};
391
392static const struct ds_configuration ds_cfg_pentium_m = {
393 .sizeof_ds = 9 * 4,
394 .bts_buffer_base = { 0, 4 },
395 .bts_index = { 4, 4 },
396 .bts_absolute_maximum = { 8, 4 },
397 .bts_interrupt_threshold = { 12, 4 },
398 .sizeof_bts = 3 * 4,
399 .from_ip = { 0, 4 },
400 .to_ip = { 4, 4 },
401 .info_type = { 4, 1 },
402 .info_data = { 8, 4 },
403 .debugctl_mask = (1<<6)|(1<<7)
404};
405#endif /* __i386__ */
406
407static const struct ds_configuration ds_cfg_core2 = {
408 .sizeof_ds = 9 * 8,
409 .bts_buffer_base = { 0, 8 },
410 .bts_index = { 8, 8 },
411 .bts_absolute_maximum = { 16, 8 },
412 .bts_interrupt_threshold = { 24, 8 },
413 .sizeof_bts = 3 * 8,
414 .from_ip = { 0, 8 },
415 .to_ip = { 8, 8 },
416 .info_type = { 8, 1 },
417 .info_data = { 16, 8 },
418 .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
419};
420
421static inline void
422ds_configure(const struct ds_configuration *cfg)
423{
424 ds_cfg = *cfg;
425}
426
427void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
428{
429 switch (c->x86) {
430 case 0x6:
431 switch (c->x86_model) {
432#ifdef __i386__
433 case 0xD:
434 case 0xE: /* Pentium M */
435 ds_configure(&ds_cfg_pentium_m);
436 break;
437#endif /* __i386__ */
438 case 0xF: /* Core2 */
439 ds_configure(&ds_cfg_core2);
440 break;
441 default:
442 /* sorry, don't know about them */
443 break;
444 }
445 break;
446 case 0xF:
447 switch (c->x86_model) {
448#ifdef __i386__
449 case 0x0:
450 case 0x1:
451 case 0x2: /* Netburst */
452 ds_configure(&ds_cfg_netburst);
453 break;
454#endif /* __i386__ */
455 default:
456 /* sorry, don't know about them */
457 break;
458 }
459 break;
460 default:
461 /* sorry, don't know about them */
462 break;
463 }
464}
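Editor's note: the new ds.c hides the differing 32-bit and 64-bit Debug Store layouts behind per-field offset/size descriptors, so one set of accessors serves every supported CPU. The standalone C sketch below illustrates the idea only; it is not part of the patch, the layout numbers are made up, and (unlike the kernel helpers, which cast to a fixed pointer type) it copies exactly the descriptor's size, assuming a little-endian host.

	#include <stdio.h>
	#include <string.h>

	struct access_desc {
		unsigned char offset;	/* byte offset of the field inside the buffer */
		unsigned char size;	/* field size in bytes */
	};

	/* Read d.size bytes from the raw buffer into an unsigned long. */
	static unsigned long get_field(const char *base, struct access_desc d)
	{
		unsigned long v = 0;
		memcpy(&v, base + d.offset, d.size);
		return v;
	}

	static void set_field(char *base, struct access_desc d, unsigned long v)
	{
		memcpy(base + d.offset, &v, d.size);
	}

	int main(void)
	{
		char ds[72] = { 0 };	/* stand-in DS save area, zeroed */
		/* hypothetical descriptor: pointer-sized field at offset 8 */
		struct access_desc bts_index = { 8, sizeof(unsigned long) };

		set_field(ds, bts_index, 0x1000);
		printf("bts_index = %#lx\n", get_field(ds, bts_index));
		return 0;
	}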
diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c
index 18f500d185a2..4e16ef4a2659 100644
--- a/arch/x86/kernel/e820_32.c
+++ b/arch/x86/kernel/e820_32.c
@@ -7,7 +7,6 @@
7#include <linux/kexec.h> 7#include <linux/kexec.h>
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/efi.h>
11#include <linux/pfn.h> 10#include <linux/pfn.h>
12#include <linux/uaccess.h> 11#include <linux/uaccess.h>
13#include <linux/suspend.h> 12#include <linux/suspend.h>
@@ -17,11 +16,6 @@
17#include <asm/e820.h> 16#include <asm/e820.h>
18#include <asm/setup.h> 17#include <asm/setup.h>
19 18
20#ifdef CONFIG_EFI
21int efi_enabled = 0;
22EXPORT_SYMBOL(efi_enabled);
23#endif
24
25struct e820map e820; 19struct e820map e820;
26struct change_member { 20struct change_member {
27 struct e820entry *pbios; /* pointer to original bios entry */ 21 struct e820entry *pbios; /* pointer to original bios entry */
@@ -37,26 +31,6 @@ unsigned long pci_mem_start = 0x10000000;
37EXPORT_SYMBOL(pci_mem_start); 31EXPORT_SYMBOL(pci_mem_start);
38#endif 32#endif
39extern int user_defined_memmap; 33extern int user_defined_memmap;
40struct resource data_resource = {
41 .name = "Kernel data",
42 .start = 0,
43 .end = 0,
44 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
45};
46
47struct resource code_resource = {
48 .name = "Kernel code",
49 .start = 0,
50 .end = 0,
51 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
52};
53
54struct resource bss_resource = {
55 .name = "Kernel bss",
56 .start = 0,
57 .end = 0,
58 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
59};
60 34
61static struct resource system_rom_resource = { 35static struct resource system_rom_resource = {
62 .name = "System ROM", 36 .name = "System ROM",
@@ -111,60 +85,6 @@ static struct resource video_rom_resource = {
111 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM 85 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
112}; 86};
113 87
114static struct resource video_ram_resource = {
115 .name = "Video RAM area",
116 .start = 0xa0000,
117 .end = 0xbffff,
118 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
119};
120
121static struct resource standard_io_resources[] = { {
122 .name = "dma1",
123 .start = 0x0000,
124 .end = 0x001f,
125 .flags = IORESOURCE_BUSY | IORESOURCE_IO
126}, {
127 .name = "pic1",
128 .start = 0x0020,
129 .end = 0x0021,
130 .flags = IORESOURCE_BUSY | IORESOURCE_IO
131}, {
132 .name = "timer0",
133 .start = 0x0040,
134 .end = 0x0043,
135 .flags = IORESOURCE_BUSY | IORESOURCE_IO
136}, {
137 .name = "timer1",
138 .start = 0x0050,
139 .end = 0x0053,
140 .flags = IORESOURCE_BUSY | IORESOURCE_IO
141}, {
142 .name = "keyboard",
143 .start = 0x0060,
144 .end = 0x006f,
145 .flags = IORESOURCE_BUSY | IORESOURCE_IO
146}, {
147 .name = "dma page reg",
148 .start = 0x0080,
149 .end = 0x008f,
150 .flags = IORESOURCE_BUSY | IORESOURCE_IO
151}, {
152 .name = "pic2",
153 .start = 0x00a0,
154 .end = 0x00a1,
155 .flags = IORESOURCE_BUSY | IORESOURCE_IO
156}, {
157 .name = "dma2",
158 .start = 0x00c0,
159 .end = 0x00df,
160 .flags = IORESOURCE_BUSY | IORESOURCE_IO
161}, {
162 .name = "fpu",
163 .start = 0x00f0,
164 .end = 0x00ff,
165 .flags = IORESOURCE_BUSY | IORESOURCE_IO
166} };
167
168#define ROMSIGNATURE 0xaa55 88#define ROMSIGNATURE 0xaa55
169 89
170static int __init romsignature(const unsigned char *rom) 90static int __init romsignature(const unsigned char *rom)
@@ -260,10 +180,9 @@ static void __init probe_roms(void)
260 * Request address space for all standard RAM and ROM resources 180 * Request address space for all standard RAM and ROM resources
261 * and also for regions reported as reserved by the e820. 181 * and also for regions reported as reserved by the e820.
262 */ 182 */
263static void __init 183void __init init_iomem_resources(struct resource *code_resource,
264legacy_init_iomem_resources(struct resource *code_resource, 184 struct resource *data_resource,
265 struct resource *data_resource, 185 struct resource *bss_resource)
266 struct resource *bss_resource)
267{ 186{
268 int i; 187 int i;
269 188
@@ -305,35 +224,6 @@ legacy_init_iomem_resources(struct resource *code_resource,
305 } 224 }
306} 225}
307 226
308/*
309 * Request address space for all standard resources
310 *
311 * This is called just before pcibios_init(), which is also a
312 * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
313 */
314static int __init request_standard_resources(void)
315{
316 int i;
317
318 printk("Setting up standard PCI resources\n");
319 if (efi_enabled)
320 efi_initialize_iomem_resources(&code_resource,
321 &data_resource, &bss_resource);
322 else
323 legacy_init_iomem_resources(&code_resource,
324 &data_resource, &bss_resource);
325
326 /* EFI systems may still have VGA */
327 request_resource(&iomem_resource, &video_ram_resource);
328
329 /* request I/O space for devices used on all i[345]86 PCs */
330 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
331 request_resource(&ioport_resource, &standard_io_resources[i]);
332 return 0;
333}
334
335subsys_initcall(request_standard_resources);
336
337#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION) 227#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
338/** 228/**
339 * e820_mark_nosave_regions - Find the ranges of physical addresses that do not 229 * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
@@ -370,19 +260,17 @@ void __init add_memory_region(unsigned long long start,
370{ 260{
371 int x; 261 int x;
372 262
373 if (!efi_enabled) { 263 x = e820.nr_map;
374 x = e820.nr_map;
375
376 if (x == E820MAX) {
377 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
378 return;
379 }
380 264
381 e820.map[x].addr = start; 265 if (x == E820MAX) {
382 e820.map[x].size = size; 266 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
383 e820.map[x].type = type; 267 return;
384 e820.nr_map++;
385 } 268 }
269
270 e820.map[x].addr = start;
271 e820.map[x].size = size;
272 e820.map[x].type = type;
273 e820.nr_map++;
386} /* add_memory_region */ 274} /* add_memory_region */
387 275
388/* 276/*
@@ -598,29 +486,6 @@ int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
598} 486}
599 487
600/* 488/*
601 * Callback for efi_memory_walk.
602 */
603static int __init
604efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
605{
606 unsigned long *max_pfn = arg, pfn;
607
608 if (start < end) {
609 pfn = PFN_UP(end -1);
610 if (pfn > *max_pfn)
611 *max_pfn = pfn;
612 }
613 return 0;
614}
615
616static int __init
617efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
618{
619 memory_present(0, PFN_UP(start), PFN_DOWN(end));
620 return 0;
621}
622
623/*
624 * Find the highest page frame number we have available 489 * Find the highest page frame number we have available
625 */ 490 */
626void __init find_max_pfn(void) 491void __init find_max_pfn(void)
@@ -628,11 +493,6 @@ void __init find_max_pfn(void)
628 int i; 493 int i;
629 494
630 max_pfn = 0; 495 max_pfn = 0;
631 if (efi_enabled) {
632 efi_memmap_walk(efi_find_max_pfn, &max_pfn);
633 efi_memmap_walk(efi_memory_present_wrapper, NULL);
634 return;
635 }
636 496
637 for (i = 0; i < e820.nr_map; i++) { 497 for (i = 0; i < e820.nr_map; i++) {
638 unsigned long start, end; 498 unsigned long start, end;
@@ -650,34 +510,12 @@ void __init find_max_pfn(void)
650} 510}
651 511
652/* 512/*
653 * Free all available memory for boot time allocation. Used
654 * as a callback function by efi_memory_walk()
655 */
656
657static int __init
658free_available_memory(unsigned long start, unsigned long end, void *arg)
659{
660 /* check max_low_pfn */
661 if (start >= (max_low_pfn << PAGE_SHIFT))
662 return 0;
663 if (end >= (max_low_pfn << PAGE_SHIFT))
664 end = max_low_pfn << PAGE_SHIFT;
665 if (start < end)
666 free_bootmem(start, end - start);
667
668 return 0;
669}
670/*
671 * Register fully available low RAM pages with the bootmem allocator. 513 * Register fully available low RAM pages with the bootmem allocator.
672 */ 514 */
673void __init register_bootmem_low_pages(unsigned long max_low_pfn) 515void __init register_bootmem_low_pages(unsigned long max_low_pfn)
674{ 516{
675 int i; 517 int i;
676 518
677 if (efi_enabled) {
678 efi_memmap_walk(free_available_memory, NULL);
679 return;
680 }
681 for (i = 0; i < e820.nr_map; i++) { 519 for (i = 0; i < e820.nr_map; i++) {
682 unsigned long curr_pfn, last_pfn, size; 520 unsigned long curr_pfn, last_pfn, size;
683 /* 521 /*
@@ -785,56 +623,12 @@ void __init print_memory_map(char *who)
785 } 623 }
786} 624}
787 625
788static __init __always_inline void efi_limit_regions(unsigned long long size)
789{
790 unsigned long long current_addr = 0;
791 efi_memory_desc_t *md, *next_md;
792 void *p, *p1;
793 int i, j;
794
795 j = 0;
796 p1 = memmap.map;
797 for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
798 md = p;
799 next_md = p1;
800 current_addr = md->phys_addr +
801 PFN_PHYS(md->num_pages);
802 if (is_available_memory(md)) {
803 if (md->phys_addr >= size) continue;
804 memcpy(next_md, md, memmap.desc_size);
805 if (current_addr >= size) {
806 next_md->num_pages -=
807 PFN_UP(current_addr-size);
808 }
809 p1 += memmap.desc_size;
810 next_md = p1;
811 j++;
812 } else if ((md->attribute & EFI_MEMORY_RUNTIME) ==
813 EFI_MEMORY_RUNTIME) {
814 /* In order to make runtime services
815 * available we have to include runtime
816 * memory regions in memory map */
817 memcpy(next_md, md, memmap.desc_size);
818 p1 += memmap.desc_size;
819 next_md = p1;
820 j++;
821 }
822 }
823 memmap.nr_map = j;
824 memmap.map_end = memmap.map +
825 (memmap.nr_map * memmap.desc_size);
826}
827
828void __init limit_regions(unsigned long long size) 626void __init limit_regions(unsigned long long size)
829{ 627{
830 unsigned long long current_addr; 628 unsigned long long current_addr;
831 int i; 629 int i;
832 630
833 print_memory_map("limit_regions start"); 631 print_memory_map("limit_regions start");
834 if (efi_enabled) {
835 efi_limit_regions(size);
836 return;
837 }
838 for (i = 0; i < e820.nr_map; i++) { 632 for (i = 0; i < e820.nr_map; i++) {
839 current_addr = e820.map[i].addr + e820.map[i].size; 633 current_addr = e820.map[i].addr + e820.map[i].size;
840 if (current_addr < size) 634 if (current_addr < size)
@@ -955,3 +749,14 @@ static int __init parse_memmap(char *arg)
955 return 0; 749 return 0;
956} 750}
957early_param("memmap", parse_memmap); 751early_param("memmap", parse_memmap);
752void __init update_e820(void)
753{
754 u8 nr_map;
755
756 nr_map = e820.nr_map;
757 if (sanitize_e820_map(e820.map, &nr_map))
758 return;
759 e820.nr_map = nr_map;
760 printk(KERN_INFO "modified physical RAM map:\n");
761 print_memory_map("modified");
762}
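Editor's note: with the EFI special case removed, add_memory_region() now always appends to the fixed-size e820 table and simply refuses once E820MAX entries are used. A trimmed-down userspace sketch of that pattern follows; types and sizes are simplified and it is not the kernel's own code.

	#include <stdio.h>

	#define E820MAX 128	/* same cap as the kernel table */

	struct e820entry { unsigned long long addr, size; unsigned type; };
	static struct e820entry map[E820MAX];
	static int nr_map;

	static void add_memory_region(unsigned long long start,
				      unsigned long long size, unsigned type)
	{
		if (nr_map == E820MAX) {	/* table full: drop the entry */
			fprintf(stderr, "too many entries in the memory map!\n");
			return;
		}
		map[nr_map].addr = start;
		map[nr_map].size = size;
		map[nr_map].type = type;
		nr_map++;
	}

	int main(void)
	{
		add_memory_region(0x0, 0x9fc00, 1);		/* usable RAM */
		add_memory_region(0x100000, 0x3ff00000, 1);	/* usable RAM */
		printf("%d regions recorded\n", nr_map);
		return 0;
	}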
diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
index 04698e0b056c..c617174e8963 100644
--- a/arch/x86/kernel/e820_64.c
+++ b/arch/x86/kernel/e820_64.c
@@ -1,4 +1,4 @@
1/* 1/*
2 * Handle the memory map. 2 * Handle the memory map.
3 * The functions here do the job until bootmem takes over. 3 * The functions here do the job until bootmem takes over.
4 * 4 *
@@ -26,80 +26,87 @@
26#include <asm/proto.h> 26#include <asm/proto.h>
27#include <asm/setup.h> 27#include <asm/setup.h>
28#include <asm/sections.h> 28#include <asm/sections.h>
29#include <asm/kdebug.h>
29 30
30struct e820map e820; 31struct e820map e820;
31 32
32/* 33/*
33 * PFN of last memory page. 34 * PFN of last memory page.
34 */ 35 */
35unsigned long end_pfn; 36unsigned long end_pfn;
36EXPORT_SYMBOL(end_pfn);
37 37
38/* 38/*
39 * end_pfn only includes RAM, while end_pfn_map includes all e820 entries. 39 * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
40 * The direct mapping extends to end_pfn_map, so that we can directly access 40 * The direct mapping extends to end_pfn_map, so that we can directly access
41 * apertures, ACPI and other tables without having to play with fixmaps. 41 * apertures, ACPI and other tables without having to play with fixmaps.
42 */ 42 */
43unsigned long end_pfn_map; 43unsigned long end_pfn_map;
44 44
45/* 45/*
46 * Last pfn which the user wants to use. 46 * Last pfn which the user wants to use.
47 */ 47 */
48static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT; 48static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
49 49
50extern struct resource code_resource, data_resource, bss_resource; 50/*
51 51 * Early reserved memory areas.
52/* Check for some hardcoded bad areas that early boot is not allowed to touch */ 52 */
53static inline int bad_addr(unsigned long *addrp, unsigned long size) 53#define MAX_EARLY_RES 20
54{ 54
55 unsigned long addr = *addrp, last = addr + size; 55struct early_res {
56 56 unsigned long start, end;
57 /* various gunk below that needed for SMP startup */ 57};
58 if (addr < 0x8000) { 58static struct early_res early_res[MAX_EARLY_RES] __initdata = {
59 *addrp = PAGE_ALIGN(0x8000); 59 { 0, PAGE_SIZE }, /* BIOS data page */
60 return 1; 60#ifdef CONFIG_SMP
61 } 61 { SMP_TRAMPOLINE_BASE, SMP_TRAMPOLINE_BASE + 2*PAGE_SIZE },
62
63 /* direct mapping tables of the kernel */
64 if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) {
65 *addrp = PAGE_ALIGN(table_end << PAGE_SHIFT);
66 return 1;
67 }
68
69 /* initrd */
70#ifdef CONFIG_BLK_DEV_INITRD
71 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
72 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
73 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
74 unsigned long ramdisk_end = ramdisk_image+ramdisk_size;
75
76 if (last >= ramdisk_image && addr < ramdisk_end) {
77 *addrp = PAGE_ALIGN(ramdisk_end);
78 return 1;
79 }
80 }
81#endif 62#endif
82 /* kernel code */ 63 {}
83 if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) { 64};
84 *addrp = PAGE_ALIGN(__pa_symbol(&_end)); 65
85 return 1; 66void __init reserve_early(unsigned long start, unsigned long end)
67{
68 int i;
69 struct early_res *r;
70 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
71 r = &early_res[i];
72 if (end > r->start && start < r->end)
73 panic("Overlapping early reservations %lx-%lx to %lx-%lx\n",
74 start, end, r->start, r->end);
86 } 75 }
76 if (i >= MAX_EARLY_RES)
77 panic("Too many early reservations");
78 r = &early_res[i];
79 r->start = start;
80 r->end = end;
81}
87 82
88 if (last >= ebda_addr && addr < ebda_addr + ebda_size) { 83void __init early_res_to_bootmem(void)
89 *addrp = PAGE_ALIGN(ebda_addr + ebda_size); 84{
90 return 1; 85 int i;
86 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
87 struct early_res *r = &early_res[i];
88 reserve_bootmem_generic(r->start, r->end - r->start);
91 } 89 }
90}
92 91
93#ifdef CONFIG_NUMA 92/* Check for already reserved areas */
94 /* NUMA memory to node map */ 93static inline int bad_addr(unsigned long *addrp, unsigned long size)
95 if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) { 94{
96 *addrp = nodemap_addr + nodemap_size; 95 int i;
97 return 1; 96 unsigned long addr = *addrp, last;
97 int changed = 0;
98again:
99 last = addr + size;
100 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
101 struct early_res *r = &early_res[i];
102 if (last >= r->start && addr < r->end) {
103 *addrp = addr = r->end;
104 changed = 1;
105 goto again;
106 }
98 } 107 }
99#endif 108 return changed;
100 /* XXX ramdisk image here? */ 109}
101 return 0;
102}
103 110
104/* 111/*
105 * This function checks if any part of the range <start,end> is mapped 112 * This function checks if any part of the range <start,end> is mapped
@@ -107,16 +114,18 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size)
107 */ 114 */
108int 115int
109e820_any_mapped(unsigned long start, unsigned long end, unsigned type) 116e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
110{ 117{
111 int i; 118 int i;
112 for (i = 0; i < e820.nr_map; i++) { 119
113 struct e820entry *ei = &e820.map[i]; 120 for (i = 0; i < e820.nr_map; i++) {
114 if (type && ei->type != type) 121 struct e820entry *ei = &e820.map[i];
122
123 if (type && ei->type != type)
115 continue; 124 continue;
116 if (ei->addr >= end || ei->addr + ei->size <= start) 125 if (ei->addr >= end || ei->addr + ei->size <= start)
117 continue; 126 continue;
118 return 1; 127 return 1;
119 } 128 }
120 return 0; 129 return 0;
121} 130}
122EXPORT_SYMBOL_GPL(e820_any_mapped); 131EXPORT_SYMBOL_GPL(e820_any_mapped);
@@ -127,11 +136,14 @@ EXPORT_SYMBOL_GPL(e820_any_mapped);
127 * Note: this function only works correct if the e820 table is sorted and 136 * Note: this function only works correct if the e820 table is sorted and
128 * not-overlapping, which is the case 137 * not-overlapping, which is the case
129 */ 138 */
130int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type) 139int __init e820_all_mapped(unsigned long start, unsigned long end,
140 unsigned type)
131{ 141{
132 int i; 142 int i;
143
133 for (i = 0; i < e820.nr_map; i++) { 144 for (i = 0; i < e820.nr_map; i++) {
134 struct e820entry *ei = &e820.map[i]; 145 struct e820entry *ei = &e820.map[i];
146
135 if (type && ei->type != type) 147 if (type && ei->type != type)
136 continue; 148 continue;
137 /* is the region (part) in overlap with the current region ?*/ 149 /* is the region (part) in overlap with the current region ?*/
@@ -143,65 +155,73 @@ int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type
143 */ 155 */
144 if (ei->addr <= start) 156 if (ei->addr <= start)
145 start = ei->addr + ei->size; 157 start = ei->addr + ei->size;
146 /* if start is now at or beyond end, we're done, full coverage */ 158 /*
159 * if start is now at or beyond end, we're done, full
160 * coverage
161 */
147 if (start >= end) 162 if (start >= end)
148 return 1; /* we're done */ 163 return 1;
149 } 164 }
150 return 0; 165 return 0;
151} 166}
152 167
153/* 168/*
154 * Find a free area in a specific range. 169 * Find a free area in a specific range.
155 */ 170 */
156unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size) 171unsigned long __init find_e820_area(unsigned long start, unsigned long end,
157{ 172 unsigned size)
158 int i; 173{
159 for (i = 0; i < e820.nr_map; i++) { 174 int i;
160 struct e820entry *ei = &e820.map[i]; 175
161 unsigned long addr = ei->addr, last; 176 for (i = 0; i < e820.nr_map; i++) {
162 if (ei->type != E820_RAM) 177 struct e820entry *ei = &e820.map[i];
163 continue; 178 unsigned long addr = ei->addr, last;
164 if (addr < start) 179
180 if (ei->type != E820_RAM)
181 continue;
182 if (addr < start)
165 addr = start; 183 addr = start;
166 if (addr > ei->addr + ei->size) 184 if (addr > ei->addr + ei->size)
167 continue; 185 continue;
168 while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size) 186 while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
169 ; 187 ;
170 last = PAGE_ALIGN(addr) + size; 188 last = PAGE_ALIGN(addr) + size;
171 if (last > ei->addr + ei->size) 189 if (last > ei->addr + ei->size)
172 continue; 190 continue;
173 if (last > end) 191 if (last > end)
174 continue; 192 continue;
175 return addr; 193 return addr;
176 } 194 }
177 return -1UL; 195 return -1UL;
178} 196}
179 197
180/* 198/*
181 * Find the highest page frame number we have available 199 * Find the highest page frame number we have available
182 */ 200 */
183unsigned long __init e820_end_of_ram(void) 201unsigned long __init e820_end_of_ram(void)
184{ 202{
185 unsigned long end_pfn = 0; 203 unsigned long end_pfn;
204
186 end_pfn = find_max_pfn_with_active_regions(); 205 end_pfn = find_max_pfn_with_active_regions();
187 206
188 if (end_pfn > end_pfn_map) 207 if (end_pfn > end_pfn_map)
189 end_pfn_map = end_pfn; 208 end_pfn_map = end_pfn;
190 if (end_pfn_map > MAXMEM>>PAGE_SHIFT) 209 if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
191 end_pfn_map = MAXMEM>>PAGE_SHIFT; 210 end_pfn_map = MAXMEM>>PAGE_SHIFT;
192 if (end_pfn > end_user_pfn) 211 if (end_pfn > end_user_pfn)
193 end_pfn = end_user_pfn; 212 end_pfn = end_user_pfn;
194 if (end_pfn > end_pfn_map) 213 if (end_pfn > end_pfn_map)
195 end_pfn = end_pfn_map; 214 end_pfn = end_pfn_map;
196 215
197 printk("end_pfn_map = %lu\n", end_pfn_map); 216 printk(KERN_INFO "end_pfn_map = %lu\n", end_pfn_map);
198 return end_pfn; 217 return end_pfn;
199} 218}
200 219
201/* 220/*
202 * Mark e820 reserved areas as busy for the resource manager. 221 * Mark e820 reserved areas as busy for the resource manager.
203 */ 222 */
204void __init e820_reserve_resources(void) 223void __init e820_reserve_resources(struct resource *code_resource,
224 struct resource *data_resource, struct resource *bss_resource)
205{ 225{
206 int i; 226 int i;
207 for (i = 0; i < e820.nr_map; i++) { 227 for (i = 0; i < e820.nr_map; i++) {
@@ -219,13 +239,13 @@ void __init e820_reserve_resources(void)
219 request_resource(&iomem_resource, res); 239 request_resource(&iomem_resource, res);
220 if (e820.map[i].type == E820_RAM) { 240 if (e820.map[i].type == E820_RAM) {
221 /* 241 /*
222 * We don't know which RAM region contains kernel data, 242 * We don't know which RAM region contains kernel data,
223 * so we try it repeatedly and let the resource manager 243 * so we try it repeatedly and let the resource manager
224 * test it. 244 * test it.
225 */ 245 */
226 request_resource(res, &code_resource); 246 request_resource(res, code_resource);
227 request_resource(res, &data_resource); 247 request_resource(res, data_resource);
228 request_resource(res, &bss_resource); 248 request_resource(res, bss_resource);
229#ifdef CONFIG_KEXEC 249#ifdef CONFIG_KEXEC
230 if (crashk_res.start != crashk_res.end) 250 if (crashk_res.start != crashk_res.end)
231 request_resource(res, &crashk_res); 251 request_resource(res, &crashk_res);
@@ -322,9 +342,9 @@ e820_register_active_regions(int nid, unsigned long start_pfn,
322 add_active_range(nid, ei_startpfn, ei_endpfn); 342 add_active_range(nid, ei_startpfn, ei_endpfn);
323} 343}
324 344
325/* 345/*
326 * Add a memory region to the kernel e820 map. 346 * Add a memory region to the kernel e820 map.
327 */ 347 */
328void __init add_memory_region(unsigned long start, unsigned long size, int type) 348void __init add_memory_region(unsigned long start, unsigned long size, int type)
329{ 349{
330 int x = e820.nr_map; 350 int x = e820.nr_map;
@@ -349,9 +369,7 @@ unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
349{ 369{
350 unsigned long start_pfn = start >> PAGE_SHIFT; 370 unsigned long start_pfn = start >> PAGE_SHIFT;
351 unsigned long end_pfn = end >> PAGE_SHIFT; 371 unsigned long end_pfn = end >> PAGE_SHIFT;
352 unsigned long ei_startpfn; 372 unsigned long ei_startpfn, ei_endpfn, ram = 0;
353 unsigned long ei_endpfn;
354 unsigned long ram = 0;
355 int i; 373 int i;
356 374
357 for (i = 0; i < e820.nr_map; i++) { 375 for (i = 0; i < e820.nr_map; i++) {
@@ -363,28 +381,31 @@ unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
363 return end - start - (ram << PAGE_SHIFT); 381 return end - start - (ram << PAGE_SHIFT);
364} 382}
365 383
366void __init e820_print_map(char *who) 384static void __init e820_print_map(char *who)
367{ 385{
368 int i; 386 int i;
369 387
370 for (i = 0; i < e820.nr_map; i++) { 388 for (i = 0; i < e820.nr_map; i++) {
371 printk(KERN_INFO " %s: %016Lx - %016Lx ", who, 389 printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
372 (unsigned long long) e820.map[i].addr, 390 (unsigned long long) e820.map[i].addr,
373 (unsigned long long) (e820.map[i].addr + e820.map[i].size)); 391 (unsigned long long)
392 (e820.map[i].addr + e820.map[i].size));
374 switch (e820.map[i].type) { 393 switch (e820.map[i].type) {
375 case E820_RAM: printk("(usable)\n"); 394 case E820_RAM:
376 break; 395 printk(KERN_CONT "(usable)\n");
396 break;
377 case E820_RESERVED: 397 case E820_RESERVED:
378 printk("(reserved)\n"); 398 printk(KERN_CONT "(reserved)\n");
379 break; 399 break;
380 case E820_ACPI: 400 case E820_ACPI:
381 printk("(ACPI data)\n"); 401 printk(KERN_CONT "(ACPI data)\n");
382 break; 402 break;
383 case E820_NVS: 403 case E820_NVS:
384 printk("(ACPI NVS)\n"); 404 printk(KERN_CONT "(ACPI NVS)\n");
385 break; 405 break;
386 default: printk("type %u\n", e820.map[i].type); 406 default:
387 break; 407 printk(KERN_CONT "type %u\n", e820.map[i].type);
408 break;
388 } 409 }
389 } 410 }
390} 411}
@@ -392,11 +413,11 @@ void __init e820_print_map(char *who)
392/* 413/*
393 * Sanitize the BIOS e820 map. 414 * Sanitize the BIOS e820 map.
394 * 415 *
395 * Some e820 responses include overlapping entries. The following 416 * Some e820 responses include overlapping entries. The following
396 * replaces the original e820 map with a new one, removing overlaps. 417 * replaces the original e820 map with a new one, removing overlaps.
397 * 418 *
398 */ 419 */
399static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) 420static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
400{ 421{
401 struct change_member { 422 struct change_member {
402 struct e820entry *pbios; /* pointer to original bios entry */ 423 struct e820entry *pbios; /* pointer to original bios entry */
@@ -416,7 +437,8 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
416 int i; 437 int i;
417 438
418 /* 439 /*
419 Visually we're performing the following (1,2,3,4 = memory types)... 440 Visually we're performing the following
441 (1,2,3,4 = memory types)...
420 442
421 Sample memory map (w/overlaps): 443 Sample memory map (w/overlaps):
422 ____22__________________ 444 ____22__________________
@@ -458,22 +480,23 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
458 old_nr = *pnr_map; 480 old_nr = *pnr_map;
459 481
460 /* bail out if we find any unreasonable addresses in bios map */ 482 /* bail out if we find any unreasonable addresses in bios map */
461 for (i=0; i<old_nr; i++) 483 for (i = 0; i < old_nr; i++)
462 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) 484 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
463 return -1; 485 return -1;
464 486
465 /* create pointers for initial change-point information (for sorting) */ 487 /* create pointers for initial change-point information (for sorting) */
466 for (i=0; i < 2*old_nr; i++) 488 for (i = 0; i < 2 * old_nr; i++)
467 change_point[i] = &change_point_list[i]; 489 change_point[i] = &change_point_list[i];
468 490
469 /* record all known change-points (starting and ending addresses), 491 /* record all known change-points (starting and ending addresses),
470 omitting those that are for empty memory regions */ 492 omitting those that are for empty memory regions */
471 chgidx = 0; 493 chgidx = 0;
472 for (i=0; i < old_nr; i++) { 494 for (i = 0; i < old_nr; i++) {
473 if (biosmap[i].size != 0) { 495 if (biosmap[i].size != 0) {
474 change_point[chgidx]->addr = biosmap[i].addr; 496 change_point[chgidx]->addr = biosmap[i].addr;
475 change_point[chgidx++]->pbios = &biosmap[i]; 497 change_point[chgidx++]->pbios = &biosmap[i];
476 change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; 498 change_point[chgidx]->addr = biosmap[i].addr +
499 biosmap[i].size;
477 change_point[chgidx++]->pbios = &biosmap[i]; 500 change_point[chgidx++]->pbios = &biosmap[i];
478 } 501 }
479 } 502 }
@@ -483,75 +506,106 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
483 still_changing = 1; 506 still_changing = 1;
484 while (still_changing) { 507 while (still_changing) {
485 still_changing = 0; 508 still_changing = 0;
486 for (i=1; i < chg_nr; i++) { 509 for (i = 1; i < chg_nr; i++) {
487 /* if <current_addr> > <last_addr>, swap */ 510 unsigned long long curaddr, lastaddr;
488 /* or, if current=<start_addr> & last=<end_addr>, swap */ 511 unsigned long long curpbaddr, lastpbaddr;
489 if ((change_point[i]->addr < change_point[i-1]->addr) || 512
490 ((change_point[i]->addr == change_point[i-1]->addr) && 513 curaddr = change_point[i]->addr;
491 (change_point[i]->addr == change_point[i]->pbios->addr) && 514 lastaddr = change_point[i - 1]->addr;
492 (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) 515 curpbaddr = change_point[i]->pbios->addr;
493 ) 516 lastpbaddr = change_point[i - 1]->pbios->addr;
494 { 517
518 /*
519 * swap entries, when:
520 *
521 * curaddr > lastaddr or
522 * curaddr == lastaddr and curaddr == curpbaddr and
523 * lastaddr != lastpbaddr
524 */
525 if (curaddr < lastaddr ||
526 (curaddr == lastaddr && curaddr == curpbaddr &&
527 lastaddr != lastpbaddr)) {
495 change_tmp = change_point[i]; 528 change_tmp = change_point[i];
496 change_point[i] = change_point[i-1]; 529 change_point[i] = change_point[i-1];
497 change_point[i-1] = change_tmp; 530 change_point[i-1] = change_tmp;
498 still_changing=1; 531 still_changing = 1;
499 } 532 }
500 } 533 }
501 } 534 }
502 535
503 /* create a new bios memory map, removing overlaps */ 536 /* create a new bios memory map, removing overlaps */
504 overlap_entries=0; /* number of entries in the overlap table */ 537 overlap_entries = 0; /* number of entries in the overlap table */
505 new_bios_entry=0; /* index for creating new bios map entries */ 538 new_bios_entry = 0; /* index for creating new bios map entries */
506 last_type = 0; /* start with undefined memory type */ 539 last_type = 0; /* start with undefined memory type */
507 last_addr = 0; /* start with 0 as last starting address */ 540 last_addr = 0; /* start with 0 as last starting address */
541
508 /* loop through change-points, determining affect on the new bios map */ 542 /* loop through change-points, determining affect on the new bios map */
509 for (chgidx=0; chgidx < chg_nr; chgidx++) 543 for (chgidx = 0; chgidx < chg_nr; chgidx++) {
510 {
511 /* keep track of all overlapping bios entries */ 544 /* keep track of all overlapping bios entries */
512 if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) 545 if (change_point[chgidx]->addr ==
513 { 546 change_point[chgidx]->pbios->addr) {
514 /* add map entry to overlap list (> 1 entry implies an overlap) */ 547 /*
515 overlap_list[overlap_entries++]=change_point[chgidx]->pbios; 548 * add map entry to overlap list (> 1 entry
516 } 549 * implies an overlap)
517 else 550 */
518 { 551 overlap_list[overlap_entries++] =
519 /* remove entry from list (order independent, so swap with last) */ 552 change_point[chgidx]->pbios;
520 for (i=0; i<overlap_entries; i++) 553 } else {
521 { 554 /*
522 if (overlap_list[i] == change_point[chgidx]->pbios) 555 * remove entry from list (order independent,
523 overlap_list[i] = overlap_list[overlap_entries-1]; 556 * so swap with last)
557 */
558 for (i = 0; i < overlap_entries; i++) {
559 if (overlap_list[i] ==
560 change_point[chgidx]->pbios)
561 overlap_list[i] =
562 overlap_list[overlap_entries-1];
524 } 563 }
525 overlap_entries--; 564 overlap_entries--;
526 } 565 }
527 /* if there are overlapping entries, decide which "type" to use */ 566 /*
528 /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ 567 * if there are overlapping entries, decide which
568 * "type" to use (larger value takes precedence --
569 * 1=usable, 2,3,4,4+=unusable)
570 */
529 current_type = 0; 571 current_type = 0;
530 for (i=0; i<overlap_entries; i++) 572 for (i = 0; i < overlap_entries; i++)
531 if (overlap_list[i]->type > current_type) 573 if (overlap_list[i]->type > current_type)
532 current_type = overlap_list[i]->type; 574 current_type = overlap_list[i]->type;
533 /* continue building up new bios map based on this information */ 575 /*
576 * continue building up new bios map based on this
577 * information
578 */
534 if (current_type != last_type) { 579 if (current_type != last_type) {
535 if (last_type != 0) { 580 if (last_type != 0) {
536 new_bios[new_bios_entry].size = 581 new_bios[new_bios_entry].size =
537 change_point[chgidx]->addr - last_addr; 582 change_point[chgidx]->addr - last_addr;
538 /* move forward only if the new size was non-zero */ 583 /*
584 * move forward only if the new size
585 * was non-zero
586 */
539 if (new_bios[new_bios_entry].size != 0) 587 if (new_bios[new_bios_entry].size != 0)
588 /*
589 * no more space left for new
590 * bios entries ?
591 */
540 if (++new_bios_entry >= E820MAX) 592 if (++new_bios_entry >= E820MAX)
541 break; /* no more space left for new bios entries */ 593 break;
542 } 594 }
543 if (current_type != 0) { 595 if (current_type != 0) {
544 new_bios[new_bios_entry].addr = change_point[chgidx]->addr; 596 new_bios[new_bios_entry].addr =
597 change_point[chgidx]->addr;
545 new_bios[new_bios_entry].type = current_type; 598 new_bios[new_bios_entry].type = current_type;
546 last_addr=change_point[chgidx]->addr; 599 last_addr = change_point[chgidx]->addr;
547 } 600 }
548 last_type = current_type; 601 last_type = current_type;
549 } 602 }
550 } 603 }
551 new_nr = new_bios_entry; /* retain count for new bios entries */ 604 /* retain count for new bios entries */
605 new_nr = new_bios_entry;
552 606
553 /* copy new bios mapping into original location */ 607 /* copy new bios mapping into original location */
554 memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); 608 memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
555 *pnr_map = new_nr; 609 *pnr_map = new_nr;
556 610
557 return 0; 611 return 0;
@@ -566,7 +620,7 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
566 * will have given us a memory map that we can use to properly 620 * will have given us a memory map that we can use to properly
567 * set up memory. If we aren't, we'll fake a memory map. 621 * set up memory. If we aren't, we'll fake a memory map.
568 */ 622 */
569static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) 623static int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
570{ 624{
571 /* Only one memory region (or negative)? Ignore it */ 625 /* Only one memory region (or negative)? Ignore it */
572 if (nr_map < 2) 626 if (nr_map < 2)
@@ -583,18 +637,20 @@ static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
583 return -1; 637 return -1;
584 638
585 add_memory_region(start, size, type); 639 add_memory_region(start, size, type);
586 } while (biosmap++,--nr_map); 640 } while (biosmap++, --nr_map);
587 return 0; 641 return 0;
588} 642}
589 643
590void early_panic(char *msg) 644static void early_panic(char *msg)
591{ 645{
592 early_printk(msg); 646 early_printk(msg);
593 panic(msg); 647 panic(msg);
594} 648}
595 649
596void __init setup_memory_region(void) 650/* We're not void only for x86 32-bit compat */
651char * __init machine_specific_memory_setup(void)
597{ 652{
653 char *who = "BIOS-e820";
598 /* 654 /*
599 * Try to copy the BIOS-supplied E820-map. 655 * Try to copy the BIOS-supplied E820-map.
600 * 656 *
@@ -605,7 +661,10 @@ void __init setup_memory_region(void)
605 if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) 661 if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
606 early_panic("Cannot find a valid memory map"); 662 early_panic("Cannot find a valid memory map");
607 printk(KERN_INFO "BIOS-provided physical RAM map:\n"); 663 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
608 e820_print_map("BIOS-e820"); 664 e820_print_map(who);
665
666 /* In case someone cares... */
667 return who;
609} 668}
610 669
611static int __init parse_memopt(char *p) 670static int __init parse_memopt(char *p)
@@ -613,9 +672,9 @@ static int __init parse_memopt(char *p)
613 if (!p) 672 if (!p)
614 return -EINVAL; 673 return -EINVAL;
615 end_user_pfn = memparse(p, &p); 674 end_user_pfn = memparse(p, &p);
616 end_user_pfn >>= PAGE_SHIFT; 675 end_user_pfn >>= PAGE_SHIFT;
617 return 0; 676 return 0;
618} 677}
619early_param("mem", parse_memopt); 678early_param("mem", parse_memopt);
620 679
621static int userdef __initdata; 680static int userdef __initdata;
@@ -627,9 +686,9 @@ static int __init parse_memmap_opt(char *p)
627 686
628 if (!strcmp(p, "exactmap")) { 687 if (!strcmp(p, "exactmap")) {
629#ifdef CONFIG_CRASH_DUMP 688#ifdef CONFIG_CRASH_DUMP
630 /* If we are doing a crash dump, we 689 /*
631 * still need to know the real mem 690 * If we are doing a crash dump, we still need to know
632 * size before original memory map is 691 * the real mem size before original memory map is
633 * reset. 692 * reset.
634 */ 693 */
635 e820_register_active_regions(0, 0, -1UL); 694 e820_register_active_regions(0, 0, -1UL);
@@ -646,6 +705,8 @@ static int __init parse_memmap_opt(char *p)
646 mem_size = memparse(p, &p); 705 mem_size = memparse(p, &p);
647 if (p == oldp) 706 if (p == oldp)
648 return -EINVAL; 707 return -EINVAL;
708
709 userdef = 1;
649 if (*p == '@') { 710 if (*p == '@') {
650 start_at = memparse(p+1, &p); 711 start_at = memparse(p+1, &p);
651 add_memory_region(start_at, mem_size, E820_RAM); 712 add_memory_region(start_at, mem_size, E820_RAM);
@@ -665,11 +726,29 @@ early_param("memmap", parse_memmap_opt);
665void __init finish_e820_parsing(void) 726void __init finish_e820_parsing(void)
666{ 727{
667 if (userdef) { 728 if (userdef) {
729 char nr = e820.nr_map;
730
731 if (sanitize_e820_map(e820.map, &nr) < 0)
732 early_panic("Invalid user supplied memory map");
733 e820.nr_map = nr;
734
668 printk(KERN_INFO "user-defined physical RAM map:\n"); 735 printk(KERN_INFO "user-defined physical RAM map:\n");
669 e820_print_map("user"); 736 e820_print_map("user");
670 } 737 }
671} 738}
672 739
740void __init update_e820(void)
741{
742 u8 nr_map;
743
744 nr_map = e820.nr_map;
745 if (sanitize_e820_map(e820.map, &nr_map))
746 return;
747 e820.nr_map = nr_map;
748 printk(KERN_INFO "modified physical RAM map:\n");
749 e820_print_map("modified");
750}
751
673unsigned long pci_mem_start = 0xaeedbabe; 752unsigned long pci_mem_start = 0xaeedbabe;
674EXPORT_SYMBOL(pci_mem_start); 753EXPORT_SYMBOL(pci_mem_start);
675 754
@@ -713,8 +792,10 @@ __init void e820_setup_gap(void)
713 792
714 if (!found) { 793 if (!found) {
715 gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024; 794 gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
716 printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n" 795 printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
717 KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n"); 796 "address range\n"
797 KERN_ERR "PCI: Unassigned devices with 32bit resource "
798 "registers may break!\n");
718 } 799 }
719 800
720 /* 801 /*
@@ -727,8 +808,9 @@ __init void e820_setup_gap(void)
727 /* Fun with two's complement */ 808 /* Fun with two's complement */
728 pci_mem_start = (gapstart + round) & -round; 809 pci_mem_start = (gapstart + round) & -round;
729 810
730 printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", 811 printk(KERN_INFO
731 pci_mem_start, gapstart, gapsize); 812 "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
813 pci_mem_start, gapstart, gapsize);
732} 814}
733 815
734int __init arch_get_ram_range(int slot, u64 *addr, u64 *size) 816int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
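Editor's note: the interesting change in e820_64.c is the early_res table, which replaces the hardcoded bad_addr() checks with a small list of reserved ranges; a candidate address is bumped past any reservation it overlaps and the scan restarts until the address stops moving. The sketch below is an illustrative userspace rendering of that mechanism, not the kernel code, and the reserved ranges are invented.

	#include <stdio.h>

	#define MAX_EARLY_RES 20

	struct early_res { unsigned long start, end; };	/* end is exclusive */

	static struct early_res early_res[MAX_EARLY_RES] = {
		{ 0x0000, 0x1000 },	/* pretend BIOS data page */
		{ 0x8000, 0xa000 },	/* pretend SMP trampoline */
	};

	/* If [*addrp, *addrp + size) hits a reservation, move past it and retry. */
	static int bad_addr(unsigned long *addrp, unsigned long size)
	{
		unsigned long addr = *addrp, last;
		int changed = 0, i;
	again:
		last = addr + size;
		for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
			struct early_res *r = &early_res[i];
			if (last > r->start && addr < r->end) {
				*addrp = addr = r->end;
				changed = 1;
				goto again;
			}
		}
		return changed;
	}

	int main(void)
	{
		unsigned long addr = 0x0;

		bad_addr(&addr, 0x2000);	/* ask for 8 KiB starting at 0 */
		printf("first address clear of reservations: %#lx\n", addr);
		return 0;
	}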
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 88bb83ec895f..9f51e1ea9e82 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -21,7 +21,33 @@
21#include <asm/gart.h> 21#include <asm/gart.h>
22#endif 22#endif
23 23
24static void __init via_bugs(void) 24static void __init fix_hypertransport_config(int num, int slot, int func)
25{
26 u32 htcfg;
27 /*
28 * we found a hypertransport bus
29 * make sure that we are broadcasting
30 * interrupts to all cpus on the ht bus
31 * if we're using extended apic ids
32 */
33 htcfg = read_pci_config(num, slot, func, 0x68);
34 if (htcfg & (1 << 18)) {
35 printk(KERN_INFO "Detected use of extended apic ids "
36 "on hypertransport bus\n");
37 if ((htcfg & (1 << 17)) == 0) {
38 printk(KERN_INFO "Enabling hypertransport extended "
39 "apic interrupt broadcast\n");
40 printk(KERN_INFO "Note this is a bios bug, "
41 "please contact your hw vendor\n");
42 htcfg |= (1 << 17);
43 write_pci_config(num, slot, func, 0x68, htcfg);
44 }
45 }
46
47
48}
49
50static void __init via_bugs(int num, int slot, int func)
25{ 51{
26#ifdef CONFIG_GART_IOMMU 52#ifdef CONFIG_GART_IOMMU
27 if ((end_pfn > MAX_DMA32_PFN || force_iommu) && 53 if ((end_pfn > MAX_DMA32_PFN || force_iommu) &&
@@ -44,7 +70,7 @@ static int __init nvidia_hpet_check(struct acpi_table_header *header)
44#endif /* CONFIG_X86_IO_APIC */ 70#endif /* CONFIG_X86_IO_APIC */
45#endif /* CONFIG_ACPI */ 71#endif /* CONFIG_ACPI */
46 72
47static void __init nvidia_bugs(void) 73static void __init nvidia_bugs(int num, int slot, int func)
48{ 74{
49#ifdef CONFIG_ACPI 75#ifdef CONFIG_ACPI
50#ifdef CONFIG_X86_IO_APIC 76#ifdef CONFIG_X86_IO_APIC
@@ -72,7 +98,7 @@ static void __init nvidia_bugs(void)
72 98
73} 99}
74 100
75static void __init ati_bugs(void) 101static void __init ati_bugs(int num, int slot, int func)
76{ 102{
77#ifdef CONFIG_X86_IO_APIC 103#ifdef CONFIG_X86_IO_APIC
78 if (timer_over_8254 == 1) { 104 if (timer_over_8254 == 1) {
@@ -83,18 +109,67 @@ static void __init ati_bugs(void)
83#endif 109#endif
84} 110}
85 111
112#define QFLAG_APPLY_ONCE 0x1
113#define QFLAG_APPLIED 0x2
114#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
86struct chipset { 115struct chipset {
87 u16 vendor; 116 u32 vendor;
88 void (*f)(void); 117 u32 device;
118 u32 class;
119 u32 class_mask;
120 u32 flags;
121 void (*f)(int num, int slot, int func);
89}; 122};
90 123
91static struct chipset early_qrk[] __initdata = { 124static struct chipset early_qrk[] __initdata = {
92 { PCI_VENDOR_ID_NVIDIA, nvidia_bugs }, 125 { PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID,
93 { PCI_VENDOR_ID_VIA, via_bugs }, 126 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs },
94 { PCI_VENDOR_ID_ATI, ati_bugs }, 127 { PCI_VENDOR_ID_VIA, PCI_ANY_ID,
128 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs },
129 { PCI_VENDOR_ID_ATI, PCI_ANY_ID,
130 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, ati_bugs },
131 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
132 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config },
95 {} 133 {}
96}; 134};
97 135
136static void __init check_dev_quirk(int num, int slot, int func)
137{
138 u16 class;
139 u16 vendor;
140 u16 device;
141 u8 type;
142 int i;
143
144 class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE);
145
146 if (class == 0xffff)
147 return;
148
149 vendor = read_pci_config_16(num, slot, func, PCI_VENDOR_ID);
150
151 device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID);
152
153 for (i = 0; early_qrk[i].f != NULL; i++) {
154 if (((early_qrk[i].vendor == PCI_ANY_ID) ||
155 (early_qrk[i].vendor == vendor)) &&
156 ((early_qrk[i].device == PCI_ANY_ID) ||
157 (early_qrk[i].device == device)) &&
158 (!((early_qrk[i].class ^ class) &
159 early_qrk[i].class_mask))) {
160 if ((early_qrk[i].flags &
161 QFLAG_DONE) != QFLAG_DONE)
162 early_qrk[i].f(num, slot, func);
163 early_qrk[i].flags |= QFLAG_APPLIED;
164 }
165 }
166
167 type = read_pci_config_byte(num, slot, func,
168 PCI_HEADER_TYPE);
169 if (!(type & 0x80))
170 return;
171}
172
98void __init early_quirks(void) 173void __init early_quirks(void)
99{ 174{
100 int num, slot, func; 175 int num, slot, func;
@@ -103,36 +178,8 @@ void __init early_quirks(void)
103 return; 178 return;
104 179
105 /* Poor man's PCI discovery */ 180 /* Poor man's PCI discovery */
106 for (num = 0; num < 32; num++) { 181 for (num = 0; num < 32; num++)
107 for (slot = 0; slot < 32; slot++) { 182 for (slot = 0; slot < 32; slot++)
108 for (func = 0; func < 8; func++) { 183 for (func = 0; func < 8; func++)
109 u32 class; 184 check_dev_quirk(num, slot, func);
110 u32 vendor;
111 u8 type;
112 int i;
113 class = read_pci_config(num,slot,func,
114 PCI_CLASS_REVISION);
115 if (class == 0xffffffff)
116 break;
117
118 if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
119 continue;
120
121 vendor = read_pci_config(num, slot, func,
122 PCI_VENDOR_ID);
123 vendor &= 0xffff;
124
125 for (i = 0; early_qrk[i].f; i++)
126 if (early_qrk[i].vendor == vendor) {
127 early_qrk[i].f();
128 return;
129 }
130
131 type = read_pci_config_byte(num, slot, func,
132 PCI_HEADER_TYPE);
133 if (!(type & 0x80))
134 break;
135 }
136 }
137 }
138} 185}
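
A note on the early-quirks rework above: the table now matches on vendor, device and class (through class_mask), and QFLAG_APPLY_ONCE together with QFLAG_APPLIED keeps a fixup from firing more than once when several bus/slot/function addresses match. The following stand-alone sketch mirrors that matching logic; it is purely illustrative, with invented IDs and a toy fixup in place of the kernel's PCI constants.

/* Stand-alone model of the early_qrk matching rules; IDs are invented. */
#include <stdio.h>
#include <stdint.h>

#define ANY_ID     0xffffffffu
#define APPLY_ONCE 0x1u
#define APPLIED    0x2u

struct quirk {
	uint32_t vendor, device, class, class_mask, flags;
	void (*f)(int bus, int slot, int func);
};

static void sample_fixup(int bus, int slot, int func)
{
	printf("fixup at %02x:%02x.%x\n", bus, slot, func);
}

static struct quirk table[] = {
	{ 0x10de /* sample vendor */, ANY_ID, 0x0604 /* PCI-PCI bridge */,
	  ANY_ID, APPLY_ONCE, sample_fixup },
	{ 0 }
};

/* Same structure as check_dev_quirk(): match vendor, device and masked
 * class, then skip hooks already applied when APPLY_ONCE is set. */
static void check(uint32_t vendor, uint32_t device, uint32_t class,
		  int bus, int slot, int func)
{
	int i;

	for (i = 0; table[i].f; i++) {
		if ((table[i].vendor == ANY_ID || table[i].vendor == vendor) &&
		    (table[i].device == ANY_ID || table[i].device == device) &&
		    !((table[i].class ^ class) & table[i].class_mask)) {
			if ((table[i].flags & (APPLY_ONCE | APPLIED)) !=
			    (APPLY_ONCE | APPLIED))
				table[i].f(bus, slot, func);
			table[i].flags |= APPLIED;
		}
	}
}

int main(void)
{
	check(0x10de, 0x01b3, 0x0604, 0, 1, 0);	/* fixup runs            */
	check(0x10de, 0x01b3, 0x0604, 0, 2, 0);	/* skipped: already done */
	return 0;
}

Note that the AMD K8 northbridge entry in the real table leaves flags at 0, so fix_hypertransport_config() is run for every matching device rather than only once.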
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
new file mode 100644
index 000000000000..1411324a625c
--- /dev/null
+++ b/arch/x86/kernel/efi.c
@@ -0,0 +1,512 @@
1/*
2 * Common EFI (Extensible Firmware Interface) support functions
3 * Based on Extensible Firmware Interface Specification version 1.0
4 *
5 * Copyright (C) 1999 VA Linux Systems
6 * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
7 * Copyright (C) 1999-2002 Hewlett-Packard Co.
8 * David Mosberger-Tang <davidm@hpl.hp.com>
9 * Stephane Eranian <eranian@hpl.hp.com>
10 * Copyright (C) 2005-2008 Intel Co.
11 * Fenghua Yu <fenghua.yu@intel.com>
12 * Bibo Mao <bibo.mao@intel.com>
13 * Chandramouli Narayanan <mouli@linux.intel.com>
14 * Huang Ying <ying.huang@intel.com>
15 *
16 * Copied from efi_32.c to eliminate the duplicated code between EFI
17 * 32/64 support code. --ying 2007-10-26
18 *
19 * Not all EFI Runtime Services are implemented yet, as EFI only
20 * supports physical mode addressing on SoftSDV. This is to be fixed
21 * in a future version. --drummond 1999-07-20
22 *
23 * Implemented EFI runtime services and virtual mode calls. --davidm
24 *
25 * Goutham Rao: <goutham.rao@intel.com>
26 * Skip non-WB memory and ignore empty memory ranges.
27 */
28
29#include <linux/kernel.h>
30#include <linux/init.h>
31#include <linux/efi.h>
32#include <linux/bootmem.h>
33#include <linux/spinlock.h>
34#include <linux/uaccess.h>
35#include <linux/time.h>
36#include <linux/io.h>
37#include <linux/reboot.h>
38#include <linux/bcd.h>
39
40#include <asm/setup.h>
41#include <asm/efi.h>
42#include <asm/time.h>
43#include <asm/cacheflush.h>
44#include <asm/tlbflush.h>
45
46#define EFI_DEBUG 1
47#define PFX "EFI: "
48
49int efi_enabled;
50EXPORT_SYMBOL(efi_enabled);
51
52struct efi efi;
53EXPORT_SYMBOL(efi);
54
55struct efi_memory_map memmap;
56
57struct efi efi_phys __initdata;
58static efi_system_table_t efi_systab __initdata;
59
60static int __init setup_noefi(char *arg)
61{
62 efi_enabled = 0;
63 return 0;
64}
65early_param("noefi", setup_noefi);
66
67static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
68{
69 return efi_call_virt2(get_time, tm, tc);
70}
71
72static efi_status_t virt_efi_set_time(efi_time_t *tm)
73{
74 return efi_call_virt1(set_time, tm);
75}
76
77static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled,
78 efi_bool_t *pending,
79 efi_time_t *tm)
80{
81 return efi_call_virt3(get_wakeup_time,
82 enabled, pending, tm);
83}
84
85static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm)
86{
87 return efi_call_virt2(set_wakeup_time,
88 enabled, tm);
89}
90
91static efi_status_t virt_efi_get_variable(efi_char16_t *name,
92 efi_guid_t *vendor,
93 u32 *attr,
94 unsigned long *data_size,
95 void *data)
96{
97 return efi_call_virt5(get_variable,
98 name, vendor, attr,
99 data_size, data);
100}
101
102static efi_status_t virt_efi_get_next_variable(unsigned long *name_size,
103 efi_char16_t *name,
104 efi_guid_t *vendor)
105{
106 return efi_call_virt3(get_next_variable,
107 name_size, name, vendor);
108}
109
110static efi_status_t virt_efi_set_variable(efi_char16_t *name,
111 efi_guid_t *vendor,
112 unsigned long attr,
113 unsigned long data_size,
114 void *data)
115{
116 return efi_call_virt5(set_variable,
117 name, vendor, attr,
118 data_size, data);
119}
120
121static efi_status_t virt_efi_get_next_high_mono_count(u32 *count)
122{
123 return efi_call_virt1(get_next_high_mono_count, count);
124}
125
126static void virt_efi_reset_system(int reset_type,
127 efi_status_t status,
128 unsigned long data_size,
129 efi_char16_t *data)
130{
131 efi_call_virt4(reset_system, reset_type, status,
132 data_size, data);
133}
134
135static efi_status_t virt_efi_set_virtual_address_map(
136 unsigned long memory_map_size,
137 unsigned long descriptor_size,
138 u32 descriptor_version,
139 efi_memory_desc_t *virtual_map)
140{
141 return efi_call_virt4(set_virtual_address_map,
142 memory_map_size, descriptor_size,
143 descriptor_version, virtual_map);
144}
145
146static efi_status_t __init phys_efi_set_virtual_address_map(
147 unsigned long memory_map_size,
148 unsigned long descriptor_size,
149 u32 descriptor_version,
150 efi_memory_desc_t *virtual_map)
151{
152 efi_status_t status;
153
154 efi_call_phys_prelog();
155 status = efi_call_phys4(efi_phys.set_virtual_address_map,
156 memory_map_size, descriptor_size,
157 descriptor_version, virtual_map);
158 efi_call_phys_epilog();
159 return status;
160}
161
162static efi_status_t __init phys_efi_get_time(efi_time_t *tm,
163 efi_time_cap_t *tc)
164{
165 efi_status_t status;
166
167 efi_call_phys_prelog();
168 status = efi_call_phys2(efi_phys.get_time, tm, tc);
169 efi_call_phys_epilog();
170 return status;
171}
172
173int efi_set_rtc_mmss(unsigned long nowtime)
174{
175 int real_seconds, real_minutes;
176 efi_status_t status;
177 efi_time_t eft;
178 efi_time_cap_t cap;
179
180 status = efi.get_time(&eft, &cap);
181 if (status != EFI_SUCCESS) {
182 printk(KERN_ERR "Oops: efitime: can't read time!\n");
183 return -1;
184 }
185
186 real_seconds = nowtime % 60;
187 real_minutes = nowtime / 60;
188 if (((abs(real_minutes - eft.minute) + 15)/30) & 1)
189 real_minutes += 30;
190 real_minutes %= 60;
191 eft.minute = real_minutes;
192 eft.second = real_seconds;
193
194 status = efi.set_time(&eft);
195 if (status != EFI_SUCCESS) {
196 printk(KERN_ERR "Oops: efitime: can't write time!\n");
197 return -1;
198 }
199 return 0;
200}
201
202unsigned long efi_get_time(void)
203{
204 efi_status_t status;
205 efi_time_t eft;
206 efi_time_cap_t cap;
207
208 status = efi.get_time(&eft, &cap);
209 if (status != EFI_SUCCESS)
210 printk(KERN_ERR "Oops: efitime: can't read time!\n");
211
212 return mktime(eft.year, eft.month, eft.day, eft.hour,
213 eft.minute, eft.second);
214}
215
216#if EFI_DEBUG
217static void __init print_efi_memmap(void)
218{
219 efi_memory_desc_t *md;
220 void *p;
221 int i;
222
223 for (p = memmap.map, i = 0;
224 p < memmap.map_end;
225 p += memmap.desc_size, i++) {
226 md = p;
227 printk(KERN_INFO PFX "mem%02u: type=%u, attr=0x%llx, "
228 "range=[0x%016llx-0x%016llx) (%lluMB)\n",
229 i, md->type, md->attribute, md->phys_addr,
230 md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
231 (md->num_pages >> (20 - EFI_PAGE_SHIFT)));
232 }
233}
234#endif /* EFI_DEBUG */
235
236void __init efi_init(void)
237{
238 efi_config_table_t *config_tables;
239 efi_runtime_services_t *runtime;
240 efi_char16_t *c16;
241 char vendor[100] = "unknown";
242 int i = 0;
243 void *tmp;
244
245#ifdef CONFIG_X86_32
246 efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab;
247 memmap.phys_map = (void *)boot_params.efi_info.efi_memmap;
248#else
249 efi_phys.systab = (efi_system_table_t *)
250 (boot_params.efi_info.efi_systab |
251 ((__u64)boot_params.efi_info.efi_systab_hi<<32));
252 memmap.phys_map = (void *)
253 (boot_params.efi_info.efi_memmap |
254 ((__u64)boot_params.efi_info.efi_memmap_hi<<32));
255#endif
256 memmap.nr_map = boot_params.efi_info.efi_memmap_size /
257 boot_params.efi_info.efi_memdesc_size;
258 memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
259 memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
260
261 efi.systab = early_ioremap((unsigned long)efi_phys.systab,
262 sizeof(efi_system_table_t));
263 if (efi.systab == NULL)
264 printk(KERN_ERR "Couldn't map the EFI system table!\n");
265 memcpy(&efi_systab, efi.systab, sizeof(efi_system_table_t));
266 early_iounmap(efi.systab, sizeof(efi_system_table_t));
267 efi.systab = &efi_systab;
268
269 /*
270 * Verify the EFI Table
271 */
272 if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
273 printk(KERN_ERR "EFI system table signature incorrect!\n");
274 if ((efi.systab->hdr.revision >> 16) == 0)
275 printk(KERN_ERR "Warning: EFI system table version "
276 "%d.%02d, expected 1.00 or greater!\n",
277 efi.systab->hdr.revision >> 16,
278 efi.systab->hdr.revision & 0xffff);
279
280 /*
281 * Show what we know for posterity
282 */
283 c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2);
284 if (c16) {
285	for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i)
286 vendor[i] = *c16++;
287 vendor[i] = '\0';
288 } else
289 printk(KERN_ERR PFX "Could not map the firmware vendor!\n");
290 early_iounmap(tmp, 2);
291
292 printk(KERN_INFO "EFI v%u.%.02u by %s \n",
293 efi.systab->hdr.revision >> 16,
294 efi.systab->hdr.revision & 0xffff, vendor);
295
296 /*
297 * Let's see what config tables the firmware passed to us.
298 */
299 config_tables = early_ioremap(
300 efi.systab->tables,
301 efi.systab->nr_tables * sizeof(efi_config_table_t));
302 if (config_tables == NULL)
303 printk(KERN_ERR "Could not map EFI Configuration Table!\n");
304
305 printk(KERN_INFO);
306 for (i = 0; i < efi.systab->nr_tables; i++) {
307 if (!efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID)) {
308 efi.mps = config_tables[i].table;
309 printk(" MPS=0x%lx ", config_tables[i].table);
310 } else if (!efi_guidcmp(config_tables[i].guid,
311 ACPI_20_TABLE_GUID)) {
312 efi.acpi20 = config_tables[i].table;
313 printk(" ACPI 2.0=0x%lx ", config_tables[i].table);
314 } else if (!efi_guidcmp(config_tables[i].guid,
315 ACPI_TABLE_GUID)) {
316 efi.acpi = config_tables[i].table;
317 printk(" ACPI=0x%lx ", config_tables[i].table);
318 } else if (!efi_guidcmp(config_tables[i].guid,
319 SMBIOS_TABLE_GUID)) {
320 efi.smbios = config_tables[i].table;
321 printk(" SMBIOS=0x%lx ", config_tables[i].table);
322 } else if (!efi_guidcmp(config_tables[i].guid,
323 HCDP_TABLE_GUID)) {
324 efi.hcdp = config_tables[i].table;
325 printk(" HCDP=0x%lx ", config_tables[i].table);
326 } else if (!efi_guidcmp(config_tables[i].guid,
327 UGA_IO_PROTOCOL_GUID)) {
328 efi.uga = config_tables[i].table;
329 printk(" UGA=0x%lx ", config_tables[i].table);
330 }
331 }
332 printk("\n");
333 early_iounmap(config_tables,
334 efi.systab->nr_tables * sizeof(efi_config_table_t));
335
336 /*
337 * Check out the runtime services table. We need to map
338 * the runtime services table so that we can grab the physical
339 * address of several of the EFI runtime functions, needed to
340 * set the firmware into virtual mode.
341 */
342 runtime = early_ioremap((unsigned long)efi.systab->runtime,
343 sizeof(efi_runtime_services_t));
344 if (runtime != NULL) {
345 /*
346 * We will only need *early* access to the following
347 * two EFI runtime services before set_virtual_address_map
348 * is invoked.
349 */
350 efi_phys.get_time = (efi_get_time_t *)runtime->get_time;
351 efi_phys.set_virtual_address_map =
352 (efi_set_virtual_address_map_t *)
353 runtime->set_virtual_address_map;
354 /*
355		 * Make efi_get_time() callable before entering
356 * virtual mode.
357 */
358 efi.get_time = phys_efi_get_time;
359 } else
360 printk(KERN_ERR "Could not map the EFI runtime service "
361 "table!\n");
362 early_iounmap(runtime, sizeof(efi_runtime_services_t));
363
364 /* Map the EFI memory map */
365 memmap.map = early_ioremap((unsigned long)memmap.phys_map,
366 memmap.nr_map * memmap.desc_size);
367 if (memmap.map == NULL)
368 printk(KERN_ERR "Could not map the EFI memory map!\n");
369 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
370 if (memmap.desc_size != sizeof(efi_memory_desc_t))
371		printk(KERN_WARNING "Kernel-defined memdesc "
372 "doesn't match the one from EFI!\n");
373
374 /* Setup for EFI runtime service */
375 reboot_type = BOOT_EFI;
376
377#if EFI_DEBUG
378 print_efi_memmap();
379#endif
380}
381
382#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
383static void __init runtime_code_page_mkexec(void)
384{
385 efi_memory_desc_t *md;
386 unsigned long end;
387 void *p;
388
389 if (!(__supported_pte_mask & _PAGE_NX))
390 return;
391
392 /* Make EFI runtime service code area executable */
393 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
394 md = p;
395 end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
396 if (md->type == EFI_RUNTIME_SERVICES_CODE &&
397 (end >> PAGE_SHIFT) <= max_pfn_mapped) {
398 set_memory_x(md->virt_addr, md->num_pages);
399 set_memory_uc(md->virt_addr, md->num_pages);
400 }
401 }
402 __flush_tlb_all();
403}
404#else
405static inline void __init runtime_code_page_mkexec(void) { }
406#endif
407
408/*
409 * This function will switch the EFI runtime services to virtual mode.
410 * Essentially, look through the EFI memmap and map every region that
411 * has the runtime attribute bit set in its memory descriptor and update
412 * that memory descriptor with the virtual address obtained from ioremap().
413 * This enables the runtime services to be called without having to
414 * thunk back into physical mode for every invocation.
415 */
416void __init efi_enter_virtual_mode(void)
417{
418 efi_memory_desc_t *md;
419 efi_status_t status;
420 unsigned long end;
421 void *p;
422
423 efi.systab = NULL;
424 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
425 md = p;
426 if (!(md->attribute & EFI_MEMORY_RUNTIME))
427 continue;
428 end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
429 if ((md->attribute & EFI_MEMORY_WB) &&
430 ((end >> PAGE_SHIFT) <= max_pfn_mapped))
431 md->virt_addr = (unsigned long)__va(md->phys_addr);
432 else
433 md->virt_addr = (unsigned long)
434 efi_ioremap(md->phys_addr,
435 md->num_pages << EFI_PAGE_SHIFT);
436 if (!md->virt_addr)
437 printk(KERN_ERR PFX "ioremap of 0x%llX failed!\n",
438 (unsigned long long)md->phys_addr);
439 if ((md->phys_addr <= (unsigned long)efi_phys.systab) &&
440 ((unsigned long)efi_phys.systab < end))
441 efi.systab = (efi_system_table_t *)(unsigned long)
442 (md->virt_addr - md->phys_addr +
443 (unsigned long)efi_phys.systab);
444 }
445
446 BUG_ON(!efi.systab);
447
448 status = phys_efi_set_virtual_address_map(
449 memmap.desc_size * memmap.nr_map,
450 memmap.desc_size,
451 memmap.desc_version,
452 memmap.phys_map);
453
454 if (status != EFI_SUCCESS) {
455 printk(KERN_ALERT "Unable to switch EFI into virtual mode "
456 "(status=%lx)!\n", status);
457 panic("EFI call to SetVirtualAddressMap() failed!");
458 }
459
460 /*
461 * Now that EFI is in virtual mode, update the function
462 * pointers in the runtime service table to the new virtual addresses.
463 *
464 * Call EFI services through wrapper functions.
465 */
466 efi.get_time = virt_efi_get_time;
467 efi.set_time = virt_efi_set_time;
468 efi.get_wakeup_time = virt_efi_get_wakeup_time;
469 efi.set_wakeup_time = virt_efi_set_wakeup_time;
470 efi.get_variable = virt_efi_get_variable;
471 efi.get_next_variable = virt_efi_get_next_variable;
472 efi.set_variable = virt_efi_set_variable;
473 efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
474 efi.reset_system = virt_efi_reset_system;
475 efi.set_virtual_address_map = virt_efi_set_virtual_address_map;
476 runtime_code_page_mkexec();
477 early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size);
478 memmap.map = NULL;
479}
480
481/*
482 * Convenience functions to obtain memory types and attributes
483 */
484u32 efi_mem_type(unsigned long phys_addr)
485{
486 efi_memory_desc_t *md;
487 void *p;
488
489 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
490 md = p;
491 if ((md->phys_addr <= phys_addr) &&
492 (phys_addr < (md->phys_addr +
493 (md->num_pages << EFI_PAGE_SHIFT))))
494 return md->type;
495 }
496 return 0;
497}
498
499u64 efi_mem_attributes(unsigned long phys_addr)
500{
501 efi_memory_desc_t *md;
502 void *p;
503
504 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
505 md = p;
506 if ((md->phys_addr <= phys_addr) &&
507 (phys_addr < (md->phys_addr +
508 (md->num_pages << EFI_PAGE_SHIFT))))
509 return md->attribute;
510 }
511 return 0;
512}
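
The lookup helpers at the end of efi.c (efi_mem_type() and efi_mem_attributes()), like the map-printing and virtual-mode loops above them, advance through the memory map by memmap.desc_size rather than indexing an array of efi_memory_desc_t, because the firmware's descriptor stride can be larger than the structure the kernel was built against. A minimal user-space sketch of that iteration idiom follows; the field names are simplified stand-ins and the 48-byte stride is invented for the example.

/* Walk an EFI-style memory map whose descriptor stride may exceed
 * sizeof(struct desc). */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct desc {
	uint32_t type;
	uint64_t phys_addr;
	uint64_t num_pages;
	uint64_t attribute;
};

#define EFI_PAGE_SHIFT_SZ 12

static uint32_t mem_type(const void *map, const void *map_end,
			 unsigned long desc_size, uint64_t phys)
{
	const char *p;

	for (p = map; p < (const char *)map_end; p += desc_size) {
		const struct desc *md = (const void *)p;
		uint64_t end = md->phys_addr +
			       (md->num_pages << EFI_PAGE_SHIFT_SZ);

		if (md->phys_addr <= phys && phys < end)
			return md->type;
	}
	return 0;	/* no descriptor covers this address */
}

int main(void)
{
	/* Pretend the firmware pads each descriptor to 48 bytes. */
	unsigned char map[2 * 48];
	struct desc d0 = { 7, 0x0,      16, 0x8 };	/* conventional */
	struct desc d1 = { 5, 0x100000,  4, 0x8 };	/* runtime code */

	memset(map, 0, sizeof(map));
	memcpy(map, &d0, sizeof(d0));
	memcpy(map + 48, &d1, sizeof(d1));

	printf("type at 0x100000: %u\n",
	       mem_type(map, map + sizeof(map), 48, 0x100000));
	return 0;
}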
diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c
index e2be78f49399..cb91f985b4a1 100644
--- a/arch/x86/kernel/efi_32.c
+++ b/arch/x86/kernel/efi_32.c
@@ -20,40 +20,15 @@
20 */ 20 */
21 21
22#include <linux/kernel.h> 22#include <linux/kernel.h>
23#include <linux/init.h>
24#include <linux/mm.h>
25#include <linux/types.h> 23#include <linux/types.h>
26#include <linux/time.h>
27#include <linux/spinlock.h>
28#include <linux/bootmem.h>
29#include <linux/ioport.h> 24#include <linux/ioport.h>
30#include <linux/module.h>
31#include <linux/efi.h> 25#include <linux/efi.h>
32#include <linux/kexec.h>
33 26
34#include <asm/setup.h>
35#include <asm/io.h> 27#include <asm/io.h>
36#include <asm/page.h> 28#include <asm/page.h>
37#include <asm/pgtable.h> 29#include <asm/pgtable.h>
38#include <asm/processor.h>
39#include <asm/desc.h>
40#include <asm/tlbflush.h> 30#include <asm/tlbflush.h>
41 31
42#define EFI_DEBUG 0
43#define PFX "EFI: "
44
45extern efi_status_t asmlinkage efi_call_phys(void *, ...);
46
47struct efi efi;
48EXPORT_SYMBOL(efi);
49static struct efi efi_phys;
50struct efi_memory_map memmap;
51
52/*
53 * We require an early boot_ioremap mapping mechanism initially
54 */
55extern void * boot_ioremap(unsigned long, unsigned long);
56
57/* 32/*
58 * To make EFI call EFI runtime service in physical addressing mode we need 33 * To make EFI call EFI runtime service in physical addressing mode we need
59 * prelog/epilog before/after the invocation to disable interrupt, to 34 * prelog/epilog before/after the invocation to disable interrupt, to
@@ -62,16 +37,14 @@ extern void * boot_ioremap(unsigned long, unsigned long);
62 */ 37 */
63 38
64static unsigned long efi_rt_eflags; 39static unsigned long efi_rt_eflags;
65static DEFINE_SPINLOCK(efi_rt_lock);
66static pgd_t efi_bak_pg_dir_pointer[2]; 40static pgd_t efi_bak_pg_dir_pointer[2];
67 41
68static void efi_call_phys_prelog(void) __acquires(efi_rt_lock) 42void efi_call_phys_prelog(void)
69{ 43{
70 unsigned long cr4; 44 unsigned long cr4;
71 unsigned long temp; 45 unsigned long temp;
72 struct Xgt_desc_struct gdt_descr; 46 struct desc_ptr gdt_descr;
73 47
74 spin_lock(&efi_rt_lock);
75 local_irq_save(efi_rt_eflags); 48 local_irq_save(efi_rt_eflags);
76 49
77 /* 50 /*
@@ -101,17 +74,17 @@ static void efi_call_phys_prelog(void) __acquires(efi_rt_lock)
101 /* 74 /*
102 * After the lock is released, the original page table is restored. 75 * After the lock is released, the original page table is restored.
103 */ 76 */
104 local_flush_tlb(); 77 __flush_tlb_all();
105 78
106 gdt_descr.address = __pa(get_cpu_gdt_table(0)); 79 gdt_descr.address = __pa(get_cpu_gdt_table(0));
107 gdt_descr.size = GDT_SIZE - 1; 80 gdt_descr.size = GDT_SIZE - 1;
108 load_gdt(&gdt_descr); 81 load_gdt(&gdt_descr);
109} 82}
110 83
111static void efi_call_phys_epilog(void) __releases(efi_rt_lock) 84void efi_call_phys_epilog(void)
112{ 85{
113 unsigned long cr4; 86 unsigned long cr4;
114 struct Xgt_desc_struct gdt_descr; 87 struct desc_ptr gdt_descr;
115 88
116 gdt_descr.address = (unsigned long)get_cpu_gdt_table(0); 89 gdt_descr.address = (unsigned long)get_cpu_gdt_table(0);
117 gdt_descr.size = GDT_SIZE - 1; 90 gdt_descr.size = GDT_SIZE - 1;
@@ -132,586 +105,7 @@ static void efi_call_phys_epilog(void) __releases(efi_rt_lock)
132 /* 105 /*
133 * After the lock is released, the original page table is restored. 106 * After the lock is released, the original page table is restored.
134 */ 107 */
135 local_flush_tlb(); 108 __flush_tlb_all();
136 109
137 local_irq_restore(efi_rt_eflags); 110 local_irq_restore(efi_rt_eflags);
138 spin_unlock(&efi_rt_lock);
139}
140
141static efi_status_t
142phys_efi_set_virtual_address_map(unsigned long memory_map_size,
143 unsigned long descriptor_size,
144 u32 descriptor_version,
145 efi_memory_desc_t *virtual_map)
146{
147 efi_status_t status;
148
149 efi_call_phys_prelog();
150 status = efi_call_phys(efi_phys.set_virtual_address_map,
151 memory_map_size, descriptor_size,
152 descriptor_version, virtual_map);
153 efi_call_phys_epilog();
154 return status;
155}
156
157static efi_status_t
158phys_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
159{
160 efi_status_t status;
161
162 efi_call_phys_prelog();
163 status = efi_call_phys(efi_phys.get_time, tm, tc);
164 efi_call_phys_epilog();
165 return status;
166}
167
168inline int efi_set_rtc_mmss(unsigned long nowtime)
169{
170 int real_seconds, real_minutes;
171 efi_status_t status;
172 efi_time_t eft;
173 efi_time_cap_t cap;
174
175 spin_lock(&efi_rt_lock);
176 status = efi.get_time(&eft, &cap);
177 spin_unlock(&efi_rt_lock);
178 if (status != EFI_SUCCESS)
179 panic("Ooops, efitime: can't read time!\n");
180 real_seconds = nowtime % 60;
181 real_minutes = nowtime / 60;
182
183 if (((abs(real_minutes - eft.minute) + 15)/30) & 1)
184 real_minutes += 30;
185 real_minutes %= 60;
186
187 eft.minute = real_minutes;
188 eft.second = real_seconds;
189
190 if (status != EFI_SUCCESS) {
191 printk("Ooops: efitime: can't read time!\n");
192 return -1;
193 }
194 return 0;
195}
196/*
197 * This is used during kernel init before runtime
198 * services have been remapped and also during suspend, therefore,
199 * we'll need to call both in physical and virtual modes.
200 */
201inline unsigned long efi_get_time(void)
202{
203 efi_status_t status;
204 efi_time_t eft;
205 efi_time_cap_t cap;
206
207 if (efi.get_time) {
208 /* if we are in virtual mode use remapped function */
209 status = efi.get_time(&eft, &cap);
210 } else {
211 /* we are in physical mode */
212 status = phys_efi_get_time(&eft, &cap);
213 }
214
215 if (status != EFI_SUCCESS)
216 printk("Oops: efitime: can't read time status: 0x%lx\n",status);
217
218 return mktime(eft.year, eft.month, eft.day, eft.hour,
219 eft.minute, eft.second);
220}
221
222int is_available_memory(efi_memory_desc_t * md)
223{
224 if (!(md->attribute & EFI_MEMORY_WB))
225 return 0;
226
227 switch (md->type) {
228 case EFI_LOADER_CODE:
229 case EFI_LOADER_DATA:
230 case EFI_BOOT_SERVICES_CODE:
231 case EFI_BOOT_SERVICES_DATA:
232 case EFI_CONVENTIONAL_MEMORY:
233 return 1;
234 }
235 return 0;
236}
237
238/*
239 * We need to map the EFI memory map again after paging_init().
240 */
241void __init efi_map_memmap(void)
242{
243 memmap.map = NULL;
244
245 memmap.map = bt_ioremap((unsigned long) memmap.phys_map,
246 (memmap.nr_map * memmap.desc_size));
247 if (memmap.map == NULL)
248 printk(KERN_ERR PFX "Could not remap the EFI memmap!\n");
249
250 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
251}
252
253#if EFI_DEBUG
254static void __init print_efi_memmap(void)
255{
256 efi_memory_desc_t *md;
257 void *p;
258 int i;
259
260 for (p = memmap.map, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
261 md = p;
262 printk(KERN_INFO "mem%02u: type=%u, attr=0x%llx, "
263 "range=[0x%016llx-0x%016llx) (%lluMB)\n",
264 i, md->type, md->attribute, md->phys_addr,
265 md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
266 (md->num_pages >> (20 - EFI_PAGE_SHIFT)));
267 }
268}
269#endif /* EFI_DEBUG */
270
271/*
272 * Walks the EFI memory map and calls CALLBACK once for each EFI
273 * memory descriptor that has memory that is available for kernel use.
274 */
275void efi_memmap_walk(efi_freemem_callback_t callback, void *arg)
276{
277 int prev_valid = 0;
278 struct range {
279 unsigned long start;
280 unsigned long end;
281 } uninitialized_var(prev), curr;
282 efi_memory_desc_t *md;
283 unsigned long start, end;
284 void *p;
285
286 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
287 md = p;
288
289 if ((md->num_pages == 0) || (!is_available_memory(md)))
290 continue;
291
292 curr.start = md->phys_addr;
293 curr.end = curr.start + (md->num_pages << EFI_PAGE_SHIFT);
294
295 if (!prev_valid) {
296 prev = curr;
297 prev_valid = 1;
298 } else {
299 if (curr.start < prev.start)
300 printk(KERN_INFO PFX "Unordered memory map\n");
301 if (prev.end == curr.start)
302 prev.end = curr.end;
303 else {
304 start =
305 (unsigned long) (PAGE_ALIGN(prev.start));
306 end = (unsigned long) (prev.end & PAGE_MASK);
307 if ((end > start)
308 && (*callback) (start, end, arg) < 0)
309 return;
310 prev = curr;
311 }
312 }
313 }
314 if (prev_valid) {
315 start = (unsigned long) PAGE_ALIGN(prev.start);
316 end = (unsigned long) (prev.end & PAGE_MASK);
317 if (end > start)
318 (*callback) (start, end, arg);
319 }
320}
321
322void __init efi_init(void)
323{
324 efi_config_table_t *config_tables;
325 efi_runtime_services_t *runtime;
326 efi_char16_t *c16;
327 char vendor[100] = "unknown";
328 unsigned long num_config_tables;
329 int i = 0;
330
331 memset(&efi, 0, sizeof(efi) );
332 memset(&efi_phys, 0, sizeof(efi_phys));
333
334 efi_phys.systab =
335 (efi_system_table_t *)boot_params.efi_info.efi_systab;
336 memmap.phys_map = (void *)boot_params.efi_info.efi_memmap;
337 memmap.nr_map = boot_params.efi_info.efi_memmap_size/
338 boot_params.efi_info.efi_memdesc_size;
339 memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
340 memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
341
342 efi.systab = (efi_system_table_t *)
343 boot_ioremap((unsigned long) efi_phys.systab,
344 sizeof(efi_system_table_t));
345 /*
346 * Verify the EFI Table
347 */
348 if (efi.systab == NULL)
349 printk(KERN_ERR PFX "Woah! Couldn't map the EFI system table.\n");
350 if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
351 printk(KERN_ERR PFX "Woah! EFI system table signature incorrect\n");
352 if ((efi.systab->hdr.revision >> 16) == 0)
353 printk(KERN_ERR PFX "Warning: EFI system table version "
354 "%d.%02d, expected 1.00 or greater\n",
355 efi.systab->hdr.revision >> 16,
356 efi.systab->hdr.revision & 0xffff);
357
358 /*
359 * Grab some details from the system table
360 */
361 num_config_tables = efi.systab->nr_tables;
362 config_tables = (efi_config_table_t *)efi.systab->tables;
363 runtime = efi.systab->runtime;
364
365 /*
366 * Show what we know for posterity
367 */
368 c16 = (efi_char16_t *) boot_ioremap(efi.systab->fw_vendor, 2);
369 if (c16) {
370 for (i = 0; i < (sizeof(vendor) - 1) && *c16; ++i)
371 vendor[i] = *c16++;
372 vendor[i] = '\0';
373 } else
374 printk(KERN_ERR PFX "Could not map the firmware vendor!\n");
375
376 printk(KERN_INFO PFX "EFI v%u.%.02u by %s \n",
377 efi.systab->hdr.revision >> 16,
378 efi.systab->hdr.revision & 0xffff, vendor);
379
380 /*
381 * Let's see what config tables the firmware passed to us.
382 */
383 config_tables = (efi_config_table_t *)
384 boot_ioremap((unsigned long) config_tables,
385 num_config_tables * sizeof(efi_config_table_t));
386
387 if (config_tables == NULL)
388 printk(KERN_ERR PFX "Could not map EFI Configuration Table!\n");
389
390 efi.mps = EFI_INVALID_TABLE_ADDR;
391 efi.acpi = EFI_INVALID_TABLE_ADDR;
392 efi.acpi20 = EFI_INVALID_TABLE_ADDR;
393 efi.smbios = EFI_INVALID_TABLE_ADDR;
394 efi.sal_systab = EFI_INVALID_TABLE_ADDR;
395 efi.boot_info = EFI_INVALID_TABLE_ADDR;
396 efi.hcdp = EFI_INVALID_TABLE_ADDR;
397 efi.uga = EFI_INVALID_TABLE_ADDR;
398
399 for (i = 0; i < num_config_tables; i++) {
400 if (efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID) == 0) {
401 efi.mps = config_tables[i].table;
402 printk(KERN_INFO " MPS=0x%lx ", config_tables[i].table);
403 } else
404 if (efi_guidcmp(config_tables[i].guid, ACPI_20_TABLE_GUID) == 0) {
405 efi.acpi20 = config_tables[i].table;
406 printk(KERN_INFO " ACPI 2.0=0x%lx ", config_tables[i].table);
407 } else
408 if (efi_guidcmp(config_tables[i].guid, ACPI_TABLE_GUID) == 0) {
409 efi.acpi = config_tables[i].table;
410 printk(KERN_INFO " ACPI=0x%lx ", config_tables[i].table);
411 } else
412 if (efi_guidcmp(config_tables[i].guid, SMBIOS_TABLE_GUID) == 0) {
413 efi.smbios = config_tables[i].table;
414 printk(KERN_INFO " SMBIOS=0x%lx ", config_tables[i].table);
415 } else
416 if (efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID) == 0) {
417 efi.hcdp = config_tables[i].table;
418 printk(KERN_INFO " HCDP=0x%lx ", config_tables[i].table);
419 } else
420 if (efi_guidcmp(config_tables[i].guid, UGA_IO_PROTOCOL_GUID) == 0) {
421 efi.uga = config_tables[i].table;
422 printk(KERN_INFO " UGA=0x%lx ", config_tables[i].table);
423 }
424 }
425 printk("\n");
426
427 /*
428 * Check out the runtime services table. We need to map
429 * the runtime services table so that we can grab the physical
430 * address of several of the EFI runtime functions, needed to
431 * set the firmware into virtual mode.
432 */
433
434 runtime = (efi_runtime_services_t *) boot_ioremap((unsigned long)
435 runtime,
436 sizeof(efi_runtime_services_t));
437 if (runtime != NULL) {
438 /*
439 * We will only need *early* access to the following
440 * two EFI runtime services before set_virtual_address_map
441 * is invoked.
442 */
443 efi_phys.get_time = (efi_get_time_t *) runtime->get_time;
444 efi_phys.set_virtual_address_map =
445 (efi_set_virtual_address_map_t *)
446 runtime->set_virtual_address_map;
447 } else
448 printk(KERN_ERR PFX "Could not map the runtime service table!\n");
449
450 /* Map the EFI memory map for use until paging_init() */
451 memmap.map = boot_ioremap(boot_params.efi_info.efi_memmap,
452 boot_params.efi_info.efi_memmap_size);
453 if (memmap.map == NULL)
454 printk(KERN_ERR PFX "Could not map the EFI memory map!\n");
455
456 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
457
458#if EFI_DEBUG
459 print_efi_memmap();
460#endif
461}
462
463static inline void __init check_range_for_systab(efi_memory_desc_t *md)
464{
465 if (((unsigned long)md->phys_addr <= (unsigned long)efi_phys.systab) &&
466 ((unsigned long)efi_phys.systab < md->phys_addr +
467 ((unsigned long)md->num_pages << EFI_PAGE_SHIFT))) {
468 unsigned long addr;
469
470 addr = md->virt_addr - md->phys_addr +
471 (unsigned long)efi_phys.systab;
472 efi.systab = (efi_system_table_t *)addr;
473 }
474}
475
476/*
477 * Wrap all the virtual calls in a way that forces the parameters on the stack.
478 */
479
480#define efi_call_virt(f, args...) \
481 ((efi_##f##_t __attribute__((regparm(0)))*)efi.systab->runtime->f)(args)
482
483static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
484{
485 return efi_call_virt(get_time, tm, tc);
486}
487
488static efi_status_t virt_efi_set_time (efi_time_t *tm)
489{
490 return efi_call_virt(set_time, tm);
491}
492
493static efi_status_t virt_efi_get_wakeup_time (efi_bool_t *enabled,
494 efi_bool_t *pending,
495 efi_time_t *tm)
496{
497 return efi_call_virt(get_wakeup_time, enabled, pending, tm);
498}
499
500static efi_status_t virt_efi_set_wakeup_time (efi_bool_t enabled,
501 efi_time_t *tm)
502{
503 return efi_call_virt(set_wakeup_time, enabled, tm);
504}
505
506static efi_status_t virt_efi_get_variable (efi_char16_t *name,
507 efi_guid_t *vendor, u32 *attr,
508 unsigned long *data_size, void *data)
509{
510 return efi_call_virt(get_variable, name, vendor, attr, data_size, data);
511}
512
513static efi_status_t virt_efi_get_next_variable (unsigned long *name_size,
514 efi_char16_t *name,
515 efi_guid_t *vendor)
516{
517 return efi_call_virt(get_next_variable, name_size, name, vendor);
518}
519
520static efi_status_t virt_efi_set_variable (efi_char16_t *name,
521 efi_guid_t *vendor,
522 unsigned long attr,
523 unsigned long data_size, void *data)
524{
525 return efi_call_virt(set_variable, name, vendor, attr, data_size, data);
526}
527
528static efi_status_t virt_efi_get_next_high_mono_count (u32 *count)
529{
530 return efi_call_virt(get_next_high_mono_count, count);
531}
532
533static void virt_efi_reset_system (int reset_type, efi_status_t status,
534 unsigned long data_size,
535 efi_char16_t *data)
536{
537 efi_call_virt(reset_system, reset_type, status, data_size, data);
538}
539
540/*
541 * This function will switch the EFI runtime services to virtual mode.
542 * Essentially, look through the EFI memmap and map every region that
543 * has the runtime attribute bit set in its memory descriptor and update
544 * that memory descriptor with the virtual address obtained from ioremap().
545 * This enables the runtime services to be called without having to
546 * thunk back into physical mode for every invocation.
547 */
548
549void __init efi_enter_virtual_mode(void)
550{
551 efi_memory_desc_t *md;
552 efi_status_t status;
553 void *p;
554
555 efi.systab = NULL;
556
557 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
558 md = p;
559
560 if (!(md->attribute & EFI_MEMORY_RUNTIME))
561 continue;
562
563 md->virt_addr = (unsigned long)ioremap(md->phys_addr,
564 md->num_pages << EFI_PAGE_SHIFT);
565 if (!(unsigned long)md->virt_addr) {
566 printk(KERN_ERR PFX "ioremap of 0x%lX failed\n",
567 (unsigned long)md->phys_addr);
568 }
569 /* update the virtual address of the EFI system table */
570 check_range_for_systab(md);
571 }
572
573 BUG_ON(!efi.systab);
574
575 status = phys_efi_set_virtual_address_map(
576 memmap.desc_size * memmap.nr_map,
577 memmap.desc_size,
578 memmap.desc_version,
579 memmap.phys_map);
580
581 if (status != EFI_SUCCESS) {
582 printk (KERN_ALERT "You are screwed! "
583 "Unable to switch EFI into virtual mode "
584 "(status=%lx)\n", status);
585 panic("EFI call to SetVirtualAddressMap() failed!");
586 }
587
588 /*
589 * Now that EFI is in virtual mode, update the function
590 * pointers in the runtime service table to the new virtual addresses.
591 */
592
593 efi.get_time = virt_efi_get_time;
594 efi.set_time = virt_efi_set_time;
595 efi.get_wakeup_time = virt_efi_get_wakeup_time;
596 efi.set_wakeup_time = virt_efi_set_wakeup_time;
597 efi.get_variable = virt_efi_get_variable;
598 efi.get_next_variable = virt_efi_get_next_variable;
599 efi.set_variable = virt_efi_set_variable;
600 efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
601 efi.reset_system = virt_efi_reset_system;
602}
603
604void __init
605efi_initialize_iomem_resources(struct resource *code_resource,
606 struct resource *data_resource,
607 struct resource *bss_resource)
608{
609 struct resource *res;
610 efi_memory_desc_t *md;
611 void *p;
612
613 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
614 md = p;
615
616 if ((md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >
617 0x100000000ULL)
618 continue;
619 res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
620 switch (md->type) {
621 case EFI_RESERVED_TYPE:
622 res->name = "Reserved Memory";
623 break;
624 case EFI_LOADER_CODE:
625 res->name = "Loader Code";
626 break;
627 case EFI_LOADER_DATA:
628 res->name = "Loader Data";
629 break;
630 case EFI_BOOT_SERVICES_DATA:
631 res->name = "BootServices Data";
632 break;
633 case EFI_BOOT_SERVICES_CODE:
634 res->name = "BootServices Code";
635 break;
636 case EFI_RUNTIME_SERVICES_CODE:
637 res->name = "Runtime Service Code";
638 break;
639 case EFI_RUNTIME_SERVICES_DATA:
640 res->name = "Runtime Service Data";
641 break;
642 case EFI_CONVENTIONAL_MEMORY:
643 res->name = "Conventional Memory";
644 break;
645 case EFI_UNUSABLE_MEMORY:
646 res->name = "Unusable Memory";
647 break;
648 case EFI_ACPI_RECLAIM_MEMORY:
649 res->name = "ACPI Reclaim";
650 break;
651 case EFI_ACPI_MEMORY_NVS:
652 res->name = "ACPI NVS";
653 break;
654 case EFI_MEMORY_MAPPED_IO:
655 res->name = "Memory Mapped IO";
656 break;
657 case EFI_MEMORY_MAPPED_IO_PORT_SPACE:
658 res->name = "Memory Mapped IO Port Space";
659 break;
660 default:
661 res->name = "Reserved";
662 break;
663 }
664 res->start = md->phys_addr;
665 res->end = res->start + ((md->num_pages << EFI_PAGE_SHIFT) - 1);
666 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
667 if (request_resource(&iomem_resource, res) < 0)
668 printk(KERN_ERR PFX "Failed to allocate res %s : "
669 "0x%llx-0x%llx\n", res->name,
670 (unsigned long long)res->start,
671 (unsigned long long)res->end);
672 /*
673 * We don't know which region contains kernel data so we try
674 * it repeatedly and let the resource manager test it.
675 */
676 if (md->type == EFI_CONVENTIONAL_MEMORY) {
677 request_resource(res, code_resource);
678 request_resource(res, data_resource);
679 request_resource(res, bss_resource);
680#ifdef CONFIG_KEXEC
681 request_resource(res, &crashk_res);
682#endif
683 }
684 }
685}
686
687/*
688 * Convenience functions to obtain memory types and attributes
689 */
690
691u32 efi_mem_type(unsigned long phys_addr)
692{
693 efi_memory_desc_t *md;
694 void *p;
695
696 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
697 md = p;
698 if ((md->phys_addr <= phys_addr) && (phys_addr <
699 (md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) ))
700 return md->type;
701 }
702 return 0;
703}
704
705u64 efi_mem_attributes(unsigned long phys_addr)
706{
707 efi_memory_desc_t *md;
708 void *p;
709
710 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
711 md = p;
712 if ((md->phys_addr <= phys_addr) && (phys_addr <
713 (md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) ))
714 return md->attribute;
715 }
716 return 0;
717} 111}
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
new file mode 100644
index 000000000000..4b73992c1e11
--- /dev/null
+++ b/arch/x86/kernel/efi_64.c
@@ -0,0 +1,134 @@
1/*
2 * x86_64 specific EFI support functions
3 * Based on Extensible Firmware Interface Specification version 1.0
4 *
5 * Copyright (C) 2005-2008 Intel Co.
6 * Fenghua Yu <fenghua.yu@intel.com>
7 * Bibo Mao <bibo.mao@intel.com>
8 * Chandramouli Narayanan <mouli@linux.intel.com>
9 * Huang Ying <ying.huang@intel.com>
10 *
11 * Code to convert EFI to E820 map has been implemented in elilo bootloader
12 * based on an EFI patch by Edgar Hucek. Based on the E820 map, the page table
13 * is set up appropriately for EFI runtime code.
14 * - mouli 06/14/2007.
15 *
16 */
17
18#include <linux/kernel.h>
19#include <linux/init.h>
20#include <linux/mm.h>
21#include <linux/types.h>
22#include <linux/spinlock.h>
23#include <linux/bootmem.h>
24#include <linux/ioport.h>
25#include <linux/module.h>
26#include <linux/efi.h>
27#include <linux/uaccess.h>
28#include <linux/io.h>
29#include <linux/reboot.h>
30
31#include <asm/setup.h>
32#include <asm/page.h>
33#include <asm/e820.h>
34#include <asm/pgtable.h>
35#include <asm/tlbflush.h>
36#include <asm/proto.h>
37#include <asm/efi.h>
38
39static pgd_t save_pgd __initdata;
40static unsigned long efi_flags __initdata;
41
42static void __init early_mapping_set_exec(unsigned long start,
43 unsigned long end,
44 int executable)
45{
46 pte_t *kpte;
47 int level;
48
49 while (start < end) {
50 kpte = lookup_address((unsigned long)__va(start), &level);
51 BUG_ON(!kpte);
52 if (executable)
53 set_pte(kpte, pte_mkexec(*kpte));
54 else
55 set_pte(kpte, __pte((pte_val(*kpte) | _PAGE_NX) & \
56 __supported_pte_mask));
57 if (level == 4)
58 start = (start + PMD_SIZE) & PMD_MASK;
59 else
60 start = (start + PAGE_SIZE) & PAGE_MASK;
61 }
62}
63
64static void __init early_runtime_code_mapping_set_exec(int executable)
65{
66 efi_memory_desc_t *md;
67 void *p;
68
69 if (!(__supported_pte_mask & _PAGE_NX))
70 return;
71
72 /* Make EFI runtime service code area executable */
73 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
74 md = p;
75 if (md->type == EFI_RUNTIME_SERVICES_CODE) {
76 unsigned long end;
77 end = md->phys_addr + (md->num_pages << PAGE_SHIFT);
78 early_mapping_set_exec(md->phys_addr, end, executable);
79 }
80 }
81}
82
83void __init efi_call_phys_prelog(void)
84{
85 unsigned long vaddress;
86
87 local_irq_save(efi_flags);
88 early_runtime_code_mapping_set_exec(1);
89 vaddress = (unsigned long)__va(0x0UL);
90 save_pgd = *pgd_offset_k(0x0UL);
91 set_pgd(pgd_offset_k(0x0UL), *pgd_offset_k(vaddress));
92 __flush_tlb_all();
93}
94
95void __init efi_call_phys_epilog(void)
96{
97 /*
98	 * Restore the original page table now that the physical-mode call is done.
99 */
100 set_pgd(pgd_offset_k(0x0UL), save_pgd);
101 early_runtime_code_mapping_set_exec(0);
102 __flush_tlb_all();
103 local_irq_restore(efi_flags);
104}
105
106void __init efi_reserve_bootmem(void)
107{
108 reserve_bootmem_generic((unsigned long)memmap.phys_map,
109 memmap.nr_map * memmap.desc_size);
110}
111
112void __iomem * __init efi_ioremap(unsigned long offset,
113 unsigned long size)
114{
115 static unsigned pages_mapped;
116 unsigned long last_addr;
117 unsigned i, pages;
118
119 last_addr = offset + size - 1;
120 offset &= PAGE_MASK;
121 pages = (PAGE_ALIGN(last_addr) - offset) >> PAGE_SHIFT;
122 if (pages_mapped + pages > MAX_EFI_IO_PAGES)
123 return NULL;
124
125 for (i = 0; i < pages; i++) {
126 __set_fixmap(FIX_EFI_IO_MAP_FIRST_PAGE - pages_mapped,
127 offset, PAGE_KERNEL_EXEC_NOCACHE);
128 offset += PAGE_SIZE;
129 pages_mapped++;
130 }
131
132 return (void __iomem *)__fix_to_virt(FIX_EFI_IO_MAP_FIRST_PAGE - \
133 (pages_mapped - pages));
134}
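
efi_ioremap() above parcels out virtual pages from a fixed fixmap window (FIX_EFI_IO_MAP_FIRST_PAGE downwards) with a simple high-water counter and never unmaps anything. A rough stand-alone model of that bump-style window is shown below; the pool size, base address and map_page() helper are invented for the example and stand in for the kernel's fixmap machinery.

/* Bump-allocated virtual window: pages are handed out in order and
 * never reclaimed, just enough for a handful of early mappings. */
#include <stdio.h>

#define PAGE_SZ    4096UL
#define POOL_PAGES 64UL		/* stands in for MAX_EFI_IO_PAGES */

static unsigned long pool_base = 0xffffc20000000000UL;	/* made-up VA */
static unsigned long pages_used;

static void map_page(unsigned long vaddr, unsigned long paddr)
{
	/* the kernel would call __set_fixmap() here */
	printf("  map va %#lx -> pa %#lx\n", vaddr, paddr);
}

static void *io_window(unsigned long phys, unsigned long size)
{
	unsigned long first = phys & ~(PAGE_SZ - 1);
	unsigned long last  = (phys + size - 1) & ~(PAGE_SZ - 1);
	unsigned long pages = (last - first) / PAGE_SZ + 1;
	unsigned long start = pages_used, i;

	if (pages_used + pages > POOL_PAGES)
		return NULL;			/* window exhausted */

	for (i = 0; i < pages; i++, pages_used++)
		map_page(pool_base + pages_used * PAGE_SZ,
			 first + i * PAGE_SZ);

	return (void *)(pool_base + start * PAGE_SZ + (phys - first));
}

int main(void)
{
	printf("mapping 0x12345678, 8 KiB:\n");
	printf("  va = %p\n", io_window(0x12345678UL, 8192));
	return 0;
}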
diff --git a/arch/x86/kernel/efi_stub_64.S b/arch/x86/kernel/efi_stub_64.S
new file mode 100644
index 000000000000..99b47d48c9f4
--- /dev/null
+++ b/arch/x86/kernel/efi_stub_64.S
@@ -0,0 +1,109 @@
1/*
2 * Function calling ABI conversion from Linux to EFI for x86_64
3 *
4 * Copyright (C) 2007 Intel Corp
5 * Bibo Mao <bibo.mao@intel.com>
6 * Huang Ying <ying.huang@intel.com>
7 */
8
9#include <linux/linkage.h>
10
11#define SAVE_XMM \
12 mov %rsp, %rax; \
13 subq $0x70, %rsp; \
14 and $~0xf, %rsp; \
15 mov %rax, (%rsp); \
16 mov %cr0, %rax; \
17 clts; \
18 mov %rax, 0x8(%rsp); \
19 movaps %xmm0, 0x60(%rsp); \
20 movaps %xmm1, 0x50(%rsp); \
21 movaps %xmm2, 0x40(%rsp); \
22 movaps %xmm3, 0x30(%rsp); \
23 movaps %xmm4, 0x20(%rsp); \
24 movaps %xmm5, 0x10(%rsp)
25
26#define RESTORE_XMM \
27 movaps 0x60(%rsp), %xmm0; \
28 movaps 0x50(%rsp), %xmm1; \
29 movaps 0x40(%rsp), %xmm2; \
30 movaps 0x30(%rsp), %xmm3; \
31 movaps 0x20(%rsp), %xmm4; \
32 movaps 0x10(%rsp), %xmm5; \
33 mov 0x8(%rsp), %rsi; \
34 mov %rsi, %cr0; \
35 mov (%rsp), %rsp
36
37ENTRY(efi_call0)
38 SAVE_XMM
39 subq $32, %rsp
40 call *%rdi
41 addq $32, %rsp
42 RESTORE_XMM
43 ret
44
45ENTRY(efi_call1)
46 SAVE_XMM
47 subq $32, %rsp
48 mov %rsi, %rcx
49 call *%rdi
50 addq $32, %rsp
51 RESTORE_XMM
52 ret
53
54ENTRY(efi_call2)
55 SAVE_XMM
56 subq $32, %rsp
57 mov %rsi, %rcx
58 call *%rdi
59 addq $32, %rsp
60 RESTORE_XMM
61 ret
62
63ENTRY(efi_call3)
64 SAVE_XMM
65 subq $32, %rsp
66 mov %rcx, %r8
67 mov %rsi, %rcx
68 call *%rdi
69 addq $32, %rsp
70 RESTORE_XMM
71 ret
72
73ENTRY(efi_call4)
74 SAVE_XMM
75 subq $32, %rsp
76 mov %r8, %r9
77 mov %rcx, %r8
78 mov %rsi, %rcx
79 call *%rdi
80 addq $32, %rsp
81 RESTORE_XMM
82 ret
83
84ENTRY(efi_call5)
85 SAVE_XMM
86 subq $48, %rsp
87 mov %r9, 32(%rsp)
88 mov %r8, %r9
89 mov %rcx, %r8
90 mov %rsi, %rcx
91 call *%rdi
92 addq $48, %rsp
93 RESTORE_XMM
94 ret
95
96ENTRY(efi_call6)
97 SAVE_XMM
98 mov (%rsp), %rax
99 mov 8(%rax), %rax
100 subq $48, %rsp
101 mov %r9, 32(%rsp)
102 mov %rax, 40(%rsp)
103 mov %r8, %r9
104 mov %rcx, %r8
105 mov %rsi, %rcx
106 call *%rdi
107 addq $48, %rsp
108 RESTORE_XMM
109 ret
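
The stubs above exist because the kernel's own calling convention (System V: arguments in %rdi, %rsi, %rdx, %rcx, %r8, %r9) differs from the one EFI firmware expects (Microsoft x64: %rcx, %rdx, %r8, %r9, then the stack, plus 32 bytes of shadow space, which is what the subq $32, %rsp provides). Since %rdi carries the target function pointer, every argument is shifted one register to the right before the indirect call, and %xmm0 through %xmm5 are saved and restored around it. A compiler can perform the same argument shuffle on its own via the ms_abi attribute; the snippet below is only a demonstration of the convention and assumes gcc or clang on x86-64.

/* Demonstrates the SysV -> Microsoft x64 argument shuffle that the
 * efi_callN stubs perform by hand. */
#include <stdio.h>

/* A stand-in "firmware service" that expects MS x64 argument registers. */
static __attribute__((ms_abi)) unsigned long
fake_efi_service(unsigned long a, unsigned long b, unsigned long c)
{
	return a + b + c;
}

int main(void)
{
	/* For this call the compiler places a, b, c in %rcx, %rdx, %r8,
	 * mirroring what ENTRY(efi_call3) does with its mov instructions. */
	printf("%lu\n", fake_efi_service(1, 2, 3));
	return 0;
}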
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index dc7f938e5015..be5c31d04884 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -58,7 +58,7 @@
58 * for paravirtualization. The following will never clobber any registers: 58 * for paravirtualization. The following will never clobber any registers:
59 * INTERRUPT_RETURN (aka. "iret") 59 * INTERRUPT_RETURN (aka. "iret")
60 * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") 60 * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
61 * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). 61 * ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit").
62 * 62 *
63 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must 63 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
64 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). 64 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
@@ -283,12 +283,12 @@ END(resume_kernel)
283 the vsyscall page. See vsyscall-sysenter.S, which defines the symbol. */ 283 the vsyscall page. See vsyscall-sysenter.S, which defines the symbol. */
284 284
285 # sysenter call handler stub 285 # sysenter call handler stub
286ENTRY(sysenter_entry) 286ENTRY(ia32_sysenter_target)
287 CFI_STARTPROC simple 287 CFI_STARTPROC simple
288 CFI_SIGNAL_FRAME 288 CFI_SIGNAL_FRAME
289 CFI_DEF_CFA esp, 0 289 CFI_DEF_CFA esp, 0
290 CFI_REGISTER esp, ebp 290 CFI_REGISTER esp, ebp
291 movl TSS_sysenter_esp0(%esp),%esp 291 movl TSS_sysenter_sp0(%esp),%esp
292sysenter_past_esp: 292sysenter_past_esp:
293 /* 293 /*
294 * No need to follow this irqs on/off section: the syscall 294 * No need to follow this irqs on/off section: the syscall
@@ -351,7 +351,7 @@ sysenter_past_esp:
351 xorl %ebp,%ebp 351 xorl %ebp,%ebp
352 TRACE_IRQS_ON 352 TRACE_IRQS_ON
3531: mov PT_FS(%esp), %fs 3531: mov PT_FS(%esp), %fs
354 ENABLE_INTERRUPTS_SYSEXIT 354 ENABLE_INTERRUPTS_SYSCALL_RET
355 CFI_ENDPROC 355 CFI_ENDPROC
356.pushsection .fixup,"ax" 356.pushsection .fixup,"ax"
3572: movl $0,PT_FS(%esp) 3572: movl $0,PT_FS(%esp)
@@ -360,7 +360,7 @@ sysenter_past_esp:
360 .align 4 360 .align 4
361 .long 1b,2b 361 .long 1b,2b
362.popsection 362.popsection
363ENDPROC(sysenter_entry) 363ENDPROC(ia32_sysenter_target)
364 364
365 # system call handler stub 365 # system call handler stub
366ENTRY(system_call) 366ENTRY(system_call)
@@ -583,7 +583,7 @@ END(syscall_badsys)
583 * Build the entry stubs and pointer table with 583 * Build the entry stubs and pointer table with
584 * some assembler magic. 584 * some assembler magic.
585 */ 585 */
586.data 586.section .rodata,"a"
587ENTRY(interrupt) 587ENTRY(interrupt)
588.text 588.text
589 589
@@ -743,7 +743,7 @@ END(device_not_available)
743 * that sets up the real kernel stack. Check here, since we can't 743 * that sets up the real kernel stack. Check here, since we can't
744 * allow the wrong stack to be used. 744 * allow the wrong stack to be used.
745 * 745 *
746 * "TSS_sysenter_esp0+12" is because the NMI/debug handler will have 746 * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
747 * already pushed 3 words if it hits on the sysenter instruction: 747 * already pushed 3 words if it hits on the sysenter instruction:
748 * eflags, cs and eip. 748 * eflags, cs and eip.
749 * 749 *
@@ -755,7 +755,7 @@ END(device_not_available)
755 cmpw $__KERNEL_CS,4(%esp); \ 755 cmpw $__KERNEL_CS,4(%esp); \
756 jne ok; \ 756 jne ok; \
757label: \ 757label: \
758 movl TSS_sysenter_esp0+offset(%esp),%esp; \ 758 movl TSS_sysenter_sp0+offset(%esp),%esp; \
759 CFI_DEF_CFA esp, 0; \ 759 CFI_DEF_CFA esp, 0; \
760 CFI_UNDEFINED eip; \ 760 CFI_UNDEFINED eip; \
761 pushfl; \ 761 pushfl; \
@@ -768,7 +768,7 @@ label: \
768 768
769KPROBE_ENTRY(debug) 769KPROBE_ENTRY(debug)
770 RING0_INT_FRAME 770 RING0_INT_FRAME
771 cmpl $sysenter_entry,(%esp) 771 cmpl $ia32_sysenter_target,(%esp)
772 jne debug_stack_correct 772 jne debug_stack_correct
773 FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) 773 FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
774debug_stack_correct: 774debug_stack_correct:
@@ -799,7 +799,7 @@ KPROBE_ENTRY(nmi)
799 popl %eax 799 popl %eax
800 CFI_ADJUST_CFA_OFFSET -4 800 CFI_ADJUST_CFA_OFFSET -4
801 je nmi_espfix_stack 801 je nmi_espfix_stack
802 cmpl $sysenter_entry,(%esp) 802 cmpl $ia32_sysenter_target,(%esp)
803 je nmi_stack_fixup 803 je nmi_stack_fixup
804 pushl %eax 804 pushl %eax
805 CFI_ADJUST_CFA_OFFSET 4 805 CFI_ADJUST_CFA_OFFSET 4
@@ -812,7 +812,7 @@ KPROBE_ENTRY(nmi)
812 popl %eax 812 popl %eax
813 CFI_ADJUST_CFA_OFFSET -4 813 CFI_ADJUST_CFA_OFFSET -4
814 jae nmi_stack_correct 814 jae nmi_stack_correct
815 cmpl $sysenter_entry,12(%esp) 815 cmpl $ia32_sysenter_target,12(%esp)
816 je nmi_debug_stack_check 816 je nmi_debug_stack_check
817nmi_stack_correct: 817nmi_stack_correct:
818 /* We have a RING0_INT_FRAME here */ 818 /* We have a RING0_INT_FRAME here */
@@ -882,10 +882,10 @@ ENTRY(native_iret)
882.previous 882.previous
883END(native_iret) 883END(native_iret)
884 884
885ENTRY(native_irq_enable_sysexit) 885ENTRY(native_irq_enable_syscall_ret)
886 sti 886 sti
887 sysexit 887 sysexit
888END(native_irq_enable_sysexit) 888END(native_irq_enable_syscall_ret)
889#endif 889#endif
890 890
891KPROBE_ENTRY(int3) 891KPROBE_ENTRY(int3)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e70f3881d7e4..bea8474744ff 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -50,6 +50,7 @@
50#include <asm/hw_irq.h> 50#include <asm/hw_irq.h>
51#include <asm/page.h> 51#include <asm/page.h>
52#include <asm/irqflags.h> 52#include <asm/irqflags.h>
53#include <asm/paravirt.h>
53 54
54 .code64 55 .code64
55 56
@@ -57,6 +58,13 @@
57#define retint_kernel retint_restore_args 58#define retint_kernel retint_restore_args
58#endif 59#endif
59 60
61#ifdef CONFIG_PARAVIRT
62ENTRY(native_irq_enable_syscall_ret)
63 movq %gs:pda_oldrsp,%rsp
64 swapgs
65 sysretq
66#endif /* CONFIG_PARAVIRT */
67
60 68
61.macro TRACE_IRQS_IRETQ offset=ARGOFFSET 69.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
62#ifdef CONFIG_TRACE_IRQFLAGS 70#ifdef CONFIG_TRACE_IRQFLAGS
@@ -216,14 +224,21 @@ ENTRY(system_call)
216 CFI_DEF_CFA rsp,PDA_STACKOFFSET 224 CFI_DEF_CFA rsp,PDA_STACKOFFSET
217 CFI_REGISTER rip,rcx 225 CFI_REGISTER rip,rcx
218 /*CFI_REGISTER rflags,r11*/ 226 /*CFI_REGISTER rflags,r11*/
219 swapgs 227 SWAPGS_UNSAFE_STACK
228 /*
229 * A hypervisor implementation might want to use a label
230 * after the swapgs, so that it can do the swapgs
231 * for the guest and jump here on syscall.
232 */
233ENTRY(system_call_after_swapgs)
234
220 movq %rsp,%gs:pda_oldrsp 235 movq %rsp,%gs:pda_oldrsp
221 movq %gs:pda_kernelstack,%rsp 236 movq %gs:pda_kernelstack,%rsp
222 /* 237 /*
223 * No need to follow this irqs off/on section - it's straight 238 * No need to follow this irqs off/on section - it's straight
224 * and short: 239 * and short:
225 */ 240 */
226 sti 241 ENABLE_INTERRUPTS(CLBR_NONE)
227 SAVE_ARGS 8,1 242 SAVE_ARGS 8,1
228 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) 243 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
229 movq %rcx,RIP-ARGOFFSET(%rsp) 244 movq %rcx,RIP-ARGOFFSET(%rsp)
@@ -246,7 +261,7 @@ ret_from_sys_call:
246sysret_check: 261sysret_check:
247 LOCKDEP_SYS_EXIT 262 LOCKDEP_SYS_EXIT
248 GET_THREAD_INFO(%rcx) 263 GET_THREAD_INFO(%rcx)
249 cli 264 DISABLE_INTERRUPTS(CLBR_NONE)
250 TRACE_IRQS_OFF 265 TRACE_IRQS_OFF
251 movl threadinfo_flags(%rcx),%edx 266 movl threadinfo_flags(%rcx),%edx
252 andl %edi,%edx 267 andl %edi,%edx
@@ -260,9 +275,7 @@ sysret_check:
260 CFI_REGISTER rip,rcx 275 CFI_REGISTER rip,rcx
261 RESTORE_ARGS 0,-ARG_SKIP,1 276 RESTORE_ARGS 0,-ARG_SKIP,1
262 /*CFI_REGISTER rflags,r11*/ 277 /*CFI_REGISTER rflags,r11*/
263 movq %gs:pda_oldrsp,%rsp 278 ENABLE_INTERRUPTS_SYSCALL_RET
264 swapgs
265 sysretq
266 279
267 CFI_RESTORE_STATE 280 CFI_RESTORE_STATE
268 /* Handle reschedules */ 281 /* Handle reschedules */
@@ -271,7 +284,7 @@ sysret_careful:
271 bt $TIF_NEED_RESCHED,%edx 284 bt $TIF_NEED_RESCHED,%edx
272 jnc sysret_signal 285 jnc sysret_signal
273 TRACE_IRQS_ON 286 TRACE_IRQS_ON
274 sti 287 ENABLE_INTERRUPTS(CLBR_NONE)
275 pushq %rdi 288 pushq %rdi
276 CFI_ADJUST_CFA_OFFSET 8 289 CFI_ADJUST_CFA_OFFSET 8
277 call schedule 290 call schedule
@@ -282,7 +295,7 @@ sysret_careful:
282 /* Handle a signal */ 295 /* Handle a signal */
283sysret_signal: 296sysret_signal:
284 TRACE_IRQS_ON 297 TRACE_IRQS_ON
285 sti 298 ENABLE_INTERRUPTS(CLBR_NONE)
286 testl $_TIF_DO_NOTIFY_MASK,%edx 299 testl $_TIF_DO_NOTIFY_MASK,%edx
287 jz 1f 300 jz 1f
288 301
@@ -295,7 +308,7 @@ sysret_signal:
2951: movl $_TIF_NEED_RESCHED,%edi 3081: movl $_TIF_NEED_RESCHED,%edi
296 /* Use IRET because user could have changed frame. This 309 /* Use IRET because user could have changed frame. This
297 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ 310 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
298 cli 311 DISABLE_INTERRUPTS(CLBR_NONE)
299 TRACE_IRQS_OFF 312 TRACE_IRQS_OFF
300 jmp int_with_check 313 jmp int_with_check
301 314
@@ -327,7 +340,7 @@ tracesys:
327 */ 340 */
328 .globl int_ret_from_sys_call 341 .globl int_ret_from_sys_call
329int_ret_from_sys_call: 342int_ret_from_sys_call:
330 cli 343 DISABLE_INTERRUPTS(CLBR_NONE)
331 TRACE_IRQS_OFF 344 TRACE_IRQS_OFF
332 testl $3,CS-ARGOFFSET(%rsp) 345 testl $3,CS-ARGOFFSET(%rsp)
333 je retint_restore_args 346 je retint_restore_args
@@ -349,20 +362,20 @@ int_careful:
349 bt $TIF_NEED_RESCHED,%edx 362 bt $TIF_NEED_RESCHED,%edx
350 jnc int_very_careful 363 jnc int_very_careful
351 TRACE_IRQS_ON 364 TRACE_IRQS_ON
352 sti 365 ENABLE_INTERRUPTS(CLBR_NONE)
353 pushq %rdi 366 pushq %rdi
354 CFI_ADJUST_CFA_OFFSET 8 367 CFI_ADJUST_CFA_OFFSET 8
355 call schedule 368 call schedule
356 popq %rdi 369 popq %rdi
357 CFI_ADJUST_CFA_OFFSET -8 370 CFI_ADJUST_CFA_OFFSET -8
358 cli 371 DISABLE_INTERRUPTS(CLBR_NONE)
359 TRACE_IRQS_OFF 372 TRACE_IRQS_OFF
360 jmp int_with_check 373 jmp int_with_check
361 374
362 /* handle signals and tracing -- both require a full stack frame */ 375 /* handle signals and tracing -- both require a full stack frame */
363int_very_careful: 376int_very_careful:
364 TRACE_IRQS_ON 377 TRACE_IRQS_ON
365 sti 378 ENABLE_INTERRUPTS(CLBR_NONE)
366 SAVE_REST 379 SAVE_REST
367 /* Check for syscall exit trace */ 380 /* Check for syscall exit trace */
368 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx 381 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
@@ -385,7 +398,7 @@ int_signal:
3851: movl $_TIF_NEED_RESCHED,%edi 3981: movl $_TIF_NEED_RESCHED,%edi
386int_restore_rest: 399int_restore_rest:
387 RESTORE_REST 400 RESTORE_REST
388 cli 401 DISABLE_INTERRUPTS(CLBR_NONE)
389 TRACE_IRQS_OFF 402 TRACE_IRQS_OFF
390 jmp int_with_check 403 jmp int_with_check
391 CFI_ENDPROC 404 CFI_ENDPROC
@@ -506,7 +519,7 @@ END(stub_rt_sigreturn)
506 CFI_DEF_CFA_REGISTER rbp 519 CFI_DEF_CFA_REGISTER rbp
507 testl $3,CS(%rdi) 520 testl $3,CS(%rdi)
508 je 1f 521 je 1f
509 swapgs 522 SWAPGS
510 /* irqcount is used to check if a CPU is already on an interrupt 523 /* irqcount is used to check if a CPU is already on an interrupt
511 stack or not. While this is essentially redundant with preempt_count 524 stack or not. While this is essentially redundant with preempt_count
512 it is a little cheaper to use a separate counter in the PDA 525 it is a little cheaper to use a separate counter in the PDA
@@ -527,7 +540,7 @@ ENTRY(common_interrupt)
527 interrupt do_IRQ 540 interrupt do_IRQ
528 /* 0(%rsp): oldrsp-ARGOFFSET */ 541 /* 0(%rsp): oldrsp-ARGOFFSET */
529ret_from_intr: 542ret_from_intr:
530 cli 543 DISABLE_INTERRUPTS(CLBR_NONE)
531 TRACE_IRQS_OFF 544 TRACE_IRQS_OFF
532 decl %gs:pda_irqcount 545 decl %gs:pda_irqcount
533 leaveq 546 leaveq
@@ -556,13 +569,13 @@ retint_swapgs: /* return to user-space */
556 /* 569 /*
557 * The iretq could re-enable interrupts: 570 * The iretq could re-enable interrupts:
558 */ 571 */
559 cli 572 DISABLE_INTERRUPTS(CLBR_ANY)
560 TRACE_IRQS_IRETQ 573 TRACE_IRQS_IRETQ
561 swapgs 574 SWAPGS
562 jmp restore_args 575 jmp restore_args
563 576
564retint_restore_args: /* return to kernel space */ 577retint_restore_args: /* return to kernel space */
565 cli 578 DISABLE_INTERRUPTS(CLBR_ANY)
566 /* 579 /*
567 * The iretq could re-enable interrupts: 580 * The iretq could re-enable interrupts:
568 */ 581 */
@@ -570,10 +583,14 @@ retint_restore_args: /* return to kernel space */
570restore_args: 583restore_args:
571 RESTORE_ARGS 0,8,0 584 RESTORE_ARGS 0,8,0
572iret_label: 585iret_label:
586#ifdef CONFIG_PARAVIRT
587 INTERRUPT_RETURN
588#endif
589ENTRY(native_iret)
573 iretq 590 iretq
574 591
575 .section __ex_table,"a" 592 .section __ex_table,"a"
576 .quad iret_label,bad_iret 593 .quad native_iret, bad_iret
577 .previous 594 .previous
578 .section .fixup,"ax" 595 .section .fixup,"ax"
579 /* force a signal here? this matches i386 behaviour */ 596 /* force a signal here? this matches i386 behaviour */
@@ -581,24 +598,24 @@ iret_label:
581bad_iret: 598bad_iret:
582 movq $11,%rdi /* SIGSEGV */ 599 movq $11,%rdi /* SIGSEGV */
583 TRACE_IRQS_ON 600 TRACE_IRQS_ON
584 sti 601 ENABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI))
585 jmp do_exit 602 jmp do_exit
586 .previous 603 .previous
587 604
588 /* edi: workmask, edx: work */ 605 /* edi: workmask, edx: work */
589retint_careful: 606retint_careful:
590 CFI_RESTORE_STATE 607 CFI_RESTORE_STATE
591 bt $TIF_NEED_RESCHED,%edx 608 bt $TIF_NEED_RESCHED,%edx
592 jnc retint_signal 609 jnc retint_signal
593 TRACE_IRQS_ON 610 TRACE_IRQS_ON
594 sti 611 ENABLE_INTERRUPTS(CLBR_NONE)
595 pushq %rdi 612 pushq %rdi
596 CFI_ADJUST_CFA_OFFSET 8 613 CFI_ADJUST_CFA_OFFSET 8
597 call schedule 614 call schedule
598 popq %rdi 615 popq %rdi
599 CFI_ADJUST_CFA_OFFSET -8 616 CFI_ADJUST_CFA_OFFSET -8
600 GET_THREAD_INFO(%rcx) 617 GET_THREAD_INFO(%rcx)
601 cli 618 DISABLE_INTERRUPTS(CLBR_NONE)
602 TRACE_IRQS_OFF 619 TRACE_IRQS_OFF
603 jmp retint_check 620 jmp retint_check
604 621
@@ -606,14 +623,14 @@ retint_signal:
606 testl $_TIF_DO_NOTIFY_MASK,%edx 623 testl $_TIF_DO_NOTIFY_MASK,%edx
607 jz retint_swapgs 624 jz retint_swapgs
608 TRACE_IRQS_ON 625 TRACE_IRQS_ON
609 sti 626 ENABLE_INTERRUPTS(CLBR_NONE)
610 SAVE_REST 627 SAVE_REST
611 movq $-1,ORIG_RAX(%rsp) 628 movq $-1,ORIG_RAX(%rsp)
612 xorl %esi,%esi # oldset 629 xorl %esi,%esi # oldset
613 movq %rsp,%rdi # &pt_regs 630 movq %rsp,%rdi # &pt_regs
614 call do_notify_resume 631 call do_notify_resume
615 RESTORE_REST 632 RESTORE_REST
616 cli 633 DISABLE_INTERRUPTS(CLBR_NONE)
617 TRACE_IRQS_OFF 634 TRACE_IRQS_OFF
618 movl $_TIF_NEED_RESCHED,%edi 635 movl $_TIF_NEED_RESCHED,%edi
619 GET_THREAD_INFO(%rcx) 636 GET_THREAD_INFO(%rcx)
@@ -731,7 +748,7 @@ END(spurious_interrupt)
731 rdmsr 748 rdmsr
732 testl %edx,%edx 749 testl %edx,%edx
733 js 1f 750 js 1f
734 swapgs 751 SWAPGS
735 xorl %ebx,%ebx 752 xorl %ebx,%ebx
7361: 7531:
737 .if \ist 754 .if \ist
@@ -747,7 +764,7 @@ END(spurious_interrupt)
747 .if \ist 764 .if \ist
748 addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) 765 addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
749 .endif 766 .endif
750 cli 767 DISABLE_INTERRUPTS(CLBR_NONE)
751 .if \irqtrace 768 .if \irqtrace
752 TRACE_IRQS_OFF 769 TRACE_IRQS_OFF
753 .endif 770 .endif
@@ -776,10 +793,10 @@ paranoid_swapgs\trace:
776 .if \trace 793 .if \trace
777 TRACE_IRQS_IRETQ 0 794 TRACE_IRQS_IRETQ 0
778 .endif 795 .endif
779 swapgs 796 SWAPGS_UNSAFE_STACK
780paranoid_restore\trace: 797paranoid_restore\trace:
781 RESTORE_ALL 8 798 RESTORE_ALL 8
782 iretq 799 INTERRUPT_RETURN
783paranoid_userspace\trace: 800paranoid_userspace\trace:
784 GET_THREAD_INFO(%rcx) 801 GET_THREAD_INFO(%rcx)
785 movl threadinfo_flags(%rcx),%ebx 802 movl threadinfo_flags(%rcx),%ebx
@@ -794,11 +811,11 @@ paranoid_userspace\trace:
794 .if \trace 811 .if \trace
795 TRACE_IRQS_ON 812 TRACE_IRQS_ON
796 .endif 813 .endif
797 sti 814 ENABLE_INTERRUPTS(CLBR_NONE)
798 xorl %esi,%esi /* arg2: oldset */ 815 xorl %esi,%esi /* arg2: oldset */
799 movq %rsp,%rdi /* arg1: &pt_regs */ 816 movq %rsp,%rdi /* arg1: &pt_regs */
800 call do_notify_resume 817 call do_notify_resume
801 cli 818 DISABLE_INTERRUPTS(CLBR_NONE)
802 .if \trace 819 .if \trace
803 TRACE_IRQS_OFF 820 TRACE_IRQS_OFF
804 .endif 821 .endif
@@ -807,9 +824,9 @@ paranoid_schedule\trace:
807 .if \trace 824 .if \trace
808 TRACE_IRQS_ON 825 TRACE_IRQS_ON
809 .endif 826 .endif
810 sti 827 ENABLE_INTERRUPTS(CLBR_ANY)
811 call schedule 828 call schedule
812 cli 829 DISABLE_INTERRUPTS(CLBR_ANY)
813 .if \trace 830 .if \trace
814 TRACE_IRQS_OFF 831 TRACE_IRQS_OFF
815 .endif 832 .endif
@@ -862,7 +879,7 @@ KPROBE_ENTRY(error_entry)
862 testl $3,CS(%rsp) 879 testl $3,CS(%rsp)
863 je error_kernelspace 880 je error_kernelspace
864error_swapgs: 881error_swapgs:
865 swapgs 882 SWAPGS
866error_sti: 883error_sti:
867 movq %rdi,RDI(%rsp) 884 movq %rdi,RDI(%rsp)
868 CFI_REL_OFFSET rdi,RDI 885 CFI_REL_OFFSET rdi,RDI
@@ -874,7 +891,7 @@ error_sti:
874error_exit: 891error_exit:
875 movl %ebx,%eax 892 movl %ebx,%eax
876 RESTORE_REST 893 RESTORE_REST
877 cli 894 DISABLE_INTERRUPTS(CLBR_NONE)
878 TRACE_IRQS_OFF 895 TRACE_IRQS_OFF
879 GET_THREAD_INFO(%rcx) 896 GET_THREAD_INFO(%rcx)
880 testl %eax,%eax 897 testl %eax,%eax
@@ -911,12 +928,12 @@ ENTRY(load_gs_index)
911 CFI_STARTPROC 928 CFI_STARTPROC
912 pushf 929 pushf
913 CFI_ADJUST_CFA_OFFSET 8 930 CFI_ADJUST_CFA_OFFSET 8
914 cli 931 DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI))
915 swapgs 932 SWAPGS
916gs_change: 933gs_change:
917 movl %edi,%gs 934 movl %edi,%gs
9182: mfence /* workaround */ 9352: mfence /* workaround */
919 swapgs 936 SWAPGS
920 popf 937 popf
921 CFI_ADJUST_CFA_OFFSET -8 938 CFI_ADJUST_CFA_OFFSET -8
922 ret 939 ret
@@ -930,7 +947,7 @@ ENDPROC(load_gs_index)
930 .section .fixup,"ax" 947 .section .fixup,"ax"
931 /* running with kernelgs */ 948 /* running with kernelgs */
932bad_gs: 949bad_gs:
933 swapgs /* switch back to user gs */ 950 SWAPGS /* switch back to user gs */
934 xorl %eax,%eax 951 xorl %eax,%eax
935 movl %eax,%gs 952 movl %eax,%gs
936 jmp 2b 953 jmp 2b
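
The entry_64.S hunks above replace the raw cli/sti/swapgs/iretq instructions with paravirt-aware macros; when CONFIG_PARAVIRT is disabled these macros are expected to collapse back to the bare instructions, so native kernels should assemble to the same code as before. A minimal sketch of the assumed non-paravirt fallbacks (simplified, not copied from the patch; the real definitions live in the asm/irqflags and asm/paravirt headers, and the CLBR_* argument is only a register-clobber hint consumed by the paravirt patching machinery):

    #ifndef CONFIG_PARAVIRT
    /* sketch of the assumed native fallbacks */
    #define SWAPGS                        swapgs
    #define SWAPGS_UNSAFE_STACK           swapgs
    #define INTERRUPT_RETURN              iretq
    #define DISABLE_INTERRUPTS(clobbers)  cli
    #define ENABLE_INTERRUPTS(clobbers)   sti
    #endif

This also explains the __ex_table change: the fixup entry now names native_iret, since that is where the real iretq instruction sits once INTERRUPT_RETURN may be redirected under paravirt.
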
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index ce703e21c912..4ae7b6440260 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -24,18 +24,11 @@
24#include <acpi/acpi_bus.h> 24#include <acpi/acpi_bus.h>
25#endif 25#endif
26 26
27/* 27/* which logical CPU number maps to which CPU (physical APIC ID) */
28 * which logical CPU number maps to which CPU (physical APIC ID) 28u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata
29 *
30 * The following static array is used during kernel startup
31 * and the x86_cpu_to_apicid_ptr contains the address of the
32 * array during this time. Is it zeroed when the per_cpu
33 * data area is removed.
34 */
35u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata
36 = { [0 ... NR_CPUS-1] = BAD_APICID }; 29 = { [0 ... NR_CPUS-1] = BAD_APICID };
37void *x86_cpu_to_apicid_ptr; 30void *x86_cpu_to_apicid_early_ptr;
38DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID; 31DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
39EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid); 32EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
40 33
41struct genapic __read_mostly *genapic = &apic_flat; 34struct genapic __read_mostly *genapic = &apic_flat;
diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c
index f12d8c5d9809..9c7f7d395968 100644
--- a/arch/x86/kernel/geode_32.c
+++ b/arch/x86/kernel/geode_32.c
@@ -1,6 +1,7 @@
1/* 1/*
2 * AMD Geode southbridge support code 2 * AMD Geode southbridge support code
3 * Copyright (C) 2006, Advanced Micro Devices, Inc. 3 * Copyright (C) 2006, Advanced Micro Devices, Inc.
4 * Copyright (C) 2007, Andres Salomon <dilinger@debian.org>
4 * 5 *
5 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of version 2 of the GNU General Public License 7 * modify it under the terms of version 2 of the GNU General Public License
@@ -51,45 +52,62 @@ EXPORT_SYMBOL_GPL(geode_get_dev_base);
51 52
52/* === GPIO API === */ 53/* === GPIO API === */
53 54
54void geode_gpio_set(unsigned int gpio, unsigned int reg) 55void geode_gpio_set(u32 gpio, unsigned int reg)
55{ 56{
56 u32 base = geode_get_dev_base(GEODE_DEV_GPIO); 57 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
57 58
58 if (!base) 59 if (!base)
59 return; 60 return;
60 61
61 if (gpio < 16) 62 /* low bank register */
62 outl(1 << gpio, base + reg); 63 if (gpio & 0xFFFF)
63 else 64 outl(gpio & 0xFFFF, base + reg);
64 outl(1 << (gpio - 16), base + 0x80 + reg); 65 /* high bank register */
66 gpio >>= 16;
67 if (gpio)
68 outl(gpio, base + 0x80 + reg);
65} 69}
66EXPORT_SYMBOL_GPL(geode_gpio_set); 70EXPORT_SYMBOL_GPL(geode_gpio_set);
67 71
68void geode_gpio_clear(unsigned int gpio, unsigned int reg) 72void geode_gpio_clear(u32 gpio, unsigned int reg)
69{ 73{
70 u32 base = geode_get_dev_base(GEODE_DEV_GPIO); 74 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
71 75
72 if (!base) 76 if (!base)
73 return; 77 return;
74 78
75 if (gpio < 16) 79 /* low bank register */
76 outl(1 << (gpio + 16), base + reg); 80 if (gpio & 0xFFFF)
77 else 81 outl((gpio & 0xFFFF) << 16, base + reg);
78 outl(1 << gpio, base + 0x80 + reg); 82 /* high bank register */
83 gpio &= (0xFFFF << 16);
84 if (gpio)
85 outl(gpio, base + 0x80 + reg);
79} 86}
80EXPORT_SYMBOL_GPL(geode_gpio_clear); 87EXPORT_SYMBOL_GPL(geode_gpio_clear);
81 88
82int geode_gpio_isset(unsigned int gpio, unsigned int reg) 89int geode_gpio_isset(u32 gpio, unsigned int reg)
83{ 90{
84 u32 base = geode_get_dev_base(GEODE_DEV_GPIO); 91 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
92 u32 val;
85 93
86 if (!base) 94 if (!base)
87 return 0; 95 return 0;
88 96
89 if (gpio < 16) 97 /* low bank register */
90 return (inl(base + reg) & (1 << gpio)) ? 1 : 0; 98 if (gpio & 0xFFFF) {
91 else 99 val = inl(base + reg) & (gpio & 0xFFFF);
92 return (inl(base + 0x80 + reg) & (1 << (gpio - 16))) ? 1 : 0; 100 if ((gpio & 0xFFFF) == val)
101 return 1;
102 }
103 /* high bank register */
104 gpio >>= 16;
105 if (gpio) {
106 val = inl(base + 0x80 + reg) & gpio;
107 if (gpio == val)
108 return 1;
109 }
110 return 0;
93} 111}
94EXPORT_SYMBOL_GPL(geode_gpio_isset); 112EXPORT_SYMBOL_GPL(geode_gpio_isset);
95 113
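
Note the interface change in the three Geode GPIO helpers above: they now take a bit mask, which may span both the low and high 16-GPIO banks, instead of a single GPIO index, so one call can touch several lines at once. A hypothetical caller updated for the new convention (geode_gpio() and GPIO_OUTPUT_VAL are assumed helpers/register offsets from the Geode header, not part of this hunk):

    /* New style: pass a mask; GPIO 24 lands in the high-bank register. */
    geode_gpio_set(geode_gpio(5) | geode_gpio(24), GPIO_OUTPUT_VAL);

    /* Old style needed one index per call:
     *   geode_gpio_set(5,  GPIO_OUTPUT_VAL);
     *   geode_gpio_set(24, GPIO_OUTPUT_VAL);
     */
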
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 6b3469311e42..a317336cdeaa 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -10,6 +10,7 @@
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/percpu.h> 12#include <linux/percpu.h>
13#include <linux/start_kernel.h>
13 14
14#include <asm/processor.h> 15#include <asm/processor.h>
15#include <asm/proto.h> 16#include <asm/proto.h>
@@ -19,12 +20,14 @@
19#include <asm/pgtable.h> 20#include <asm/pgtable.h>
20#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
21#include <asm/sections.h> 22#include <asm/sections.h>
23#include <asm/kdebug.h>
24#include <asm/e820.h>
22 25
23static void __init zap_identity_mappings(void) 26static void __init zap_identity_mappings(void)
24{ 27{
25 pgd_t *pgd = pgd_offset_k(0UL); 28 pgd_t *pgd = pgd_offset_k(0UL);
26 pgd_clear(pgd); 29 pgd_clear(pgd);
27 __flush_tlb(); 30 __flush_tlb_all();
28} 31}
29 32
30/* Don't add a printk in there. printk relies on the PDA which is not initialized 33/* Don't add a printk in there. printk relies on the PDA which is not initialized
@@ -46,6 +49,35 @@ static void __init copy_bootdata(char *real_mode_data)
46 } 49 }
47} 50}
48 51
52#define EBDA_ADDR_POINTER 0x40E
53
54static __init void reserve_ebda(void)
55{
56 unsigned ebda_addr, ebda_size;
57
58 /*
59 * there is a real-mode segmented pointer pointing to the
60 * 4K EBDA area at 0x40E
61 */
62 ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
63 ebda_addr <<= 4;
64
65 if (!ebda_addr)
66 return;
67
68 ebda_size = *(unsigned short *)__va(ebda_addr);
69
70 /* Round EBDA up to pages */
71 if (ebda_size == 0)
72 ebda_size = 1;
73 ebda_size <<= 10;
74 ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
75 if (ebda_size > 64*1024)
76 ebda_size = 64*1024;
77
78 reserve_early(ebda_addr, ebda_addr + ebda_size);
79}
80
49void __init x86_64_start_kernel(char * real_mode_data) 81void __init x86_64_start_kernel(char * real_mode_data)
50{ 82{
51 int i; 83 int i;
@@ -56,8 +88,13 @@ void __init x86_64_start_kernel(char * real_mode_data)
56 /* Make NULL pointers segfault */ 88 /* Make NULL pointers segfault */
57 zap_identity_mappings(); 89 zap_identity_mappings();
58 90
59 for (i = 0; i < IDT_ENTRIES; i++) 91 for (i = 0; i < IDT_ENTRIES; i++) {
92#ifdef CONFIG_EARLY_PRINTK
93 set_intr_gate(i, &early_idt_handlers[i]);
94#else
60 set_intr_gate(i, early_idt_handler); 95 set_intr_gate(i, early_idt_handler);
96#endif
97 }
61 load_idt((const struct desc_ptr *)&idt_descr); 98 load_idt((const struct desc_ptr *)&idt_descr);
62 99
63 early_printk("Kernel alive\n"); 100 early_printk("Kernel alive\n");
@@ -67,8 +104,24 @@ void __init x86_64_start_kernel(char * real_mode_data)
67 104
68 pda_init(0); 105 pda_init(0);
69 copy_bootdata(__va(real_mode_data)); 106 copy_bootdata(__va(real_mode_data));
70#ifdef CONFIG_SMP 107
71 cpu_set(0, cpu_online_map); 108 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end));
72#endif 109
110 /* Reserve INITRD */
111 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
112 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
113 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
114 unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
115 reserve_early(ramdisk_image, ramdisk_end);
116 }
117
118 reserve_ebda();
119
120 /*
121 * At this point everything still needed from the boot loader
122 * or BIOS or kernel text should be early reserved or marked not
123 * RAM in e820. All other memory is free game.
124 */
125
73 start_kernel(); 126 start_kernel();
74} 127}
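
To make the reserve_ebda() arithmetic concrete with illustrative numbers (not taken from the patch): if the real-mode word at 0x40E holds 0x9F00, then ebda_addr = 0x9F00 << 4 = 0x9F000; if the first word of the EBDA reports 4 (KiB), ebda_size becomes 4 << 10 = 4096 bytes, which is already page-aligned and under the 64 KiB clamp, so the call reduces to reserve_early(0x9F000, 0xA0000). Together with the reservations for the kernel image and the initrd, this marks everything the early boot path still depends on before the e820-based allocator takes over.
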
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index fbad51fce672..5d8c5730686b 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -9,6 +9,7 @@
9 9
10.text 10.text
11#include <linux/threads.h> 11#include <linux/threads.h>
12#include <linux/init.h>
12#include <linux/linkage.h> 13#include <linux/linkage.h>
13#include <asm/segment.h> 14#include <asm/segment.h>
14#include <asm/page.h> 15#include <asm/page.h>
@@ -151,7 +152,9 @@ WEAK(xen_entry)
151 /* Unknown implementation; there's really 152 /* Unknown implementation; there's really
152 nothing we can do at this point. */ 153 nothing we can do at this point. */
153 ud2a 154 ud2a
154.data 155
156 __INITDATA
157
155subarch_entries: 158subarch_entries:
156 .long default_entry /* normal x86/PC */ 159 .long default_entry /* normal x86/PC */
157 .long lguest_entry /* lguest hypervisor */ 160 .long lguest_entry /* lguest hypervisor */
@@ -199,7 +202,6 @@ default_entry:
199 addl $0x67, %eax /* 0x67 == _PAGE_TABLE */ 202 addl $0x67, %eax /* 0x67 == _PAGE_TABLE */
200 movl %eax, 4092(%edx) 203 movl %eax, 4092(%edx)
201 204
202 xorl %ebx,%ebx /* This is the boot CPU (BSP) */
203 jmp 3f 205 jmp 3f
204/* 206/*
205 * Non-boot CPU entry point; entered from trampoline.S 207 * Non-boot CPU entry point; entered from trampoline.S
@@ -222,6 +224,8 @@ ENTRY(startup_32_smp)
222 movl %eax,%es 224 movl %eax,%es
223 movl %eax,%fs 225 movl %eax,%fs
224 movl %eax,%gs 226 movl %eax,%gs
227#endif /* CONFIG_SMP */
2283:
225 229
226/* 230/*
227 * New page tables may be in 4Mbyte page mode and may 231 * New page tables may be in 4Mbyte page mode and may
@@ -268,12 +272,6 @@ ENTRY(startup_32_smp)
268 wrmsr 272 wrmsr
269 273
2706: 2746:
271 /* This is a secondary processor (AP) */
272 xorl %ebx,%ebx
273 incl %ebx
274
275#endif /* CONFIG_SMP */
2763:
277 275
278/* 276/*
279 * Enable paging 277 * Enable paging
@@ -297,7 +295,7 @@ ENTRY(startup_32_smp)
297 popfl 295 popfl
298 296
299#ifdef CONFIG_SMP 297#ifdef CONFIG_SMP
300 andl %ebx,%ebx 298 cmpb $0, ready
301 jz 1f /* Initial CPU cleans BSS */ 299 jz 1f /* Initial CPU cleans BSS */
302 jmp checkCPUtype 300 jmp checkCPUtype
3031: 3011:
@@ -502,6 +500,7 @@ early_fault:
502 call printk 500 call printk
503#endif 501#endif
504#endif 502#endif
503 call dump_stack
505hlt_loop: 504hlt_loop:
506 hlt 505 hlt
507 jmp hlt_loop 506 jmp hlt_loop
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b6167fe3330e..1d5a7a361200 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -19,6 +19,13 @@
19#include <asm/msr.h> 19#include <asm/msr.h>
20#include <asm/cache.h> 20#include <asm/cache.h>
21 21
22#ifdef CONFIG_PARAVIRT
23#include <asm/asm-offsets.h>
24#include <asm/paravirt.h>
25#else
26#define GET_CR2_INTO_RCX movq %cr2, %rcx
27#endif
28
22/* we are not able to switch in one step to the final KERNEL ADRESS SPACE 29/* we are not able to switch in one step to the final KERNEL ADRESS SPACE
23 * because we need identity-mapped pages. 30 * because we need identity-mapped pages.
24 * 31 *
@@ -260,14 +267,43 @@ init_rsp:
260bad_address: 267bad_address:
261 jmp bad_address 268 jmp bad_address
262 269
270#ifdef CONFIG_EARLY_PRINTK
271.macro early_idt_tramp first, last
272 .ifgt \last-\first
273 early_idt_tramp \first, \last-1
274 .endif
275 movl $\last,%esi
276 jmp early_idt_handler
277.endm
278
279 .globl early_idt_handlers
280early_idt_handlers:
281 early_idt_tramp 0, 63
282 early_idt_tramp 64, 127
283 early_idt_tramp 128, 191
284 early_idt_tramp 192, 255
285#endif
286
263ENTRY(early_idt_handler) 287ENTRY(early_idt_handler)
288#ifdef CONFIG_EARLY_PRINTK
264 cmpl $2,early_recursion_flag(%rip) 289 cmpl $2,early_recursion_flag(%rip)
265 jz 1f 290 jz 1f
266 incl early_recursion_flag(%rip) 291 incl early_recursion_flag(%rip)
292 GET_CR2_INTO_RCX
293 movq %rcx,%r9
294 xorl %r8d,%r8d # zero for error code
295 movl %esi,%ecx # get vector number
296 # Test %ecx against mask of vectors that push error code.
297 cmpl $31,%ecx
298 ja 0f
299 movl $1,%eax
300 salq %cl,%rax
301 testl $0x27d00,%eax
302 je 0f
303 popq %r8 # get error code
3040: movq 0(%rsp),%rcx # get ip
305 movq 8(%rsp),%rdx # get cs
267 xorl %eax,%eax 306 xorl %eax,%eax
268 movq 8(%rsp),%rsi # get rip
269 movq (%rsp),%rdx
270 movq %cr2,%rcx
271 leaq early_idt_msg(%rip),%rdi 307 leaq early_idt_msg(%rip),%rdi
272 call early_printk 308 call early_printk
273 cmpl $2,early_recursion_flag(%rip) 309 cmpl $2,early_recursion_flag(%rip)
@@ -278,15 +314,19 @@ ENTRY(early_idt_handler)
278 movq 8(%rsp),%rsi # get rip again 314 movq 8(%rsp),%rsi # get rip again
279 call __print_symbol 315 call __print_symbol
280#endif 316#endif
317#endif /* EARLY_PRINTK */
2811: hlt 3181: hlt
282 jmp 1b 319 jmp 1b
320
321#ifdef CONFIG_EARLY_PRINTK
283early_recursion_flag: 322early_recursion_flag:
284 .long 0 323 .long 0
285 324
286early_idt_msg: 325early_idt_msg:
287 .asciz "PANIC: early exception rip %lx error %lx cr2 %lx\n" 326 .asciz "PANIC: early exception %02lx rip %lx:%lx error %lx cr2 %lx\n"
288early_idt_ripmsg: 327early_idt_ripmsg:
289 .asciz "RIP %s\n" 328 .asciz "RIP %s\n"
329#endif /* CONFIG_EARLY_PRINTK */
290 330
291.balign PAGE_SIZE 331.balign PAGE_SIZE
292 332
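
The magic constant 0x27d00 tested in the rewritten early_idt_handler is a bit mask of the exception vectors that push an error code: 0x100 + 0x400 + 0x1000 + 0x2000 + 0x4000 + 0x800 + 0x20000 = 0x27d00, i.e. vectors 8 (#DF), 10 (#TS), 11 (#NP), 12 (#SS), 13 (#GP), 14 (#PF) and 17 (#AC). Only for those does the handler pop the error code into %r8; every other vector reports an error value of 0. The per-vector trampolines generated by the recursive early_idt_tramp macro exist solely to load the vector number into %esi before jumping to the common handler, which is what lets the new panic message print the vector and faulting cs:rip.
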
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 2f99ee206b95..429d084e014d 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -6,7 +6,6 @@
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/sysdev.h> 7#include <linux/sysdev.h>
8#include <linux/pm.h> 8#include <linux/pm.h>
9#include <linux/delay.h>
10 9
11#include <asm/fixmap.h> 10#include <asm/fixmap.h>
12#include <asm/hpet.h> 11#include <asm/hpet.h>
@@ -16,7 +15,8 @@
16#define HPET_MASK CLOCKSOURCE_MASK(32) 15#define HPET_MASK CLOCKSOURCE_MASK(32)
17#define HPET_SHIFT 22 16#define HPET_SHIFT 22
18 17
19/* FSEC = 10^-15 NSEC = 10^-9 */ 18/* FSEC = 10^-15
19 NSEC = 10^-9 */
20#define FSEC_PER_NSEC 1000000 20#define FSEC_PER_NSEC 1000000
21 21
22/* 22/*
@@ -107,6 +107,7 @@ int is_hpet_enabled(void)
107{ 107{
108 return is_hpet_capable() && hpet_legacy_int_enabled; 108 return is_hpet_capable() && hpet_legacy_int_enabled;
109} 109}
110EXPORT_SYMBOL_GPL(is_hpet_enabled);
110 111
111/* 112/*
112 * When the hpet driver (/dev/hpet) is enabled, we need to reserve 113 * When the hpet driver (/dev/hpet) is enabled, we need to reserve
@@ -132,16 +133,13 @@ static void hpet_reserve_platform_timers(unsigned long id)
132#ifdef CONFIG_HPET_EMULATE_RTC 133#ifdef CONFIG_HPET_EMULATE_RTC
133 hpet_reserve_timer(&hd, 1); 134 hpet_reserve_timer(&hd, 1);
134#endif 135#endif
135
136 hd.hd_irq[0] = HPET_LEGACY_8254; 136 hd.hd_irq[0] = HPET_LEGACY_8254;
137 hd.hd_irq[1] = HPET_LEGACY_RTC; 137 hd.hd_irq[1] = HPET_LEGACY_RTC;
138 138
139 for (i = 2; i < nrtimers; timer++, i++) 139 for (i = 2; i < nrtimers; timer++, i++)
140 hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >> 140 hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >>
141 Tn_INT_ROUTE_CNF_SHIFT; 141 Tn_INT_ROUTE_CNF_SHIFT;
142
143 hpet_alloc(&hd); 142 hpet_alloc(&hd);
144
145} 143}
146#else 144#else
147static void hpet_reserve_platform_timers(unsigned long id) { } 145static void hpet_reserve_platform_timers(unsigned long id) { }
@@ -478,6 +476,7 @@ void hpet_disable(void)
478 */ 476 */
479#include <linux/mc146818rtc.h> 477#include <linux/mc146818rtc.h>
480#include <linux/rtc.h> 478#include <linux/rtc.h>
479#include <asm/rtc.h>
481 480
482#define DEFAULT_RTC_INT_FREQ 64 481#define DEFAULT_RTC_INT_FREQ 64
483#define DEFAULT_RTC_SHIFT 6 482#define DEFAULT_RTC_SHIFT 6
@@ -492,6 +491,38 @@ static unsigned long hpet_default_delta;
492static unsigned long hpet_pie_delta; 491static unsigned long hpet_pie_delta;
493static unsigned long hpet_pie_limit; 492static unsigned long hpet_pie_limit;
494 493
494static rtc_irq_handler irq_handler;
495
496/*
 497 * Registers an IRQ handler.
498 */
499int hpet_register_irq_handler(rtc_irq_handler handler)
500{
501 if (!is_hpet_enabled())
502 return -ENODEV;
503 if (irq_handler)
504 return -EBUSY;
505
506 irq_handler = handler;
507
508 return 0;
509}
510EXPORT_SYMBOL_GPL(hpet_register_irq_handler);
511
512/*
513 * Deregisters the IRQ handler registered with hpet_register_irq_handler()
514 * and does cleanup.
515 */
516void hpet_unregister_irq_handler(rtc_irq_handler handler)
517{
518 if (!is_hpet_enabled())
519 return;
520
521 irq_handler = NULL;
522 hpet_rtc_flags = 0;
523}
524EXPORT_SYMBOL_GPL(hpet_unregister_irq_handler);
525
495/* 526/*
496 * Timer 1 for RTC emulation. We use one shot mode, as periodic mode 527 * Timer 1 for RTC emulation. We use one shot mode, as periodic mode
497 * is not supported by all HPET implementations for timer 1. 528 * is not supported by all HPET implementations for timer 1.
@@ -533,6 +564,7 @@ int hpet_rtc_timer_init(void)
533 564
534 return 1; 565 return 1;
535} 566}
567EXPORT_SYMBOL_GPL(hpet_rtc_timer_init);
536 568
537/* 569/*
538 * The functions below are called from rtc driver. 570 * The functions below are called from rtc driver.
@@ -547,6 +579,7 @@ int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
547 hpet_rtc_flags &= ~bit_mask; 579 hpet_rtc_flags &= ~bit_mask;
548 return 1; 580 return 1;
549} 581}
582EXPORT_SYMBOL_GPL(hpet_mask_rtc_irq_bit);
550 583
551int hpet_set_rtc_irq_bit(unsigned long bit_mask) 584int hpet_set_rtc_irq_bit(unsigned long bit_mask)
552{ 585{
@@ -562,6 +595,7 @@ int hpet_set_rtc_irq_bit(unsigned long bit_mask)
562 595
563 return 1; 596 return 1;
564} 597}
598EXPORT_SYMBOL_GPL(hpet_set_rtc_irq_bit);
565 599
566int hpet_set_alarm_time(unsigned char hrs, unsigned char min, 600int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
567 unsigned char sec) 601 unsigned char sec)
@@ -575,6 +609,7 @@ int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
575 609
576 return 1; 610 return 1;
577} 611}
612EXPORT_SYMBOL_GPL(hpet_set_alarm_time);
578 613
579int hpet_set_periodic_freq(unsigned long freq) 614int hpet_set_periodic_freq(unsigned long freq)
580{ 615{
@@ -593,11 +628,13 @@ int hpet_set_periodic_freq(unsigned long freq)
593 } 628 }
594 return 1; 629 return 1;
595} 630}
631EXPORT_SYMBOL_GPL(hpet_set_periodic_freq);
596 632
597int hpet_rtc_dropped_irq(void) 633int hpet_rtc_dropped_irq(void)
598{ 634{
599 return is_hpet_enabled(); 635 return is_hpet_enabled();
600} 636}
637EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq);
601 638
602static void hpet_rtc_timer_reinit(void) 639static void hpet_rtc_timer_reinit(void)
603{ 640{
@@ -641,9 +678,10 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
641 unsigned long rtc_int_flag = 0; 678 unsigned long rtc_int_flag = 0;
642 679
643 hpet_rtc_timer_reinit(); 680 hpet_rtc_timer_reinit();
681 memset(&curr_time, 0, sizeof(struct rtc_time));
644 682
645 if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) 683 if (hpet_rtc_flags & (RTC_UIE | RTC_AIE))
646 rtc_get_rtc_time(&curr_time); 684 get_rtc_time(&curr_time);
647 685
648 if (hpet_rtc_flags & RTC_UIE && 686 if (hpet_rtc_flags & RTC_UIE &&
649 curr_time.tm_sec != hpet_prev_update_sec) { 687 curr_time.tm_sec != hpet_prev_update_sec) {
@@ -665,8 +703,10 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
665 703
666 if (rtc_int_flag) { 704 if (rtc_int_flag) {
667 rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); 705 rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
668 rtc_interrupt(rtc_int_flag, dev_id); 706 if (irq_handler)
707 irq_handler(rtc_int_flag, dev_id);
669 } 708 }
670 return IRQ_HANDLED; 709 return IRQ_HANDLED;
671} 710}
711EXPORT_SYMBOL_GPL(hpet_rtc_interrupt);
672#endif 712#endif
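
The new hpet_register_irq_handler()/hpet_unregister_irq_handler() pair decouples the HPET RTC emulation from a hard-coded call into the CMOS RTC driver: hpet_rtc_interrupt() now forwards events to whatever handler was registered. A minimal usage sketch from a hypothetical RTC driver (the function names and init/teardown context are illustrative, not taken from the patch):

    static irqreturn_t example_rtc_interrupt(int irq, void *dev_id)
    {
            /* handle the RTC_UIE/RTC_AIE/RTC_PIE events relayed via HPET timer 1 */
            return IRQ_HANDLED;
    }

    static int example_rtc_setup(void)
    {
            int ret = hpet_register_irq_handler(example_rtc_interrupt);

            if (ret)        /* -ENODEV without a usable HPET, -EBUSY if already claimed */
                    return ret;
            hpet_rtc_timer_init();
            return 0;
    }

    static void example_rtc_teardown(void)
    {
            hpet_unregister_irq_handler(example_rtc_interrupt);
    }
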
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index 02112fcc0de7..061627806a2d 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -22,12 +22,5 @@ EXPORT_SYMBOL(__put_user_8);
22 22
23EXPORT_SYMBOL(strstr); 23EXPORT_SYMBOL(strstr);
24 24
25#ifdef CONFIG_SMP
26extern void FASTCALL( __write_lock_failed(rwlock_t *rw));
27extern void FASTCALL( __read_lock_failed(rwlock_t *rw));
28EXPORT_SYMBOL(__write_lock_failed);
29EXPORT_SYMBOL(__read_lock_failed);
30#endif
31
32EXPORT_SYMBOL(csum_partial); 25EXPORT_SYMBOL(csum_partial);
33EXPORT_SYMBOL(empty_zero_page); 26EXPORT_SYMBOL(empty_zero_page);
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
new file mode 100644
index 000000000000..26719bd2c77c
--- /dev/null
+++ b/arch/x86/kernel/i387.c
@@ -0,0 +1,479 @@
1/*
2 * Copyright (C) 1994 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * General FPU state handling cleanups
6 * Gareth Hughes <gareth@valinux.com>, May 2000
7 */
8
9#include <linux/sched.h>
10#include <linux/module.h>
11#include <linux/regset.h>
12#include <asm/processor.h>
13#include <asm/i387.h>
14#include <asm/math_emu.h>
15#include <asm/sigcontext.h>
16#include <asm/user.h>
17#include <asm/ptrace.h>
18#include <asm/uaccess.h>
19
20#ifdef CONFIG_X86_64
21
22#include <asm/sigcontext32.h>
23#include <asm/user32.h>
24
25#else
26
27#define save_i387_ia32 save_i387
28#define restore_i387_ia32 restore_i387
29
30#define _fpstate_ia32 _fpstate
31#define user_i387_ia32_struct user_i387_struct
32#define user32_fxsr_struct user_fxsr_struct
33
34#endif
35
36#ifdef CONFIG_MATH_EMULATION
37#define HAVE_HWFP (boot_cpu_data.hard_math)
38#else
39#define HAVE_HWFP 1
40#endif
41
42unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
43
44void mxcsr_feature_mask_init(void)
45{
46 unsigned long mask = 0;
47 clts();
48 if (cpu_has_fxsr) {
49 memset(&current->thread.i387.fxsave, 0,
50 sizeof(struct i387_fxsave_struct));
51 asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave));
52 mask = current->thread.i387.fxsave.mxcsr_mask;
53 if (mask == 0)
54 mask = 0x0000ffbf;
55 }
56 mxcsr_feature_mask &= mask;
57 stts();
58}
59
60#ifdef CONFIG_X86_64
61/*
62 * Called at bootup to set up the initial FPU state that is later cloned
63 * into all processes.
64 */
65void __cpuinit fpu_init(void)
66{
67 unsigned long oldcr0 = read_cr0();
68 extern void __bad_fxsave_alignment(void);
69
70 if (offsetof(struct task_struct, thread.i387.fxsave) & 15)
71 __bad_fxsave_alignment();
72 set_in_cr4(X86_CR4_OSFXSR);
73 set_in_cr4(X86_CR4_OSXMMEXCPT);
74
75 write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */
76
77 mxcsr_feature_mask_init();
78 /* clean state in init */
79 current_thread_info()->status = 0;
80 clear_used_math();
81}
82#endif /* CONFIG_X86_64 */
83
84/*
85 * The _current_ task is using the FPU for the first time
86 * so initialize it and set the mxcsr to its default
87 * value at reset if we support XMM instructions and then
 88 * remember the current task has used the FPU.
89 */
90void init_fpu(struct task_struct *tsk)
91{
92 if (tsk_used_math(tsk)) {
93 if (tsk == current)
94 unlazy_fpu(tsk);
95 return;
96 }
97
98 if (cpu_has_fxsr) {
99 memset(&tsk->thread.i387.fxsave, 0,
100 sizeof(struct i387_fxsave_struct));
101 tsk->thread.i387.fxsave.cwd = 0x37f;
102 if (cpu_has_xmm)
103 tsk->thread.i387.fxsave.mxcsr = MXCSR_DEFAULT;
104 } else {
105 memset(&tsk->thread.i387.fsave, 0,
106 sizeof(struct i387_fsave_struct));
107 tsk->thread.i387.fsave.cwd = 0xffff037fu;
108 tsk->thread.i387.fsave.swd = 0xffff0000u;
109 tsk->thread.i387.fsave.twd = 0xffffffffu;
110 tsk->thread.i387.fsave.fos = 0xffff0000u;
111 }
112 /*
113 * Only the device not available exception or ptrace can call init_fpu.
114 */
115 set_stopped_child_used_math(tsk);
116}
117
118int fpregs_active(struct task_struct *target, const struct user_regset *regset)
119{
120 return tsk_used_math(target) ? regset->n : 0;
121}
122
123int xfpregs_active(struct task_struct *target, const struct user_regset *regset)
124{
125 return (cpu_has_fxsr && tsk_used_math(target)) ? regset->n : 0;
126}
127
128int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
129 unsigned int pos, unsigned int count,
130 void *kbuf, void __user *ubuf)
131{
132 if (!cpu_has_fxsr)
133 return -ENODEV;
134
135 unlazy_fpu(target);
136
137 return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
138 &target->thread.i387.fxsave, 0, -1);
139}
140
141int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
142 unsigned int pos, unsigned int count,
143 const void *kbuf, const void __user *ubuf)
144{
145 int ret;
146
147 if (!cpu_has_fxsr)
148 return -ENODEV;
149
150 unlazy_fpu(target);
151 set_stopped_child_used_math(target);
152
153 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
154 &target->thread.i387.fxsave, 0, -1);
155
156 /*
157 * mxcsr reserved bits must be masked to zero for security reasons.
158 */
159 target->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
160
161 return ret;
162}
163
164#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
165
166/*
167 * FPU tag word conversions.
168 */
169
170static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
171{
172 unsigned int tmp; /* to avoid 16 bit prefixes in the code */
173
174 /* Transform each pair of bits into 01 (valid) or 00 (empty) */
175 tmp = ~twd;
176 tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
177 /* and move the valid bits to the lower byte. */
178 tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
179 tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
180 tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
181 return tmp;
182}
183
184#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16);
185#define FP_EXP_TAG_VALID 0
186#define FP_EXP_TAG_ZERO 1
187#define FP_EXP_TAG_SPECIAL 2
188#define FP_EXP_TAG_EMPTY 3
189
190static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave)
191{
192 struct _fpxreg *st;
193 u32 tos = (fxsave->swd >> 11) & 7;
194 u32 twd = (unsigned long) fxsave->twd;
195 u32 tag;
196 u32 ret = 0xffff0000u;
197 int i;
198
199 for (i = 0; i < 8; i++, twd >>= 1) {
200 if (twd & 0x1) {
201 st = FPREG_ADDR(fxsave, (i - tos) & 7);
202
203 switch (st->exponent & 0x7fff) {
204 case 0x7fff:
205 tag = FP_EXP_TAG_SPECIAL;
206 break;
207 case 0x0000:
208 if (!st->significand[0] &&
209 !st->significand[1] &&
210 !st->significand[2] &&
211 !st->significand[3])
212 tag = FP_EXP_TAG_ZERO;
213 else
214 tag = FP_EXP_TAG_SPECIAL;
215 break;
216 default:
217 if (st->significand[3] & 0x8000)
218 tag = FP_EXP_TAG_VALID;
219 else
220 tag = FP_EXP_TAG_SPECIAL;
221 break;
222 }
223 } else {
224 tag = FP_EXP_TAG_EMPTY;
225 }
226 ret |= tag << (2 * i);
227 }
228 return ret;
229}
230
231/*
232 * FXSR floating point environment conversions.
233 */
234
235static void convert_from_fxsr(struct user_i387_ia32_struct *env,
236 struct task_struct *tsk)
237{
238 struct i387_fxsave_struct *fxsave = &tsk->thread.i387.fxsave;
239 struct _fpreg *to = (struct _fpreg *) &env->st_space[0];
240 struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0];
241 int i;
242
243 env->cwd = fxsave->cwd | 0xffff0000u;
244 env->swd = fxsave->swd | 0xffff0000u;
245 env->twd = twd_fxsr_to_i387(fxsave);
246
247#ifdef CONFIG_X86_64
248 env->fip = fxsave->rip;
249 env->foo = fxsave->rdp;
250 if (tsk == current) {
251 /*
252 * should be actually ds/cs at fpu exception time, but
253 * that information is not available in 64bit mode.
254 */
255 asm("mov %%ds,%0" : "=r" (env->fos));
256 asm("mov %%cs,%0" : "=r" (env->fcs));
257 } else {
258 struct pt_regs *regs = task_pt_regs(tsk);
259 env->fos = 0xffff0000 | tsk->thread.ds;
260 env->fcs = regs->cs;
261 }
262#else
263 env->fip = fxsave->fip;
264 env->fcs = fxsave->fcs;
265 env->foo = fxsave->foo;
266 env->fos = fxsave->fos;
267#endif
268
269 for (i = 0; i < 8; ++i)
270 memcpy(&to[i], &from[i], sizeof(to[0]));
271}
272
273static void convert_to_fxsr(struct task_struct *tsk,
274 const struct user_i387_ia32_struct *env)
275
276{
277 struct i387_fxsave_struct *fxsave = &tsk->thread.i387.fxsave;
278 struct _fpreg *from = (struct _fpreg *) &env->st_space[0];
279 struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0];
280 int i;
281
282 fxsave->cwd = env->cwd;
283 fxsave->swd = env->swd;
284 fxsave->twd = twd_i387_to_fxsr(env->twd);
285 fxsave->fop = (u16) ((u32) env->fcs >> 16);
286#ifdef CONFIG_X86_64
287 fxsave->rip = env->fip;
288 fxsave->rdp = env->foo;
289 /* cs and ds ignored */
290#else
291 fxsave->fip = env->fip;
292 fxsave->fcs = (env->fcs & 0xffff);
293 fxsave->foo = env->foo;
294 fxsave->fos = env->fos;
295#endif
296
297 for (i = 0; i < 8; ++i)
298 memcpy(&to[i], &from[i], sizeof(from[0]));
299}
300
301int fpregs_get(struct task_struct *target, const struct user_regset *regset,
302 unsigned int pos, unsigned int count,
303 void *kbuf, void __user *ubuf)
304{
305 struct user_i387_ia32_struct env;
306
307 if (!HAVE_HWFP)
308 return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf);
309
310 unlazy_fpu(target);
311
312 if (!cpu_has_fxsr)
313 return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
314 &target->thread.i387.fsave, 0, -1);
315
316 if (kbuf && pos == 0 && count == sizeof(env)) {
317 convert_from_fxsr(kbuf, target);
318 return 0;
319 }
320
321 convert_from_fxsr(&env, target);
322 return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
323}
324
325int fpregs_set(struct task_struct *target, const struct user_regset *regset,
326 unsigned int pos, unsigned int count,
327 const void *kbuf, const void __user *ubuf)
328{
329 struct user_i387_ia32_struct env;
330 int ret;
331
332 if (!HAVE_HWFP)
333 return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
334
335 unlazy_fpu(target);
336 set_stopped_child_used_math(target);
337
338 if (!cpu_has_fxsr)
339 return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
340 &target->thread.i387.fsave, 0, -1);
341
342 if (pos > 0 || count < sizeof(env))
343 convert_from_fxsr(&env, target);
344
345 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
346 if (!ret)
347 convert_to_fxsr(target, &env);
348
349 return ret;
350}
351
352/*
353 * Signal frame handlers.
354 */
355
356static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
357{
358 struct task_struct *tsk = current;
359
360 unlazy_fpu(tsk);
361 tsk->thread.i387.fsave.status = tsk->thread.i387.fsave.swd;
362 if (__copy_to_user(buf, &tsk->thread.i387.fsave,
363 sizeof(struct i387_fsave_struct)))
364 return -1;
365 return 1;
366}
367
368static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
369{
370 struct task_struct *tsk = current;
371 struct user_i387_ia32_struct env;
372 int err = 0;
373
374 unlazy_fpu(tsk);
375
376 convert_from_fxsr(&env, tsk);
377 if (__copy_to_user(buf, &env, sizeof(env)))
378 return -1;
379
380 err |= __put_user(tsk->thread.i387.fxsave.swd, &buf->status);
381 err |= __put_user(X86_FXSR_MAGIC, &buf->magic);
382 if (err)
383 return -1;
384
385 if (__copy_to_user(&buf->_fxsr_env[0], &tsk->thread.i387.fxsave,
386 sizeof(struct i387_fxsave_struct)))
387 return -1;
388 return 1;
389}
390
391int save_i387_ia32(struct _fpstate_ia32 __user *buf)
392{
393 if (!used_math())
394 return 0;
395
396 /* This will cause a "finit" to be triggered by the next
397 * attempted FPU operation by the 'current' process.
398 */
399 clear_used_math();
400
401 if (HAVE_HWFP) {
402 if (cpu_has_fxsr) {
403 return save_i387_fxsave(buf);
404 } else {
405 return save_i387_fsave(buf);
406 }
407 } else {
408 return fpregs_soft_get(current, NULL,
409 0, sizeof(struct user_i387_ia32_struct),
410 NULL, buf) ? -1 : 1;
411 }
412}
413
414static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
415{
416 struct task_struct *tsk = current;
417 clear_fpu(tsk);
418 return __copy_from_user(&tsk->thread.i387.fsave, buf,
419 sizeof(struct i387_fsave_struct));
420}
421
422static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf)
423{
424 int err;
425 struct task_struct *tsk = current;
426 struct user_i387_ia32_struct env;
427 clear_fpu(tsk);
428 err = __copy_from_user(&tsk->thread.i387.fxsave, &buf->_fxsr_env[0],
429 sizeof(struct i387_fxsave_struct));
430 /* mxcsr reserved bits must be masked to zero for security reasons */
431 tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
432 if (err || __copy_from_user(&env, buf, sizeof(env)))
433 return 1;
434 convert_to_fxsr(tsk, &env);
435 return 0;
436}
437
438int restore_i387_ia32(struct _fpstate_ia32 __user *buf)
439{
440 int err;
441
442 if (HAVE_HWFP) {
443 if (cpu_has_fxsr) {
444 err = restore_i387_fxsave(buf);
445 } else {
446 err = restore_i387_fsave(buf);
447 }
448 } else {
449 err = fpregs_soft_set(current, NULL,
450 0, sizeof(struct user_i387_ia32_struct),
451 NULL, buf) != 0;
452 }
453 set_used_math();
454 return err;
455}
456
457/*
458 * FPU state for core dumps.
459 * This is only used for a.out dumps now.
460 * It is declared generically using elf_fpregset_t (which is
461 * struct user_i387_struct) but is in fact only used for 32-bit
462 * dumps, so on 64-bit it is really struct user_i387_ia32_struct.
463 */
464int dump_fpu(struct pt_regs *regs, struct user_i387_struct *fpu)
465{
466 int fpvalid;
467 struct task_struct *tsk = current;
468
469 fpvalid = !!used_math();
470 if (fpvalid)
471 fpvalid = !fpregs_get(tsk, NULL,
472 0, sizeof(struct user_i387_ia32_struct),
473 fpu, NULL);
474
475 return fpvalid;
476}
477EXPORT_SYMBOL(dump_fpu);
478
479#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */
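
As a quick check of twd_i387_to_fxsr() above: if only ST slot 0 is in use (i387 tag 00, valid) and the remaining seven slots are empty (tag 11), the 16-bit tag word is 0xFFFC. Then ~twd = 0x0003, folding each pair with (tmp | tmp >> 1) & 0x5555 gives 0x0001, and the following shifts compact that into the low byte, so the FXSR tag byte comes out as 0x01, exactly one bit per non-empty register. twd_fxsr_to_i387() performs the reverse expansion, reconstructing the 2-bit tags by inspecting each register's exponent and significand.
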
diff --git a/arch/x86/kernel/i387_32.c b/arch/x86/kernel/i387_32.c
deleted file mode 100644
index 7d2e12f6c78b..000000000000
--- a/arch/x86/kernel/i387_32.c
+++ /dev/null
@@ -1,544 +0,0 @@
1/*
2 * Copyright (C) 1994 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * General FPU state handling cleanups
6 * Gareth Hughes <gareth@valinux.com>, May 2000
7 */
8
9#include <linux/sched.h>
10#include <linux/module.h>
11#include <asm/processor.h>
12#include <asm/i387.h>
13#include <asm/math_emu.h>
14#include <asm/sigcontext.h>
15#include <asm/user.h>
16#include <asm/ptrace.h>
17#include <asm/uaccess.h>
18
19#ifdef CONFIG_MATH_EMULATION
20#define HAVE_HWFP (boot_cpu_data.hard_math)
21#else
22#define HAVE_HWFP 1
23#endif
24
25static unsigned long mxcsr_feature_mask __read_mostly = 0xffffffff;
26
27void mxcsr_feature_mask_init(void)
28{
29 unsigned long mask = 0;
30 clts();
31 if (cpu_has_fxsr) {
32 memset(&current->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
33 asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave));
34 mask = current->thread.i387.fxsave.mxcsr_mask;
35 if (mask == 0) mask = 0x0000ffbf;
36 }
37 mxcsr_feature_mask &= mask;
38 stts();
39}
40
41/*
42 * The _current_ task is using the FPU for the first time
43 * so initialize it and set the mxcsr to its default
44 * value at reset if we support XMM instructions and then
 45 * remember the current task has used the FPU.
46 */
47void init_fpu(struct task_struct *tsk)
48{
49 if (cpu_has_fxsr) {
50 memset(&tsk->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
51 tsk->thread.i387.fxsave.cwd = 0x37f;
52 if (cpu_has_xmm)
53 tsk->thread.i387.fxsave.mxcsr = 0x1f80;
54 } else {
55 memset(&tsk->thread.i387.fsave, 0, sizeof(struct i387_fsave_struct));
56 tsk->thread.i387.fsave.cwd = 0xffff037fu;
57 tsk->thread.i387.fsave.swd = 0xffff0000u;
58 tsk->thread.i387.fsave.twd = 0xffffffffu;
59 tsk->thread.i387.fsave.fos = 0xffff0000u;
60 }
61 /* only the device not available exception or ptrace can call init_fpu */
62 set_stopped_child_used_math(tsk);
63}
64
65/*
66 * FPU lazy state save handling.
67 */
68
69void kernel_fpu_begin(void)
70{
71 struct thread_info *thread = current_thread_info();
72
73 preempt_disable();
74 if (thread->status & TS_USEDFPU) {
75 __save_init_fpu(thread->task);
76 return;
77 }
78 clts();
79}
80EXPORT_SYMBOL_GPL(kernel_fpu_begin);
81
82/*
83 * FPU tag word conversions.
84 */
85
86static inline unsigned short twd_i387_to_fxsr( unsigned short twd )
87{
88 unsigned int tmp; /* to avoid 16 bit prefixes in the code */
89
90 /* Transform each pair of bits into 01 (valid) or 00 (empty) */
91 tmp = ~twd;
92 tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
93 /* and move the valid bits to the lower byte. */
94 tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
95 tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
96 tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
97 return tmp;
98}
99
100static inline unsigned long twd_fxsr_to_i387( struct i387_fxsave_struct *fxsave )
101{
102 struct _fpxreg *st = NULL;
103 unsigned long tos = (fxsave->swd >> 11) & 7;
104 unsigned long twd = (unsigned long) fxsave->twd;
105 unsigned long tag;
106 unsigned long ret = 0xffff0000u;
107 int i;
108
109#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16);
110
111 for ( i = 0 ; i < 8 ; i++ ) {
112 if ( twd & 0x1 ) {
113 st = FPREG_ADDR( fxsave, (i - tos) & 7 );
114
115 switch ( st->exponent & 0x7fff ) {
116 case 0x7fff:
117 tag = 2; /* Special */
118 break;
119 case 0x0000:
120 if ( !st->significand[0] &&
121 !st->significand[1] &&
122 !st->significand[2] &&
123 !st->significand[3] ) {
124 tag = 1; /* Zero */
125 } else {
126 tag = 2; /* Special */
127 }
128 break;
129 default:
130 if ( st->significand[3] & 0x8000 ) {
131 tag = 0; /* Valid */
132 } else {
133 tag = 2; /* Special */
134 }
135 break;
136 }
137 } else {
138 tag = 3; /* Empty */
139 }
140 ret |= (tag << (2 * i));
141 twd = twd >> 1;
142 }
143 return ret;
144}
145
146/*
147 * FPU state interaction.
148 */
149
150unsigned short get_fpu_cwd( struct task_struct *tsk )
151{
152 if ( cpu_has_fxsr ) {
153 return tsk->thread.i387.fxsave.cwd;
154 } else {
155 return (unsigned short)tsk->thread.i387.fsave.cwd;
156 }
157}
158
159unsigned short get_fpu_swd( struct task_struct *tsk )
160{
161 if ( cpu_has_fxsr ) {
162 return tsk->thread.i387.fxsave.swd;
163 } else {
164 return (unsigned short)tsk->thread.i387.fsave.swd;
165 }
166}
167
168#if 0
169unsigned short get_fpu_twd( struct task_struct *tsk )
170{
171 if ( cpu_has_fxsr ) {
172 return tsk->thread.i387.fxsave.twd;
173 } else {
174 return (unsigned short)tsk->thread.i387.fsave.twd;
175 }
176}
177#endif /* 0 */
178
179unsigned short get_fpu_mxcsr( struct task_struct *tsk )
180{
181 if ( cpu_has_xmm ) {
182 return tsk->thread.i387.fxsave.mxcsr;
183 } else {
184 return 0x1f80;
185 }
186}
187
188#if 0
189
190void set_fpu_cwd( struct task_struct *tsk, unsigned short cwd )
191{
192 if ( cpu_has_fxsr ) {
193 tsk->thread.i387.fxsave.cwd = cwd;
194 } else {
195 tsk->thread.i387.fsave.cwd = ((long)cwd | 0xffff0000u);
196 }
197}
198
199void set_fpu_swd( struct task_struct *tsk, unsigned short swd )
200{
201 if ( cpu_has_fxsr ) {
202 tsk->thread.i387.fxsave.swd = swd;
203 } else {
204 tsk->thread.i387.fsave.swd = ((long)swd | 0xffff0000u);
205 }
206}
207
208void set_fpu_twd( struct task_struct *tsk, unsigned short twd )
209{
210 if ( cpu_has_fxsr ) {
211 tsk->thread.i387.fxsave.twd = twd_i387_to_fxsr(twd);
212 } else {
213 tsk->thread.i387.fsave.twd = ((long)twd | 0xffff0000u);
214 }
215}
216
217#endif /* 0 */
218
219/*
220 * FXSR floating point environment conversions.
221 */
222
223static int convert_fxsr_to_user( struct _fpstate __user *buf,
224 struct i387_fxsave_struct *fxsave )
225{
226 unsigned long env[7];
227 struct _fpreg __user *to;
228 struct _fpxreg *from;
229 int i;
230
231 env[0] = (unsigned long)fxsave->cwd | 0xffff0000ul;
232 env[1] = (unsigned long)fxsave->swd | 0xffff0000ul;
233 env[2] = twd_fxsr_to_i387(fxsave);
234 env[3] = fxsave->fip;
235 env[4] = fxsave->fcs | ((unsigned long)fxsave->fop << 16);
236 env[5] = fxsave->foo;
237 env[6] = fxsave->fos;
238
239 if ( __copy_to_user( buf, env, 7 * sizeof(unsigned long) ) )
240 return 1;
241
242 to = &buf->_st[0];
243 from = (struct _fpxreg *) &fxsave->st_space[0];
244 for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
245 unsigned long __user *t = (unsigned long __user *)to;
246 unsigned long *f = (unsigned long *)from;
247
248 if (__put_user(*f, t) ||
249 __put_user(*(f + 1), t + 1) ||
250 __put_user(from->exponent, &to->exponent))
251 return 1;
252 }
253 return 0;
254}
255
256static int convert_fxsr_from_user( struct i387_fxsave_struct *fxsave,
257 struct _fpstate __user *buf )
258{
259 unsigned long env[7];
260 struct _fpxreg *to;
261 struct _fpreg __user *from;
262 int i;
263
264 if ( __copy_from_user( env, buf, 7 * sizeof(long) ) )
265 return 1;
266
267 fxsave->cwd = (unsigned short)(env[0] & 0xffff);
268 fxsave->swd = (unsigned short)(env[1] & 0xffff);
269 fxsave->twd = twd_i387_to_fxsr((unsigned short)(env[2] & 0xffff));
270 fxsave->fip = env[3];
271 fxsave->fop = (unsigned short)((env[4] & 0xffff0000ul) >> 16);
272 fxsave->fcs = (env[4] & 0xffff);
273 fxsave->foo = env[5];
274 fxsave->fos = env[6];
275
276 to = (struct _fpxreg *) &fxsave->st_space[0];
277 from = &buf->_st[0];
278 for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
279 unsigned long *t = (unsigned long *)to;
280 unsigned long __user *f = (unsigned long __user *)from;
281
282 if (__get_user(*t, f) ||
283 __get_user(*(t + 1), f + 1) ||
284 __get_user(to->exponent, &from->exponent))
285 return 1;
286 }
287 return 0;
288}
289
290/*
291 * Signal frame handlers.
292 */
293
294static inline int save_i387_fsave( struct _fpstate __user *buf )
295{
296 struct task_struct *tsk = current;
297
298 unlazy_fpu( tsk );
299 tsk->thread.i387.fsave.status = tsk->thread.i387.fsave.swd;
300 if ( __copy_to_user( buf, &tsk->thread.i387.fsave,
301 sizeof(struct i387_fsave_struct) ) )
302 return -1;
303 return 1;
304}
305
306static int save_i387_fxsave( struct _fpstate __user *buf )
307{
308 struct task_struct *tsk = current;
309 int err = 0;
310
311 unlazy_fpu( tsk );
312
313 if ( convert_fxsr_to_user( buf, &tsk->thread.i387.fxsave ) )
314 return -1;
315
316 err |= __put_user( tsk->thread.i387.fxsave.swd, &buf->status );
317 err |= __put_user( X86_FXSR_MAGIC, &buf->magic );
318 if ( err )
319 return -1;
320
321 if ( __copy_to_user( &buf->_fxsr_env[0], &tsk->thread.i387.fxsave,
322 sizeof(struct i387_fxsave_struct) ) )
323 return -1;
324 return 1;
325}
326
327int save_i387( struct _fpstate __user *buf )
328{
329 if ( !used_math() )
330 return 0;
331
332 /* This will cause a "finit" to be triggered by the next
333 * attempted FPU operation by the 'current' process.
334 */
335 clear_used_math();
336
337 if ( HAVE_HWFP ) {
338 if ( cpu_has_fxsr ) {
339 return save_i387_fxsave( buf );
340 } else {
341 return save_i387_fsave( buf );
342 }
343 } else {
344 return save_i387_soft( &current->thread.i387.soft, buf );
345 }
346}
347
348static inline int restore_i387_fsave( struct _fpstate __user *buf )
349{
350 struct task_struct *tsk = current;
351 clear_fpu( tsk );
352 return __copy_from_user( &tsk->thread.i387.fsave, buf,
353 sizeof(struct i387_fsave_struct) );
354}
355
356static int restore_i387_fxsave( struct _fpstate __user *buf )
357{
358 int err;
359 struct task_struct *tsk = current;
360 clear_fpu( tsk );
361 err = __copy_from_user( &tsk->thread.i387.fxsave, &buf->_fxsr_env[0],
362 sizeof(struct i387_fxsave_struct) );
363 /* mxcsr reserved bits must be masked to zero for security reasons */
364 tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
365 return err ? 1 : convert_fxsr_from_user( &tsk->thread.i387.fxsave, buf );
366}
367
368int restore_i387( struct _fpstate __user *buf )
369{
370 int err;
371
372 if ( HAVE_HWFP ) {
373 if ( cpu_has_fxsr ) {
374 err = restore_i387_fxsave( buf );
375 } else {
376 err = restore_i387_fsave( buf );
377 }
378 } else {
379 err = restore_i387_soft( &current->thread.i387.soft, buf );
380 }
381 set_used_math();
382 return err;
383}
384
385/*
386 * ptrace request handlers.
387 */
388
389static inline int get_fpregs_fsave( struct user_i387_struct __user *buf,
390 struct task_struct *tsk )
391{
392 return __copy_to_user( buf, &tsk->thread.i387.fsave,
393 sizeof(struct user_i387_struct) );
394}
395
396static inline int get_fpregs_fxsave( struct user_i387_struct __user *buf,
397 struct task_struct *tsk )
398{
399 return convert_fxsr_to_user( (struct _fpstate __user *)buf,
400 &tsk->thread.i387.fxsave );
401}
402
403int get_fpregs( struct user_i387_struct __user *buf, struct task_struct *tsk )
404{
405 if ( HAVE_HWFP ) {
406 if ( cpu_has_fxsr ) {
407 return get_fpregs_fxsave( buf, tsk );
408 } else {
409 return get_fpregs_fsave( buf, tsk );
410 }
411 } else {
412 return save_i387_soft( &tsk->thread.i387.soft,
413 (struct _fpstate __user *)buf );
414 }
415}
416
417static inline int set_fpregs_fsave( struct task_struct *tsk,
418 struct user_i387_struct __user *buf )
419{
420 return __copy_from_user( &tsk->thread.i387.fsave, buf,
421 sizeof(struct user_i387_struct) );
422}
423
424static inline int set_fpregs_fxsave( struct task_struct *tsk,
425 struct user_i387_struct __user *buf )
426{
427 return convert_fxsr_from_user( &tsk->thread.i387.fxsave,
428 (struct _fpstate __user *)buf );
429}
430
431int set_fpregs( struct task_struct *tsk, struct user_i387_struct __user *buf )
432{
433 if ( HAVE_HWFP ) {
434 if ( cpu_has_fxsr ) {
435 return set_fpregs_fxsave( tsk, buf );
436 } else {
437 return set_fpregs_fsave( tsk, buf );
438 }
439 } else {
440 return restore_i387_soft( &tsk->thread.i387.soft,
441 (struct _fpstate __user *)buf );
442 }
443}
444
445int get_fpxregs( struct user_fxsr_struct __user *buf, struct task_struct *tsk )
446{
447 if ( cpu_has_fxsr ) {
448 if (__copy_to_user( buf, &tsk->thread.i387.fxsave,
449 sizeof(struct user_fxsr_struct) ))
450 return -EFAULT;
451 return 0;
452 } else {
453 return -EIO;
454 }
455}
456
457int set_fpxregs( struct task_struct *tsk, struct user_fxsr_struct __user *buf )
458{
459 int ret = 0;
460
461 if ( cpu_has_fxsr ) {
462 if (__copy_from_user( &tsk->thread.i387.fxsave, buf,
463 sizeof(struct user_fxsr_struct) ))
464 ret = -EFAULT;
465 /* mxcsr reserved bits must be masked to zero for security reasons */
466 tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
467 } else {
468 ret = -EIO;
469 }
470 return ret;
471}
472
473/*
474 * FPU state for core dumps.
475 */
476
477static inline void copy_fpu_fsave( struct task_struct *tsk,
478 struct user_i387_struct *fpu )
479{
480 memcpy( fpu, &tsk->thread.i387.fsave,
481 sizeof(struct user_i387_struct) );
482}
483
484static inline void copy_fpu_fxsave( struct task_struct *tsk,
485 struct user_i387_struct *fpu )
486{
487 unsigned short *to;
488 unsigned short *from;
489 int i;
490
491 memcpy( fpu, &tsk->thread.i387.fxsave, 7 * sizeof(long) );
492
493 to = (unsigned short *)&fpu->st_space[0];
494 from = (unsigned short *)&tsk->thread.i387.fxsave.st_space[0];
495 for ( i = 0 ; i < 8 ; i++, to += 5, from += 8 ) {
496 memcpy( to, from, 5 * sizeof(unsigned short) );
497 }
498}
499
500int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu )
501{
502 int fpvalid;
503 struct task_struct *tsk = current;
504
505 fpvalid = !!used_math();
506 if ( fpvalid ) {
507 unlazy_fpu( tsk );
508 if ( cpu_has_fxsr ) {
509 copy_fpu_fxsave( tsk, fpu );
510 } else {
511 copy_fpu_fsave( tsk, fpu );
512 }
513 }
514
515 return fpvalid;
516}
517EXPORT_SYMBOL(dump_fpu);
518
519int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu)
520{
521 int fpvalid = !!tsk_used_math(tsk);
522
523 if (fpvalid) {
524 if (tsk == current)
525 unlazy_fpu(tsk);
526 if (cpu_has_fxsr)
527 copy_fpu_fxsave(tsk, fpu);
528 else
529 copy_fpu_fsave(tsk, fpu);
530 }
531 return fpvalid;
532}
533
534int dump_task_extended_fpu(struct task_struct *tsk, struct user_fxsr_struct *fpu)
535{
536 int fpvalid = tsk_used_math(tsk) && cpu_has_fxsr;
537
538 if (fpvalid) {
539 if (tsk == current)
540 unlazy_fpu(tsk);
541 memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(*fpu));
542 }
543 return fpvalid;
544}
diff --git a/arch/x86/kernel/i387_64.c b/arch/x86/kernel/i387_64.c
deleted file mode 100644
index bfaff28fb134..000000000000
--- a/arch/x86/kernel/i387_64.c
+++ /dev/null
@@ -1,150 +0,0 @@
1/*
2 * Copyright (C) 1994 Linus Torvalds
3 * Copyright (C) 2002 Andi Kleen, SuSE Labs
4 *
5 * Pentium III FXSR, SSE support
6 * General FPU state handling cleanups
7 * Gareth Hughes <gareth@valinux.com>, May 2000
8 *
9 * x86-64 rework 2002 Andi Kleen.
10 * Does direct fxsave in and out of user space now for signal handlers.
11 * All the FSAVE<->FXSAVE conversion code has been moved to the 32bit emulation,
12 * the 64bit user space sees a FXSAVE frame directly.
13 */
14
15#include <linux/sched.h>
16#include <linux/init.h>
17#include <asm/processor.h>
18#include <asm/i387.h>
19#include <asm/sigcontext.h>
20#include <asm/user.h>
21#include <asm/ptrace.h>
22#include <asm/uaccess.h>
23
24unsigned int mxcsr_feature_mask __read_mostly = 0xffffffff;
25
26void mxcsr_feature_mask_init(void)
27{
28 unsigned int mask;
29 clts();
30 memset(&current->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
31 asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave));
32 mask = current->thread.i387.fxsave.mxcsr_mask;
33 if (mask == 0) mask = 0x0000ffbf;
34 mxcsr_feature_mask &= mask;
35 stts();
36}
37
38/*
39 * Called at bootup to set up the initial FPU state that is later cloned
40 * into all processes.
41 */
42void __cpuinit fpu_init(void)
43{
44 unsigned long oldcr0 = read_cr0();
45 extern void __bad_fxsave_alignment(void);
46
47 if (offsetof(struct task_struct, thread.i387.fxsave) & 15)
48 __bad_fxsave_alignment();
49 set_in_cr4(X86_CR4_OSFXSR);
50 set_in_cr4(X86_CR4_OSXMMEXCPT);
51
52 write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */
53
54 mxcsr_feature_mask_init();
55 /* clean state in init */
56 current_thread_info()->status = 0;
57 clear_used_math();
58}
59
60void init_fpu(struct task_struct *child)
61{
62 if (tsk_used_math(child)) {
63 if (child == current)
64 unlazy_fpu(child);
65 return;
66 }
67 memset(&child->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
68 child->thread.i387.fxsave.cwd = 0x37f;
69 child->thread.i387.fxsave.mxcsr = 0x1f80;
70 /* only the device not available exception or ptrace can call init_fpu */
71 set_stopped_child_used_math(child);
72}
73
74/*
75 * Signal frame handlers.
76 */
77
78int save_i387(struct _fpstate __user *buf)
79{
80 struct task_struct *tsk = current;
81 int err = 0;
82
83 BUILD_BUG_ON(sizeof(struct user_i387_struct) !=
84 sizeof(tsk->thread.i387.fxsave));
85
86 if ((unsigned long)buf % 16)
87 printk("save_i387: bad fpstate %p\n",buf);
88
89 if (!used_math())
90 return 0;
91 clear_used_math(); /* trigger finit */
92 if (task_thread_info(tsk)->status & TS_USEDFPU) {
93 err = save_i387_checking((struct i387_fxsave_struct __user *)buf);
94 if (err) return err;
95 task_thread_info(tsk)->status &= ~TS_USEDFPU;
96 stts();
97 } else {
98 if (__copy_to_user(buf, &tsk->thread.i387.fxsave,
99 sizeof(struct i387_fxsave_struct)))
100 return -1;
101 }
102 return 1;
103}
104
105/*
106 * ptrace request handlers.
107 */
108
109int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *tsk)
110{
111 init_fpu(tsk);
112 return __copy_to_user(buf, &tsk->thread.i387.fxsave,
113 sizeof(struct user_i387_struct)) ? -EFAULT : 0;
114}
115
116int set_fpregs(struct task_struct *tsk, struct user_i387_struct __user *buf)
117{
118 if (__copy_from_user(&tsk->thread.i387.fxsave, buf,
119 sizeof(struct user_i387_struct)))
120 return -EFAULT;
121 return 0;
122}
123
124/*
125 * FPU state for core dumps.
126 */
127
128int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu )
129{
130 struct task_struct *tsk = current;
131
132 if (!used_math())
133 return 0;
134
135 unlazy_fpu(tsk);
136 memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct));
137 return 1;
138}
139
140int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu)
141{
142 int fpvalid = !!tsk_used_math(tsk);
143
144 if (fpvalid) {
145 if (tsk == current)
146 unlazy_fpu(tsk);
147 memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct));
148}
149 return fpvalid;
150}
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index a42c80745325..ef62b07b2b48 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -13,10 +13,17 @@
13#include <asm/delay.h> 13#include <asm/delay.h>
14#include <asm/i8253.h> 14#include <asm/i8253.h>
15#include <asm/io.h> 15#include <asm/io.h>
16#include <asm/hpet.h>
16 17
17DEFINE_SPINLOCK(i8253_lock); 18DEFINE_SPINLOCK(i8253_lock);
18EXPORT_SYMBOL(i8253_lock); 19EXPORT_SYMBOL(i8253_lock);
19 20
21#ifdef CONFIG_X86_32
22static void pit_disable_clocksource(void);
23#else
24static inline void pit_disable_clocksource(void) { }
25#endif
26
20/* 27/*
21 * HPET replaces the PIT, when enabled. So we need to know, which of 28 * HPET replaces the PIT, when enabled. So we need to know, which of
22 * the two timers is used 29 * the two timers is used
@@ -31,38 +38,38 @@ struct clock_event_device *global_clock_event;
31static void init_pit_timer(enum clock_event_mode mode, 38static void init_pit_timer(enum clock_event_mode mode,
32 struct clock_event_device *evt) 39 struct clock_event_device *evt)
33{ 40{
34 unsigned long flags; 41 spin_lock(&i8253_lock);
35
36 spin_lock_irqsave(&i8253_lock, flags);
37 42
38 switch(mode) { 43 switch(mode) {
39 case CLOCK_EVT_MODE_PERIODIC: 44 case CLOCK_EVT_MODE_PERIODIC:
40 /* binary, mode 2, LSB/MSB, ch 0 */ 45 /* binary, mode 2, LSB/MSB, ch 0 */
41 outb_p(0x34, PIT_MODE); 46 outb_pit(0x34, PIT_MODE);
42 outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ 47 outb_pit(LATCH & 0xff , PIT_CH0); /* LSB */
43 outb(LATCH >> 8 , PIT_CH0); /* MSB */ 48 outb_pit(LATCH >> 8 , PIT_CH0); /* MSB */
44 break; 49 break;
45 50
46 case CLOCK_EVT_MODE_SHUTDOWN: 51 case CLOCK_EVT_MODE_SHUTDOWN:
47 case CLOCK_EVT_MODE_UNUSED: 52 case CLOCK_EVT_MODE_UNUSED:
48 if (evt->mode == CLOCK_EVT_MODE_PERIODIC || 53 if (evt->mode == CLOCK_EVT_MODE_PERIODIC ||
49 evt->mode == CLOCK_EVT_MODE_ONESHOT) { 54 evt->mode == CLOCK_EVT_MODE_ONESHOT) {
50 outb_p(0x30, PIT_MODE); 55 outb_pit(0x30, PIT_MODE);
51 outb_p(0, PIT_CH0); 56 outb_pit(0, PIT_CH0);
52 outb_p(0, PIT_CH0); 57 outb_pit(0, PIT_CH0);
53 } 58 }
59 pit_disable_clocksource();
54 break; 60 break;
55 61
56 case CLOCK_EVT_MODE_ONESHOT: 62 case CLOCK_EVT_MODE_ONESHOT:
57 /* One shot setup */ 63 /* One shot setup */
58 outb_p(0x38, PIT_MODE); 64 pit_disable_clocksource();
65 outb_pit(0x38, PIT_MODE);
59 break; 66 break;
60 67
61 case CLOCK_EVT_MODE_RESUME: 68 case CLOCK_EVT_MODE_RESUME:
62 /* Nothing to do here */ 69 /* Nothing to do here */
63 break; 70 break;
64 } 71 }
65 spin_unlock_irqrestore(&i8253_lock, flags); 72 spin_unlock(&i8253_lock);
66} 73}
67 74
68/* 75/*
@@ -72,12 +79,10 @@ static void init_pit_timer(enum clock_event_mode mode,
72 */ 79 */
73static int pit_next_event(unsigned long delta, struct clock_event_device *evt) 80static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
74{ 81{
75 unsigned long flags; 82 spin_lock(&i8253_lock);
76 83 outb_pit(delta & 0xff , PIT_CH0); /* LSB */
77 spin_lock_irqsave(&i8253_lock, flags); 84 outb_pit(delta >> 8 , PIT_CH0); /* MSB */
78 outb_p(delta & 0xff , PIT_CH0); /* LSB */ 85 spin_unlock(&i8253_lock);
79 outb(delta >> 8 , PIT_CH0); /* MSB */
80 spin_unlock_irqrestore(&i8253_lock, flags);
81 86
82 return 0; 87 return 0;
83} 88}
@@ -148,15 +153,15 @@ static cycle_t pit_read(void)
148 * count), it cannot be newer. 153 * count), it cannot be newer.
149 */ 154 */
150 jifs = jiffies; 155 jifs = jiffies;
151 outb_p(0x00, PIT_MODE); /* latch the count ASAP */ 156 outb_pit(0x00, PIT_MODE); /* latch the count ASAP */
152 count = inb_p(PIT_CH0); /* read the latched count */ 157 count = inb_pit(PIT_CH0); /* read the latched count */
153 count |= inb_p(PIT_CH0) << 8; 158 count |= inb_pit(PIT_CH0) << 8;
154 159
155 /* VIA686a test code... reset the latch if count > max + 1 */ 160 /* VIA686a test code... reset the latch if count > max + 1 */
156 if (count > LATCH) { 161 if (count > LATCH) {
157 outb_p(0x34, PIT_MODE); 162 outb_pit(0x34, PIT_MODE);
158 outb_p(LATCH & 0xff, PIT_CH0); 163 outb_pit(LATCH & 0xff, PIT_CH0);
159 outb(LATCH >> 8, PIT_CH0); 164 outb_pit(LATCH >> 8, PIT_CH0);
160 count = LATCH - 1; 165 count = LATCH - 1;
161 } 166 }
162 167
@@ -195,9 +200,28 @@ static struct clocksource clocksource_pit = {
195 .shift = 20, 200 .shift = 20,
196}; 201};
197 202
203static void pit_disable_clocksource(void)
204{
205 /*
206 * Use mult to check whether it is registered or not
207 */
208 if (clocksource_pit.mult) {
209 clocksource_unregister(&clocksource_pit);
210 clocksource_pit.mult = 0;
211 }
212}
213
198static int __init init_pit_clocksource(void) 214static int __init init_pit_clocksource(void)
199{ 215{
200 if (num_possible_cpus() > 1) /* PIT does not scale! */ 216 /*
217 * Several reasons not to register PIT as a clocksource:
218 *
 219 * - on SMP the PIT does not scale due to i8253_lock
 220 * - when the HPET is enabled
 221 * - when the local APIC timer is active (the PIT is switched off)
222 */
223 if (num_possible_cpus() > 1 || is_hpet_enabled() ||
224 pit_clockevent.mode != CLOCK_EVT_MODE_PERIODIC)
201 return 0; 225 return 0;
202 226
203 clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20); 227 clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20);
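
pit_read() above latches channel 0 with command 0x00 and then reads the count LSB first. A hedged user-space sketch of the same latch-and-read sequence, using ioperm()/outb()/inb() from <sys/io.h> (x86 only, needs root, and it races with the kernel's own PIT accesses - which is exactly why the kernel serializes them with i8253_lock - so treat it purely as an illustration):

/* Sketch: latch and read PIT channel 0 from user space, mirroring pit_read(). */
#include <stdio.h>
#include <sys/io.h>

#define PIT_MODE 0x43	/* mode/command register, same address as the kernel's PIT_MODE */
#define PIT_CH0  0x40	/* channel 0 data port */

int main(void)
{
	if (ioperm(PIT_CH0, 4, 1)) {		/* ask sys_ioperm for ports 0x40-0x43 */
		perror("ioperm (are you root?)");
		return 1;
	}

	outb(0x00, PIT_MODE);			/* counter latch command for channel 0 */
	unsigned int count = inb(PIT_CH0);	/* low byte of the latched count */
	count |= inb(PIT_CH0) << 8;		/* high byte */

	printf("PIT channel 0 count: %u\n", count);
	return 0;
}
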
diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c
index 5f3496d01984..2d25b77102fe 100644
--- a/arch/x86/kernel/i8259_32.c
+++ b/arch/x86/kernel/i8259_32.c
@@ -21,8 +21,6 @@
21#include <asm/arch_hooks.h> 21#include <asm/arch_hooks.h>
22#include <asm/i8259.h> 22#include <asm/i8259.h>
23 23
24#include <io_ports.h>
25
26/* 24/*
27 * This is the 'legacy' 8259A Programmable Interrupt Controller, 25 * This is the 'legacy' 8259A Programmable Interrupt Controller,
28 * present in the majority of PC/AT boxes. 26 * present in the majority of PC/AT boxes.
@@ -291,20 +289,20 @@ void init_8259A(int auto_eoi)
291 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ 289 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
292 290
293 /* 291 /*
294 * outb_p - this has to work on a wide range of PC hardware. 292 * outb_pic - this has to work on a wide range of PC hardware.
295 */ 293 */
296 outb_p(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ 294 outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */
297 outb_p(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ 295 outb_pic(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */
298 outb_p(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ 296 outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */
299 if (auto_eoi) /* master does Auto EOI */ 297 if (auto_eoi) /* master does Auto EOI */
300 outb_p(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); 298 outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
301 else /* master expects normal EOI */ 299 else /* master expects normal EOI */
302 outb_p(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); 300 outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
303 301
304 outb_p(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ 302 outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */
305 outb_p(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ 303 outb_pic(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */
306 outb_p(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ 304 outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */
307 outb_p(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ 305 outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */
308 if (auto_eoi) 306 if (auto_eoi)
309 /* 307 /*
310 * In AEOI mode we just have to mask the interrupt 308 * In AEOI mode we just have to mask the interrupt
@@ -341,7 +339,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id)
341 outb(0,0xF0); 339 outb(0,0xF0);
342 if (ignore_fpu_irq || !boot_cpu_data.hard_math) 340 if (ignore_fpu_irq || !boot_cpu_data.hard_math)
343 return IRQ_NONE; 341 return IRQ_NONE;
344 math_error((void __user *)get_irq_regs()->eip); 342 math_error((void __user *)get_irq_regs()->ip);
345 return IRQ_HANDLED; 343 return IRQ_HANDLED;
346} 344}
347 345
diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c
index ba6d57286f56..fa57a1568508 100644
--- a/arch/x86/kernel/i8259_64.c
+++ b/arch/x86/kernel/i8259_64.c
@@ -21,6 +21,7 @@
21#include <asm/delay.h> 21#include <asm/delay.h>
22#include <asm/desc.h> 22#include <asm/desc.h>
23#include <asm/apic.h> 23#include <asm/apic.h>
24#include <asm/i8259.h>
24 25
25/* 26/*
26 * Common place to define all x86 IRQ vectors 27 * Common place to define all x86 IRQ vectors
@@ -48,7 +49,7 @@
48 */ 49 */
49 50
50/* 51/*
51 * The IO-APIC gives us many more interrupt sources. Most of these 52 * The IO-APIC gives us many more interrupt sources. Most of these
52 * are unused but an SMP system is supposed to have enough memory ... 53 * are unused but an SMP system is supposed to have enough memory ...
53 * sometimes (mostly wrt. hw bugs) we get corrupted vectors all 54 * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
54 * across the spectrum, so we really want to be prepared to get all 55 * across the spectrum, so we really want to be prepared to get all
@@ -76,7 +77,7 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
76 IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) 77 IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
77 78
78/* for the irq vectors */ 79/* for the irq vectors */
79static void (*interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { 80static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
80 IRQLIST_16(0x2), IRQLIST_16(0x3), 81 IRQLIST_16(0x2), IRQLIST_16(0x3),
81 IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), 82 IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
82 IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), 83 IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
@@ -114,11 +115,7 @@ static struct irq_chip i8259A_chip = {
114/* 115/*
115 * This contains the irq mask for both 8259A irq controllers, 116 * This contains the irq mask for both 8259A irq controllers,
116 */ 117 */
117static unsigned int cached_irq_mask = 0xffff; 118unsigned int cached_irq_mask = 0xffff;
118
119#define __byte(x,y) (((unsigned char *)&(y))[x])
120#define cached_21 (__byte(0,cached_irq_mask))
121#define cached_A1 (__byte(1,cached_irq_mask))
122 119
123/* 120/*
124 * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) 121 * Not all IRQs can be routed through the IO-APIC, eg. on certain (older)
@@ -139,9 +136,9 @@ void disable_8259A_irq(unsigned int irq)
139 spin_lock_irqsave(&i8259A_lock, flags); 136 spin_lock_irqsave(&i8259A_lock, flags);
140 cached_irq_mask |= mask; 137 cached_irq_mask |= mask;
141 if (irq & 8) 138 if (irq & 8)
142 outb(cached_A1,0xA1); 139 outb(cached_slave_mask, PIC_SLAVE_IMR);
143 else 140 else
144 outb(cached_21,0x21); 141 outb(cached_master_mask, PIC_MASTER_IMR);
145 spin_unlock_irqrestore(&i8259A_lock, flags); 142 spin_unlock_irqrestore(&i8259A_lock, flags);
146} 143}
147 144
@@ -153,9 +150,9 @@ void enable_8259A_irq(unsigned int irq)
153 spin_lock_irqsave(&i8259A_lock, flags); 150 spin_lock_irqsave(&i8259A_lock, flags);
154 cached_irq_mask &= mask; 151 cached_irq_mask &= mask;
155 if (irq & 8) 152 if (irq & 8)
156 outb(cached_A1,0xA1); 153 outb(cached_slave_mask, PIC_SLAVE_IMR);
157 else 154 else
158 outb(cached_21,0x21); 155 outb(cached_master_mask, PIC_MASTER_IMR);
159 spin_unlock_irqrestore(&i8259A_lock, flags); 156 spin_unlock_irqrestore(&i8259A_lock, flags);
160} 157}
161 158
@@ -167,9 +164,9 @@ int i8259A_irq_pending(unsigned int irq)
167 164
168 spin_lock_irqsave(&i8259A_lock, flags); 165 spin_lock_irqsave(&i8259A_lock, flags);
169 if (irq < 8) 166 if (irq < 8)
170 ret = inb(0x20) & mask; 167 ret = inb(PIC_MASTER_CMD) & mask;
171 else 168 else
172 ret = inb(0xA0) & (mask >> 8); 169 ret = inb(PIC_SLAVE_CMD) & (mask >> 8);
173 spin_unlock_irqrestore(&i8259A_lock, flags); 170 spin_unlock_irqrestore(&i8259A_lock, flags);
174 171
175 return ret; 172 return ret;
@@ -196,14 +193,14 @@ static inline int i8259A_irq_real(unsigned int irq)
196 int irqmask = 1<<irq; 193 int irqmask = 1<<irq;
197 194
198 if (irq < 8) { 195 if (irq < 8) {
199 outb(0x0B,0x20); /* ISR register */ 196 outb(0x0B,PIC_MASTER_CMD); /* ISR register */
200 value = inb(0x20) & irqmask; 197 value = inb(PIC_MASTER_CMD) & irqmask;
201 outb(0x0A,0x20); /* back to the IRR register */ 198 outb(0x0A,PIC_MASTER_CMD); /* back to the IRR register */
202 return value; 199 return value;
203 } 200 }
204 outb(0x0B,0xA0); /* ISR register */ 201 outb(0x0B,PIC_SLAVE_CMD); /* ISR register */
205 value = inb(0xA0) & (irqmask >> 8); 202 value = inb(PIC_SLAVE_CMD) & (irqmask >> 8);
206 outb(0x0A,0xA0); /* back to the IRR register */ 203 outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */
207 return value; 204 return value;
208} 205}
209 206
@@ -240,14 +237,17 @@ static void mask_and_ack_8259A(unsigned int irq)
240 237
241handle_real_irq: 238handle_real_irq:
242 if (irq & 8) { 239 if (irq & 8) {
243 inb(0xA1); /* DUMMY - (do we need this?) */ 240 inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */
244 outb(cached_A1,0xA1); 241 outb(cached_slave_mask, PIC_SLAVE_IMR);
245 outb(0x60+(irq&7),0xA0);/* 'Specific EOI' to slave */ 242 /* 'Specific EOI' to slave */
246 outb(0x62,0x20); /* 'Specific EOI' to master-IRQ2 */ 243 outb(0x60+(irq&7),PIC_SLAVE_CMD);
244 /* 'Specific EOI' to master-IRQ2 */
245 outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD);
247 } else { 246 } else {
248 inb(0x21); /* DUMMY - (do we need this?) */ 247 inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */
249 outb(cached_21,0x21); 248 outb(cached_master_mask, PIC_MASTER_IMR);
250 outb(0x60+irq,0x20); /* 'Specific EOI' to master */ 249 /* 'Specific EOI' to master */
250 outb(0x60+irq,PIC_MASTER_CMD);
251 } 251 }
252 spin_unlock_irqrestore(&i8259A_lock, flags); 252 spin_unlock_irqrestore(&i8259A_lock, flags);
253 return; 253 return;
@@ -270,7 +270,8 @@ spurious_8259A_irq:
270 * lets ACK and report it. [once per IRQ] 270 * lets ACK and report it. [once per IRQ]
271 */ 271 */
272 if (!(spurious_irq_mask & irqmask)) { 272 if (!(spurious_irq_mask & irqmask)) {
273 printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); 273 printk(KERN_DEBUG
274 "spurious 8259A interrupt: IRQ%d.\n", irq);
274 spurious_irq_mask |= irqmask; 275 spurious_irq_mask |= irqmask;
275 } 276 }
276 atomic_inc(&irq_err_count); 277 atomic_inc(&irq_err_count);
@@ -283,51 +284,6 @@ spurious_8259A_irq:
283 } 284 }
284} 285}
285 286
286void init_8259A(int auto_eoi)
287{
288 unsigned long flags;
289
290 i8259A_auto_eoi = auto_eoi;
291
292 spin_lock_irqsave(&i8259A_lock, flags);
293
294 outb(0xff, 0x21); /* mask all of 8259A-1 */
295 outb(0xff, 0xA1); /* mask all of 8259A-2 */
296
297 /*
298 * outb_p - this has to work on a wide range of PC hardware.
299 */
300 outb_p(0x11, 0x20); /* ICW1: select 8259A-1 init */
301 outb_p(IRQ0_VECTOR, 0x21); /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */
302 outb_p(0x04, 0x21); /* 8259A-1 (the master) has a slave on IR2 */
303 if (auto_eoi)
304 outb_p(0x03, 0x21); /* master does Auto EOI */
305 else
306 outb_p(0x01, 0x21); /* master expects normal EOI */
307
308 outb_p(0x11, 0xA0); /* ICW1: select 8259A-2 init */
309 outb_p(IRQ8_VECTOR, 0xA1); /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */
310 outb_p(0x02, 0xA1); /* 8259A-2 is a slave on master's IR2 */
311 outb_p(0x01, 0xA1); /* (slave's support for AEOI in flat mode
312 is to be investigated) */
313
314 if (auto_eoi)
315 /*
316 * in AEOI mode we just have to mask the interrupt
317 * when acking.
318 */
319 i8259A_chip.mask_ack = disable_8259A_irq;
320 else
321 i8259A_chip.mask_ack = mask_and_ack_8259A;
322
323 udelay(100); /* wait for 8259A to initialize */
324
325 outb(cached_21, 0x21); /* restore master IRQ mask */
326 outb(cached_A1, 0xA1); /* restore slave IRQ mask */
327
328 spin_unlock_irqrestore(&i8259A_lock, flags);
329}
330
331static char irq_trigger[2]; 287static char irq_trigger[2];
332/** 288/**
333 * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ 289 * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
@@ -364,8 +320,8 @@ static int i8259A_shutdown(struct sys_device *dev)
364 * the kernel initialization code can get it 320 * the kernel initialization code can get it
365 * out of. 321 * out of.
366 */ 322 */
367 outb(0xff, 0x21); /* mask all of 8259A-1 */ 323 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
368 outb(0xff, 0xA1); /* mask all of 8259A-1 */ 324 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */
369 return 0; 325 return 0;
370} 326}
371 327
@@ -391,6 +347,58 @@ static int __init i8259A_init_sysfs(void)
391 347
392device_initcall(i8259A_init_sysfs); 348device_initcall(i8259A_init_sysfs);
393 349
350void init_8259A(int auto_eoi)
351{
352 unsigned long flags;
353
354 i8259A_auto_eoi = auto_eoi;
355
356 spin_lock_irqsave(&i8259A_lock, flags);
357
358 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
359 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
360
361 /*
362 * outb_pic - this has to work on a wide range of PC hardware.
363 */
364 outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */
365 /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */
366 outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR);
367 /* 8259A-1 (the master) has a slave on IR2 */
368 outb_pic(0x04, PIC_MASTER_IMR);
369 if (auto_eoi) /* master does Auto EOI */
370 outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
371 else /* master expects normal EOI */
372 outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
373
374 outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */
375 /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */
376 outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR);
377 /* 8259A-2 is a slave on master's IR2 */
378 outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);
379 /* (slave's support for AEOI in flat mode is to be investigated) */
380 outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR);
381
382 if (auto_eoi)
383 /*
384 * In AEOI mode we just have to mask the interrupt
385 * when acking.
386 */
387 i8259A_chip.mask_ack = disable_8259A_irq;
388 else
389 i8259A_chip.mask_ack = mask_and_ack_8259A;
390
391 udelay(100); /* wait for 8259A to initialize */
392
393 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
394 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
395
396 spin_unlock_irqrestore(&i8259A_lock, flags);
397}
398
399
400
401
394/* 402/*
395 * IRQ2 is cascade interrupt to second interrupt controller 403 * IRQ2 is cascade interrupt to second interrupt controller
396 */ 404 */
@@ -448,7 +456,9 @@ void __init init_ISA_irqs (void)
448 } 456 }
449} 457}
450 458
451void __init init_IRQ(void) 459void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
460
461void __init native_init_IRQ(void)
452{ 462{
453 int i; 463 int i;
454 464
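
The conversion above drops the local cached_21/cached_A1 byte aliases in favour of the shared cached_master_mask/cached_slave_mask names. A self-contained sketch (with a stubbed outb(), not the kernel's helpers or real port I/O) of how a single 16-bit cached mask maps an IRQ number onto the right PIC mask register:

/* Sketch of the cached-mask bookkeeping behind enable/disable_8259A_irq(). */
#include <stdio.h>

#define PIC_MASTER_IMR 0x21
#define PIC_SLAVE_IMR  0xA1

static unsigned int cached_irq_mask = 0xffff;	/* bit set = IRQ masked */

static void outb(unsigned char val, unsigned short port)
{
	printf("outb(0x%02x, 0x%02x)\n", val, port);	/* stand-in for real port I/O */
}

static void disable_irq(unsigned int irq)
{
	cached_irq_mask |= 1u << irq;
	if (irq & 8)					/* IRQs 8-15 live on the slave PIC */
		outb(cached_irq_mask >> 8, PIC_SLAVE_IMR);
	else						/* IRQs 0-7 live on the master PIC */
		outb(cached_irq_mask & 0xff, PIC_MASTER_IMR);
}

int main(void)
{
	disable_irq(3);		/* master PIC: bit 3 of its IMR */
	disable_irq(12);	/* slave PIC: bit 4 of its IMR */
	return 0;
}
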
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index 468c9c437842..5b3ce7934363 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -15,7 +15,6 @@ static struct files_struct init_files = INIT_FILES;
15static struct signal_struct init_signals = INIT_SIGNALS(init_signals); 15static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
16static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); 16static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
17struct mm_struct init_mm = INIT_MM(init_mm); 17struct mm_struct init_mm = INIT_MM(init_mm);
18EXPORT_SYMBOL(init_mm);
19 18
20/* 19/*
21 * Initial thread structure. 20 * Initial thread structure.
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index ab77f1905469..4ca548632c8d 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -35,6 +35,7 @@
35#include <linux/htirq.h> 35#include <linux/htirq.h>
36#include <linux/freezer.h> 36#include <linux/freezer.h>
37#include <linux/kthread.h> 37#include <linux/kthread.h>
38#include <linux/jiffies.h> /* time_after() */
38 39
39#include <asm/io.h> 40#include <asm/io.h>
40#include <asm/smp.h> 41#include <asm/smp.h>
@@ -48,8 +49,6 @@
48#include <mach_apic.h> 49#include <mach_apic.h>
49#include <mach_apicdef.h> 50#include <mach_apicdef.h>
50 51
51#include "io_ports.h"
52
53int (*ioapic_renumber_irq)(int ioapic, int irq); 52int (*ioapic_renumber_irq)(int ioapic, int irq);
54atomic_t irq_mis_count; 53atomic_t irq_mis_count;
55 54
@@ -351,7 +350,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
351# include <asm/processor.h> /* kernel_thread() */ 350# include <asm/processor.h> /* kernel_thread() */
352# include <linux/kernel_stat.h> /* kstat */ 351# include <linux/kernel_stat.h> /* kstat */
353# include <linux/slab.h> /* kmalloc() */ 352# include <linux/slab.h> /* kmalloc() */
354# include <linux/timer.h> /* time_after() */ 353# include <linux/timer.h>
355 354
356#define IRQBALANCE_CHECK_ARCH -999 355#define IRQBALANCE_CHECK_ARCH -999
357#define MAX_BALANCED_IRQ_INTERVAL (5*HZ) 356#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
@@ -727,7 +726,7 @@ late_initcall(balanced_irq_init);
727#endif /* CONFIG_SMP */ 726#endif /* CONFIG_SMP */
728 727
729#ifndef CONFIG_SMP 728#ifndef CONFIG_SMP
730void fastcall send_IPI_self(int vector) 729void send_IPI_self(int vector)
731{ 730{
732 unsigned int cfg; 731 unsigned int cfg;
733 732
@@ -1900,7 +1899,7 @@ static int __init timer_irq_works(void)
1900 * might have cached one ExtINT interrupt. Finally, at 1899 * might have cached one ExtINT interrupt. Finally, at
1901 * least one tick may be lost due to delays. 1900 * least one tick may be lost due to delays.
1902 */ 1901 */
1903 if (jiffies - t1 > 4) 1902 if (time_after(jiffies, t1 + 4))
1904 return 1; 1903 return 1;
1905 1904
1906 return 0; 1905 return 0;
@@ -2080,7 +2079,7 @@ static struct irq_chip lapic_chip __read_mostly = {
2080 .eoi = ack_apic, 2079 .eoi = ack_apic,
2081}; 2080};
2082 2081
2083static void setup_nmi (void) 2082static void __init setup_nmi(void)
2084{ 2083{
2085 /* 2084 /*
2086 * Dirty trick to enable the NMI watchdog ... 2085 * Dirty trick to enable the NMI watchdog ...
@@ -2093,7 +2092,7 @@ static void setup_nmi (void)
2093 */ 2092 */
2094 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); 2093 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
2095 2094
2096 on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1); 2095 enable_NMI_through_LVT0();
2097 2096
2098 apic_printk(APIC_VERBOSE, " done.\n"); 2097 apic_printk(APIC_VERBOSE, " done.\n");
2099} 2098}
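
timer_irq_works() now uses time_after(jiffies, t1 + 4), the standard wrap-safe way to compare jiffies values. A small stand-alone demonstration of the signed-difference comparison the real macro is built on (my_time_after here is a simplified model, not the kernel definition):

/* Why time_after()-style comparisons survive counter wrap-around. */
#include <limits.h>
#include <stdio.h>

/* simplified stand-in for time_after(a, b): true if a is after b */
#define my_time_after(a, b)  ((long)((b) - (a)) < 0)

int main(void)
{
	unsigned long deadline = ULONG_MAX - 1;	/* expires just before the counter wraps */
	unsigned long now = deadline + 10;	/* the counter has wrapped to a small value */

	printf("naive now > deadline    : %d\n", now > deadline);		  /* 0 - wrong */
	printf("time_after(now,deadline): %d\n", my_time_after(now, deadline));  /* 1 - right */
	return 0;
}
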
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 23a3ac06a23e..1627c0d53e0b 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -32,9 +32,11 @@
32#include <linux/msi.h> 32#include <linux/msi.h>
33#include <linux/htirq.h> 33#include <linux/htirq.h>
34#include <linux/dmar.h> 34#include <linux/dmar.h>
35#include <linux/jiffies.h>
35#ifdef CONFIG_ACPI 36#ifdef CONFIG_ACPI
36#include <acpi/acpi_bus.h> 37#include <acpi/acpi_bus.h>
37#endif 38#endif
39#include <linux/bootmem.h>
38 40
39#include <asm/idle.h> 41#include <asm/idle.h>
40#include <asm/io.h> 42#include <asm/io.h>
@@ -1069,7 +1071,7 @@ void __apicdebuginit print_local_APIC(void * dummy)
1069 v = apic_read(APIC_LVR); 1071 v = apic_read(APIC_LVR);
1070 printk(KERN_INFO "... APIC VERSION: %08x\n", v); 1072 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
1071 ver = GET_APIC_VERSION(v); 1073 ver = GET_APIC_VERSION(v);
1072 maxlvt = get_maxlvt(); 1074 maxlvt = lapic_get_maxlvt();
1073 1075
1074 v = apic_read(APIC_TASKPRI); 1076 v = apic_read(APIC_TASKPRI);
1075 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); 1077 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
@@ -1171,7 +1173,7 @@ void __apicdebuginit print_PIC(void)
1171 1173
1172#endif /* 0 */ 1174#endif /* 0 */
1173 1175
1174static void __init enable_IO_APIC(void) 1176void __init enable_IO_APIC(void)
1175{ 1177{
1176 union IO_APIC_reg_01 reg_01; 1178 union IO_APIC_reg_01 reg_01;
1177 int i8259_apic, i8259_pin; 1179 int i8259_apic, i8259_pin;
@@ -1298,7 +1300,7 @@ static int __init timer_irq_works(void)
1298 */ 1300 */
1299 1301
1300 /* jiffies wrap? */ 1302 /* jiffies wrap? */
1301 if (jiffies - t1 > 4) 1303 if (time_after(jiffies, t1 + 4))
1302 return 1; 1304 return 1;
1303 return 0; 1305 return 0;
1304} 1306}
@@ -1411,7 +1413,7 @@ static void irq_complete_move(unsigned int irq)
1411 if (likely(!cfg->move_in_progress)) 1413 if (likely(!cfg->move_in_progress))
1412 return; 1414 return;
1413 1415
1414 vector = ~get_irq_regs()->orig_rax; 1416 vector = ~get_irq_regs()->orig_ax;
1415 me = smp_processor_id(); 1417 me = smp_processor_id();
1416 if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { 1418 if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
1417 cpumask_t cleanup_mask; 1419 cpumask_t cleanup_mask;
@@ -1438,7 +1440,7 @@ static void ack_apic_level(unsigned int irq)
1438 int do_unmask_irq = 0; 1440 int do_unmask_irq = 0;
1439 1441
1440 irq_complete_move(irq); 1442 irq_complete_move(irq);
1441#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE) 1443#ifdef CONFIG_GENERIC_PENDING_IRQ
1442 /* If we are moving the irq we need to mask it */ 1444 /* If we are moving the irq we need to mask it */
1443 if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) { 1445 if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
1444 do_unmask_irq = 1; 1446 do_unmask_irq = 1;
@@ -1565,7 +1567,7 @@ static struct hw_interrupt_type lapic_irq_type __read_mostly = {
1565 .end = end_lapic_irq, 1567 .end = end_lapic_irq,
1566}; 1568};
1567 1569
1568static void setup_nmi (void) 1570static void __init setup_nmi(void)
1569{ 1571{
1570 /* 1572 /*
1571 * Dirty trick to enable the NMI watchdog ... 1573 * Dirty trick to enable the NMI watchdog ...
@@ -1578,7 +1580,7 @@ static void setup_nmi (void)
1578 */ 1580 */
1579 printk(KERN_INFO "activating NMI Watchdog ..."); 1581 printk(KERN_INFO "activating NMI Watchdog ...");
1580 1582
1581 enable_NMI_through_LVT0(NULL); 1583 enable_NMI_through_LVT0();
1582 1584
1583 printk(" done.\n"); 1585 printk(" done.\n");
1584} 1586}
@@ -1654,7 +1656,7 @@ static inline void unlock_ExtINT_logic(void)
1654 * 1656 *
1655 * FIXME: really need to revamp this for modern platforms only. 1657 * FIXME: really need to revamp this for modern platforms only.
1656 */ 1658 */
1657static inline void check_timer(void) 1659static inline void __init check_timer(void)
1658{ 1660{
1659 struct irq_cfg *cfg = irq_cfg + 0; 1661 struct irq_cfg *cfg = irq_cfg + 0;
1660 int apic1, pin1, apic2, pin2; 1662 int apic1, pin1, apic2, pin2;
@@ -1788,7 +1790,10 @@ __setup("no_timer_check", notimercheck);
1788 1790
1789void __init setup_IO_APIC(void) 1791void __init setup_IO_APIC(void)
1790{ 1792{
1791 enable_IO_APIC(); 1793
1794 /*
1795 * calling enable_IO_APIC() is moved to setup_local_APIC for BP
1796 */
1792 1797
1793 if (acpi_ioapic) 1798 if (acpi_ioapic)
1794 io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ 1799 io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
@@ -2288,3 +2293,92 @@ void __init setup_ioapic_dest(void)
2288} 2293}
2289#endif 2294#endif
2290 2295
2296#define IOAPIC_RESOURCE_NAME_SIZE 11
2297
2298static struct resource *ioapic_resources;
2299
2300static struct resource * __init ioapic_setup_resources(void)
2301{
2302 unsigned long n;
2303 struct resource *res;
2304 char *mem;
2305 int i;
2306
2307 if (nr_ioapics <= 0)
2308 return NULL;
2309
2310 n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
2311 n *= nr_ioapics;
2312
2313 mem = alloc_bootmem(n);
2314 res = (void *)mem;
2315
2316 if (mem != NULL) {
2317 memset(mem, 0, n);
2318 mem += sizeof(struct resource) * nr_ioapics;
2319
2320 for (i = 0; i < nr_ioapics; i++) {
2321 res[i].name = mem;
2322 res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
2323 sprintf(mem, "IOAPIC %u", i);
2324 mem += IOAPIC_RESOURCE_NAME_SIZE;
2325 }
2326 }
2327
2328 ioapic_resources = res;
2329
2330 return res;
2331}
2332
2333void __init ioapic_init_mappings(void)
2334{
2335 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
2336 struct resource *ioapic_res;
2337 int i;
2338
2339 ioapic_res = ioapic_setup_resources();
2340 for (i = 0; i < nr_ioapics; i++) {
2341 if (smp_found_config) {
2342 ioapic_phys = mp_ioapics[i].mpc_apicaddr;
2343 } else {
2344 ioapic_phys = (unsigned long)
2345 alloc_bootmem_pages(PAGE_SIZE);
2346 ioapic_phys = __pa(ioapic_phys);
2347 }
2348 set_fixmap_nocache(idx, ioapic_phys);
2349 apic_printk(APIC_VERBOSE,
2350 "mapped IOAPIC to %016lx (%016lx)\n",
2351 __fix_to_virt(idx), ioapic_phys);
2352 idx++;
2353
2354 if (ioapic_res != NULL) {
2355 ioapic_res->start = ioapic_phys;
2356 ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
2357 ioapic_res++;
2358 }
2359 }
2360}
2361
2362static int __init ioapic_insert_resources(void)
2363{
2364 int i;
2365 struct resource *r = ioapic_resources;
2366
2367 if (!r) {
2368 printk(KERN_ERR
2369 "IO APIC resources could be not be allocated.\n");
2370 return -1;
2371 }
2372
2373 for (i = 0; i < nr_ioapics; i++) {
2374 insert_resource(&iomem_resource, r);
2375 r++;
2376 }
2377
2378 return 0;
2379}
2380
2381/* Insert the IO APIC resources after PCI initialization has occurred to handle
2382 * IO APICs that are mapped in on a BAR in PCI space. */
2383late_initcall(ioapic_insert_resources);
2384
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
new file mode 100644
index 000000000000..bd49321034db
--- /dev/null
+++ b/arch/x86/kernel/io_delay.c
@@ -0,0 +1,114 @@
1/*
2 * I/O delay strategies for inb_p/outb_p
3 *
4 * Allow for a DMI based override of port 0x80, needed for certain HP laptops
5 * and possibly other systems. Also allow for the gradual elimination of
6 * outb_p/inb_p API uses.
7 */
8#include <linux/kernel.h>
9#include <linux/module.h>
10#include <linux/init.h>
11#include <linux/delay.h>
12#include <linux/dmi.h>
13#include <asm/io.h>
14
15int io_delay_type __read_mostly = CONFIG_DEFAULT_IO_DELAY_TYPE;
16EXPORT_SYMBOL_GPL(io_delay_type);
17
18static int __initdata io_delay_override;
19
20/*
21 * Paravirt wants native_io_delay to be a constant.
22 */
23void native_io_delay(void)
24{
25 switch (io_delay_type) {
26 default:
27 case CONFIG_IO_DELAY_TYPE_0X80:
28 asm volatile ("outb %al, $0x80");
29 break;
30 case CONFIG_IO_DELAY_TYPE_0XED:
31 asm volatile ("outb %al, $0xed");
32 break;
33 case CONFIG_IO_DELAY_TYPE_UDELAY:
34 /*
35 * 2 usecs is an upper-bound for the outb delay but
36 * note that udelay doesn't have the bus-level
37 * side-effects that outb does, nor does udelay() have
38 * precise timings during very early bootup (the delays
39 * are shorter until calibrated):
40 */
41 udelay(2);
42 case CONFIG_IO_DELAY_TYPE_NONE:
43 break;
44 }
45}
46EXPORT_SYMBOL(native_io_delay);
47
48static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id)
49{
50 if (io_delay_type == CONFIG_IO_DELAY_TYPE_0X80) {
51 printk(KERN_NOTICE "%s: using 0xed I/O delay port\n",
52 id->ident);
53 io_delay_type = CONFIG_IO_DELAY_TYPE_0XED;
54 }
55
56 return 0;
57}
58
59/*
60 * Quirk table for systems that misbehave (lock up, etc.) if port
61 * 0x80 is used:
62 */
63static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = {
64 {
65 .callback = dmi_io_delay_0xed_port,
66 .ident = "Compaq Presario V6000",
67 .matches = {
68 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
69 DMI_MATCH(DMI_BOARD_NAME, "30B7")
70 }
71 },
72 {
73 .callback = dmi_io_delay_0xed_port,
74 .ident = "HP Pavilion dv9000z",
75 .matches = {
76 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
77 DMI_MATCH(DMI_BOARD_NAME, "30B9")
78 }
79 },
80 {
81 .callback = dmi_io_delay_0xed_port,
82 .ident = "HP Pavilion tx1000",
83 .matches = {
84 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
85 DMI_MATCH(DMI_BOARD_NAME, "30BF")
86 }
87 },
88 { }
89};
90
91void __init io_delay_init(void)
92{
93 if (!io_delay_override)
94 dmi_check_system(io_delay_0xed_port_dmi_table);
95}
96
97static int __init io_delay_param(char *s)
98{
99 if (!strcmp(s, "0x80"))
100 io_delay_type = CONFIG_IO_DELAY_TYPE_0X80;
101 else if (!strcmp(s, "0xed"))
102 io_delay_type = CONFIG_IO_DELAY_TYPE_0XED;
103 else if (!strcmp(s, "udelay"))
104 io_delay_type = CONFIG_IO_DELAY_TYPE_UDELAY;
105 else if (!strcmp(s, "none"))
106 io_delay_type = CONFIG_IO_DELAY_TYPE_NONE;
107 else
108 return -EINVAL;
109
110 io_delay_override = 1;
111 return 0;
112}
113
114early_param("io_delay", io_delay_param);
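
The quirk table above matches machines by their DMI board vendor and board name, and the io_delay= early_param provides a manual override. A quick user-space check (assuming the usual /sys/class/dmi/id/ sysfs export of the DMI data) of the same strings the table compares against:

/* Print the DMI strings that io_delay_0xed_port_dmi_table matches on. */
#include <stdio.h>

static void show(const char *path)
{
	char buf[128];
	FILE *f = fopen(path, "r");

	if (!f) {
		printf("%s: <unavailable>\n", path);
		return;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%s: %s", path, buf);	/* sysfs strings already end in '\n' */
	fclose(f);
}

int main(void)
{
	show("/sys/class/dmi/id/board_vendor");	/* compared against DMI_BOARD_VENDOR */
	show("/sys/class/dmi/id/board_name");	/* compared against DMI_BOARD_NAME */
	return 0;
}

Booting with io_delay=0xed (handled by io_delay_param() above) forces the same behaviour without a table entry.
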
diff --git a/arch/x86/kernel/ioport_32.c b/arch/x86/kernel/ioport.c
index 4ed48dc8df1e..50e5e4a31c85 100644
--- a/arch/x86/kernel/ioport_32.c
+++ b/arch/x86/kernel/ioport.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * This contains the io-permission bitmap code - written by obz, with changes 2 * This contains the io-permission bitmap code - written by obz, with changes
3 * by Linus. 3 * by Linus. 32/64 bits code unification by Miguel Botón.
4 */ 4 */
5 5
6#include <linux/sched.h> 6#include <linux/sched.h>
@@ -16,49 +16,27 @@
16#include <linux/syscalls.h> 16#include <linux/syscalls.h>
17 17
18/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ 18/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
19static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) 19static void set_bitmap(unsigned long *bitmap, unsigned int base,
20 unsigned int extent, int new_value)
20{ 21{
21 unsigned long mask; 22 unsigned int i;
22 unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
23 unsigned int low_index = base & (BITS_PER_LONG-1);
24 int length = low_index + extent;
25
26 if (low_index != 0) {
27 mask = (~0UL << low_index);
28 if (length < BITS_PER_LONG)
29 mask &= ~(~0UL << length);
30 if (new_value)
31 *bitmap_base++ |= mask;
32 else
33 *bitmap_base++ &= ~mask;
34 length -= BITS_PER_LONG;
35 }
36
37 mask = (new_value ? ~0UL : 0UL);
38 while (length >= BITS_PER_LONG) {
39 *bitmap_base++ = mask;
40 length -= BITS_PER_LONG;
41 }
42 23
43 if (length > 0) { 24 for (i = base; i < base + extent; i++) {
44 mask = ~(~0UL << length);
45 if (new_value) 25 if (new_value)
46 *bitmap_base++ |= mask; 26 __set_bit(i, bitmap);
47 else 27 else
48 *bitmap_base++ &= ~mask; 28 __clear_bit(i, bitmap);
49 } 29 }
50} 30}
51 31
52
53/* 32/*
54 * this changes the io permissions bitmap in the current task. 33 * this changes the io permissions bitmap in the current task.
55 */ 34 */
56asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) 35asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
57{ 36{
58 unsigned long i, max_long, bytes, bytes_updated;
59 struct thread_struct * t = &current->thread; 37 struct thread_struct * t = &current->thread;
60 struct tss_struct * tss; 38 struct tss_struct * tss;
61 unsigned long *bitmap; 39 unsigned int i, max_long, bytes, bytes_updated;
62 40
63 if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) 41 if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
64 return -EINVAL; 42 return -EINVAL;
@@ -71,7 +49,8 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
71 * this is why we delay this operation until now: 49 * this is why we delay this operation until now:
72 */ 50 */
73 if (!t->io_bitmap_ptr) { 51 if (!t->io_bitmap_ptr) {
74 bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); 52 unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
53
75 if (!bitmap) 54 if (!bitmap)
76 return -ENOMEM; 55 return -ENOMEM;
77 56
@@ -100,11 +79,12 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
100 if (t->io_bitmap_ptr[i] != ~0UL) 79 if (t->io_bitmap_ptr[i] != ~0UL)
101 max_long = i; 80 max_long = i;
102 81
103 bytes = (max_long + 1) * sizeof(long); 82 bytes = (max_long + 1) * sizeof(unsigned long);
104 bytes_updated = max(bytes, t->io_bitmap_max); 83 bytes_updated = max(bytes, t->io_bitmap_max);
105 84
106 t->io_bitmap_max = bytes; 85 t->io_bitmap_max = bytes;
107 86
87#ifdef CONFIG_X86_32
108 /* 88 /*
109 * Sets the lazy trigger so that the next I/O operation will 89 * Sets the lazy trigger so that the next I/O operation will
110 * reload the correct bitmap. 90 * reload the correct bitmap.
@@ -113,6 +93,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
113 */ 93 */
114 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; 94 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
115 tss->io_bitmap_owner = NULL; 95 tss->io_bitmap_owner = NULL;
96#else
97 /* Update the TSS: */
98 memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
99#endif
116 100
117 put_cpu(); 101 put_cpu();
118 102
@@ -124,18 +108,14 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
124 * beyond the 0x3ff range: to get the full 65536 ports bitmapped 108 * beyond the 0x3ff range: to get the full 65536 ports bitmapped
125 * you'd need 8kB of bitmaps/process, which is a bit excessive. 109 * you'd need 8kB of bitmaps/process, which is a bit excessive.
126 * 110 *
127 * Here we just change the eflags value on the stack: we allow 111 * Here we just change the flags value on the stack: we allow
128 * only the super-user to do it. This depends on the stack-layout 112 * only the super-user to do it. This depends on the stack-layout
129 * on system-call entry - see also fork() and the signal handling 113 * on system-call entry - see also fork() and the signal handling
130 * code. 114 * code.
131 */ 115 */
132 116static int do_iopl(unsigned int level, struct pt_regs *regs)
133asmlinkage long sys_iopl(unsigned long unused)
134{ 117{
135 volatile struct pt_regs * regs = (struct pt_regs *) &unused; 118 unsigned int old = (regs->flags >> 12) & 3;
136 unsigned int level = regs->ebx;
137 unsigned int old = (regs->eflags >> 12) & 3;
138 struct thread_struct *t = &current->thread;
139 119
140 if (level > 3) 120 if (level > 3)
141 return -EINVAL; 121 return -EINVAL;
@@ -144,8 +124,31 @@ asmlinkage long sys_iopl(unsigned long unused)
144 if (!capable(CAP_SYS_RAWIO)) 124 if (!capable(CAP_SYS_RAWIO))
145 return -EPERM; 125 return -EPERM;
146 } 126 }
127 regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12);
128
129 return 0;
130}
131
132#ifdef CONFIG_X86_32
133asmlinkage long sys_iopl(unsigned long regsp)
134{
135 struct pt_regs *regs = (struct pt_regs *)&regsp;
136 unsigned int level = regs->bx;
137 struct thread_struct *t = &current->thread;
138 int rc;
139
140 rc = do_iopl(level, regs);
141 if (rc < 0)
142 goto out;
143
147 t->iopl = level << 12; 144 t->iopl = level << 12;
148 regs->eflags = (regs->eflags & ~X86_EFLAGS_IOPL) | t->iopl;
149 set_iopl_mask(t->iopl); 145 set_iopl_mask(t->iopl);
150 return 0; 146out:
147 return rc;
148}
149#else
150asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
151{
152 return do_iopl(level, regs);
151} 153}
154#endif
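
The unified sys_ioperm()/sys_iopl() shown above are reached from user space through glibc's ioperm(3) and iopl(3) wrappers. A minimal sketch (needs root/CAP_SYS_RAWIO; port 0x80 is the traditional POST-code debug port, generally safe to write although the io_delay.c quirks above show some machines dislike it):

/* Minimal user of the per-task I/O bitmap (ioperm) and IOPL (iopl) paths. */
#include <stdio.h>
#include <sys/io.h>

int main(void)
{
	if (ioperm(0x80, 1, 1)) {	/* request access to a single port */
		perror("ioperm");
		return 1;
	}
	outb(0x55, 0x80);		/* exercises the per-task I/O bitmap */

	if (iopl(3))			/* raise IOPL: access to all ports */
		perror("iopl");
	return 0;
}
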
diff --git a/arch/x86/kernel/ioport_64.c b/arch/x86/kernel/ioport_64.c
deleted file mode 100644
index 5f62fad64dab..000000000000
--- a/arch/x86/kernel/ioport_64.c
+++ /dev/null
@@ -1,117 +0,0 @@
1/*
2 * This contains the io-permission bitmap code - written by obz, with changes
3 * by Linus.
4 */
5
6#include <linux/sched.h>
7#include <linux/kernel.h>
8#include <linux/capability.h>
9#include <linux/errno.h>
10#include <linux/types.h>
11#include <linux/ioport.h>
12#include <linux/smp.h>
13#include <linux/stddef.h>
14#include <linux/slab.h>
15#include <linux/thread_info.h>
16#include <linux/syscalls.h>
17
18/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
19static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
20{
21 int i;
22 if (new_value)
23 for (i = base; i < base + extent; i++)
24 __set_bit(i, bitmap);
25 else
26 for (i = base; i < base + extent; i++)
27 clear_bit(i, bitmap);
28}
29
30/*
31 * this changes the io permissions bitmap in the current task.
32 */
33asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
34{
35 unsigned int i, max_long, bytes, bytes_updated;
36 struct thread_struct * t = &current->thread;
37 struct tss_struct * tss;
38 unsigned long *bitmap;
39
40 if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
41 return -EINVAL;
42 if (turn_on && !capable(CAP_SYS_RAWIO))
43 return -EPERM;
44
45 /*
46 * If it's the first ioperm() call in this thread's lifetime, set the
47 * IO bitmap up. ioperm() is much less timing critical than clone(),
48 * this is why we delay this operation until now:
49 */
50 if (!t->io_bitmap_ptr) {
51 bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
52 if (!bitmap)
53 return -ENOMEM;
54
55 memset(bitmap, 0xff, IO_BITMAP_BYTES);
56 t->io_bitmap_ptr = bitmap;
57 set_thread_flag(TIF_IO_BITMAP);
58 }
59
60 /*
61 * do it in the per-thread copy and in the TSS ...
62 *
63 * Disable preemption via get_cpu() - we must not switch away
64 * because the ->io_bitmap_max value must match the bitmap
65 * contents:
66 */
67 tss = &per_cpu(init_tss, get_cpu());
68
69 set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
70
71 /*
72 * Search for a (possibly new) maximum. This is simple and stupid,
73 * to keep it obviously correct:
74 */
75 max_long = 0;
76 for (i = 0; i < IO_BITMAP_LONGS; i++)
77 if (t->io_bitmap_ptr[i] != ~0UL)
78 max_long = i;
79
80 bytes = (max_long + 1) * sizeof(long);
81 bytes_updated = max(bytes, t->io_bitmap_max);
82
83 t->io_bitmap_max = bytes;
84
85 /* Update the TSS: */
86 memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
87
88 put_cpu();
89
90 return 0;
91}
92
93/*
94 * sys_iopl has to be used when you want to access the IO ports
95 * beyond the 0x3ff range: to get the full 65536 ports bitmapped
96 * you'd need 8kB of bitmaps/process, which is a bit excessive.
97 *
98 * Here we just change the eflags value on the stack: we allow
99 * only the super-user to do it. This depends on the stack-layout
100 * on system-call entry - see also fork() and the signal handling
101 * code.
102 */
103
104asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
105{
106 unsigned int old = (regs->eflags >> 12) & 3;
107
108 if (level > 3)
109 return -EINVAL;
110 /* Trying to gain more privileges? */
111 if (level > old) {
112 if (!capable(CAP_SYS_RAWIO))
113 return -EPERM;
114 }
115 regs->eflags = (regs->eflags &~ X86_EFLAGS_IOPL) | (level << 12);
116 return 0;
117}
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index d3fde94f7345..cef054b09d27 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -66,11 +66,11 @@ static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
66 * SMP cross-CPU interrupts have their own specific 66 * SMP cross-CPU interrupts have their own specific
67 * handlers). 67 * handlers).
68 */ 68 */
69fastcall unsigned int do_IRQ(struct pt_regs *regs) 69unsigned int do_IRQ(struct pt_regs *regs)
70{ 70{
71 struct pt_regs *old_regs; 71 struct pt_regs *old_regs;
72 /* high bit used in ret_from_ code */ 72 /* high bit used in ret_from_ code */
73 int irq = ~regs->orig_eax; 73 int irq = ~regs->orig_ax;
74 struct irq_desc *desc = irq_desc + irq; 74 struct irq_desc *desc = irq_desc + irq;
75#ifdef CONFIG_4KSTACKS 75#ifdef CONFIG_4KSTACKS
76 union irq_ctx *curctx, *irqctx; 76 union irq_ctx *curctx, *irqctx;
@@ -88,13 +88,13 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs)
88#ifdef CONFIG_DEBUG_STACKOVERFLOW 88#ifdef CONFIG_DEBUG_STACKOVERFLOW
89 /* Debugging check for stack overflow: is there less than 1KB free? */ 89 /* Debugging check for stack overflow: is there less than 1KB free? */
90 { 90 {
91 long esp; 91 long sp;
92 92
93 __asm__ __volatile__("andl %%esp,%0" : 93 __asm__ __volatile__("andl %%esp,%0" :
94 "=r" (esp) : "0" (THREAD_SIZE - 1)); 94 "=r" (sp) : "0" (THREAD_SIZE - 1));
95 if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) { 95 if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) {
96 printk("do_IRQ: stack overflow: %ld\n", 96 printk("do_IRQ: stack overflow: %ld\n",
97 esp - sizeof(struct thread_info)); 97 sp - sizeof(struct thread_info));
98 dump_stack(); 98 dump_stack();
99 } 99 }
100 } 100 }
@@ -112,7 +112,7 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs)
112 * current stack (which is the irq stack already after all) 112 * current stack (which is the irq stack already after all)
113 */ 113 */
114 if (curctx != irqctx) { 114 if (curctx != irqctx) {
115 int arg1, arg2, ebx; 115 int arg1, arg2, bx;
116 116
117 /* build the stack frame on the IRQ stack */ 117 /* build the stack frame on the IRQ stack */
118 isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); 118 isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
@@ -128,10 +128,10 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs)
128 (curctx->tinfo.preempt_count & SOFTIRQ_MASK); 128 (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
129 129
130 asm volatile( 130 asm volatile(
131 " xchgl %%ebx,%%esp \n" 131 " xchgl %%ebx,%%esp \n"
132 " call *%%edi \n" 132 " call *%%edi \n"
133 " movl %%ebx,%%esp \n" 133 " movl %%ebx,%%esp \n"
134 : "=a" (arg1), "=d" (arg2), "=b" (ebx) 134 : "=a" (arg1), "=d" (arg2), "=b" (bx)
135 : "0" (irq), "1" (desc), "2" (isp), 135 : "0" (irq), "1" (desc), "2" (isp),
136 "D" (desc->handle_irq) 136 "D" (desc->handle_irq)
137 : "memory", "cc" 137 : "memory", "cc"
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 6b5c730d67b9..3aac15466a91 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -20,6 +20,26 @@
20 20
21atomic_t irq_err_count; 21atomic_t irq_err_count;
22 22
23/*
24 * 'what should we do if we get a hw irq event on an illegal vector'.
25 * each architecture has to answer this themselves.
26 */
27void ack_bad_irq(unsigned int irq)
28{
29 printk(KERN_WARNING "unexpected IRQ trap at vector %02x\n", irq);
30 /*
31 * Currently unexpected vectors happen only on SMP and APIC.
32 * We _must_ ack these because every local APIC has only N
33 * irq slots per priority level, and a 'hanging, unacked' IRQ
34 * holds up an irq slot - in excessive cases (when multiple
35 * unexpected vectors occur) that might lock up the APIC
36 * completely.
37 * But don't ack when the APIC is disabled. -AK
38 */
39 if (!disable_apic)
40 ack_APIC_irq();
41}
42
23#ifdef CONFIG_DEBUG_STACKOVERFLOW 43#ifdef CONFIG_DEBUG_STACKOVERFLOW
24/* 44/*
25 * Probabilistic stack overflow check: 45 * Probabilistic stack overflow check:
@@ -33,11 +53,11 @@ static inline void stack_overflow_check(struct pt_regs *regs)
33 u64 curbase = (u64)task_stack_page(current); 53 u64 curbase = (u64)task_stack_page(current);
34 static unsigned long warned = -60*HZ; 54 static unsigned long warned = -60*HZ;
35 55
36 if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE && 56 if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE &&
37 regs->rsp < curbase + sizeof(struct thread_info) + 128 && 57 regs->sp < curbase + sizeof(struct thread_info) + 128 &&
38 time_after(jiffies, warned + 60*HZ)) { 58 time_after(jiffies, warned + 60*HZ)) {
39 printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n", 59 printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
40 current->comm, curbase, regs->rsp); 60 current->comm, curbase, regs->sp);
41 show_stack(NULL,NULL); 61 show_stack(NULL,NULL);
42 warned = jiffies; 62 warned = jiffies;
43 } 63 }
@@ -142,7 +162,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
142 struct pt_regs *old_regs = set_irq_regs(regs); 162 struct pt_regs *old_regs = set_irq_regs(regs);
143 163
144 /* high bit used in ret_from_ code */ 164 /* high bit used in ret_from_ code */
145 unsigned vector = ~regs->orig_rax; 165 unsigned vector = ~regs->orig_ax;
146 unsigned irq; 166 unsigned irq;
147 167
148 exit_idle(); 168 exit_idle();
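
do_IRQ() recovers the vector as ~regs->orig_ax because the interrupt stubs push the vector's one's complement; stored that way the value is negative, which is what the "high bit used in ret_from_ code" comment refers to. A tiny round trip showing the encoding:

/* One's-complement round trip of the vector stored in orig_ax. */
#include <stdio.h>

int main(void)
{
	unsigned long vector = 0x31;		/* some external interrupt vector */
	unsigned long orig_ax = ~vector;	/* what the interrupt stub stores */

	printf("stored   orig_ax = %#lx (negative as a signed value)\n", orig_ax);
	printf("restored vector  = %#lx\n", ~orig_ax);	/* what do_IRQ computes */
	return 0;
}
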
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
new file mode 100644
index 000000000000..73354302fda7
--- /dev/null
+++ b/arch/x86/kernel/kdebugfs.c
@@ -0,0 +1,65 @@
1/*
2 * Architecture specific debugfs files
3 *
4 * Copyright (C) 2007, Intel Corp.
5 * Huang Ying <ying.huang@intel.com>
6 *
7 * This file is released under the GPLv2.
8 */
9
10#include <linux/debugfs.h>
11#include <linux/stat.h>
12#include <linux/init.h>
13
14#include <asm/setup.h>
15
16#ifdef CONFIG_DEBUG_BOOT_PARAMS
17static struct debugfs_blob_wrapper boot_params_blob = {
18 .data = &boot_params,
19 .size = sizeof(boot_params),
20};
21
22static int __init boot_params_kdebugfs_init(void)
23{
24 int error;
25 struct dentry *dbp, *version, *data;
26
27 dbp = debugfs_create_dir("boot_params", NULL);
28 if (!dbp) {
29 error = -ENOMEM;
30 goto err_return;
31 }
32 version = debugfs_create_x16("version", S_IRUGO, dbp,
33 &boot_params.hdr.version);
34 if (!version) {
35 error = -ENOMEM;
36 goto err_dir;
37 }
38 data = debugfs_create_blob("data", S_IRUGO, dbp,
39 &boot_params_blob);
40 if (!data) {
41 error = -ENOMEM;
42 goto err_version;
43 }
44 return 0;
45err_version:
46 debugfs_remove(version);
47err_dir:
48 debugfs_remove(dbp);
49err_return:
50 return error;
51}
52#endif
53
54static int __init arch_kdebugfs_init(void)
55{
56 int error = 0;
57
58#ifdef CONFIG_DEBUG_BOOT_PARAMS
59 error = boot_params_kdebugfs_init();
60#endif
61
62 return error;
63}
64
65arch_initcall(arch_kdebugfs_init);
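
The debugfs entries registered above expose the boot protocol version and the raw struct boot_params blob. Assuming debugfs is mounted at /sys/kernel/debug and CONFIG_DEBUG_BOOT_PARAMS is enabled, they read back like ordinary files:

/* Read the files created by boot_params_kdebugfs_init(). */
#include <stdio.h>

int main(void)
{
	char version[32];
	FILE *f = fopen("/sys/kernel/debug/boot_params/version", "r");

	if (!f) {
		perror("boot_params/version");
		return 1;
	}
	if (fgets(version, sizeof(version), f))
		printf("boot protocol version: %s\n", version);
	fclose(f);

	/* the raw struct boot_params blob sits next to it in "data" */
	return 0;
}
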
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
new file mode 100644
index 000000000000..a99e764fd66a
--- /dev/null
+++ b/arch/x86/kernel/kprobes.c
@@ -0,0 +1,1066 @@
1/*
2 * Kernel Probes (KProbes)
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2002, 2004
19 *
20 * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
21 * Probes initial implementation ( includes contributions from
22 * Rusty Russell).
23 * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
24 * interface to access function arguments.
25 * 2004-Oct Jim Keniston <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
26 * <prasanna@in.ibm.com> adapted for x86_64 from i386.
27 * 2005-Mar Roland McGrath <roland@redhat.com>
28 * Fixed to handle %rip-relative addressing mode correctly.
29 * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston
30 * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
31 * <prasanna@in.ibm.com> added function-return probes.
32 * 2005-May Rusty Lynch <rusty.lynch@intel.com>
33 * Added function return probes functionality
34 * 2006-Feb Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added
35 * kprobe-booster and kretprobe-booster for i386.
36 * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster
37 * and kretprobe-booster for x86-64
38 * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven
39 * <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com>
40 * unified x86 kprobes code.
41 */
42
43#include <linux/kprobes.h>
44#include <linux/ptrace.h>
45#include <linux/string.h>
46#include <linux/slab.h>
47#include <linux/hardirq.h>
48#include <linux/preempt.h>
49#include <linux/module.h>
50#include <linux/kdebug.h>
51
52#include <asm/cacheflush.h>
53#include <asm/desc.h>
54#include <asm/pgtable.h>
55#include <asm/uaccess.h>
56#include <asm/alternative.h>
57
58void jprobe_return_end(void);
59
60DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
61DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
62
63#ifdef CONFIG_X86_64
64#define stack_addr(regs) ((unsigned long *)regs->sp)
65#else
66/*
67 * "&regs->sp" looks wrong, but it's correct for x86_32. x86_32 CPUs
68 * don't save the ss and esp registers if the CPU is already in kernel
69 * mode when it traps. So for kprobes, regs->sp and regs->ss are not
70 * the [nonexistent] saved stack pointer and ss register, but rather
71 * the top 8 bytes of the pre-int3 stack. So &regs->sp happens to
72 * point to the top of the pre-int3 stack.
73 */
74#define stack_addr(regs) ((unsigned long *)&regs->sp)
75#endif
76
77#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
78 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
79 (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
80 (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
81 (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
82 << (row % 32))
83 /*
 84 * Undefined/reserved opcodes, conditional jumps, opcode extension
 85 * groups, and some special opcodes cannot be boosted.
86 */
87static const u32 twobyte_is_boostable[256 / 32] = {
88 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
89 /* ---------------------------------------------- */
90 W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */
91 W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 10 */
92 W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 20 */
93 W(0x30, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
94 W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
95 W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
96 W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1) | /* 60 */
97 W(0x70, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
98 W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 80 */
99 W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
100 W(0xa0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* a0 */
101 W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) , /* b0 */
102 W(0xc0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
103 W(0xd0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) , /* d0 */
104 W(0xe0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* e0 */
105 W(0xf0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0) /* f0 */
106 /* ----------------------------------------------- */
107 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
108};
109static const u32 onebyte_has_modrm[256 / 32] = {
110 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
111 /* ----------------------------------------------- */
112 W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */
113 W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */
114 W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */
115 W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */
116 W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
117 W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
118 W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */
119 W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */
120 W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
121 W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */
122 W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */
123 W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */
124 W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */
125 W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
126 W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */
127 W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) /* f0 */
128 /* ----------------------------------------------- */
129 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
130};
131static const u32 twobyte_has_modrm[256 / 32] = {
132 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
133 /* ----------------------------------------------- */
134 W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */
135 W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */
136 W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */
137 W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */
138 W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */
139 W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */
140 W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */
141 W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */
142 W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */
143 W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */
144 W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */
145 W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */
146 W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */
147 W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */
148 W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */
149 W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* ff */
150 /* ----------------------------------------------- */
151 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
152};
153#undef W
154
155struct kretprobe_blackpoint kretprobe_blacklist[] = {
156 {"__switch_to", }, /* This function switches only current task, but
157 doesn't switch kernel stack.*/
158 {NULL, NULL} /* Terminator */
159};
160const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
161
162/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
163static void __kprobes set_jmp_op(void *from, void *to)
164{
165 struct __arch_jmp_op {
166 char op;
167 s32 raddr;
168 } __attribute__((packed)) * jop;
169 jop = (struct __arch_jmp_op *)from;
170 jop->raddr = (s32)((long)(to) - ((long)(from) + 5));
171 jop->op = RELATIVEJUMP_INSTRUCTION;
172}
173
174/*
175 * Check for the REX prefix which can only exist on X86_64
176 * X86_32 always returns 0
177 */
178static int __kprobes is_REX_prefix(kprobe_opcode_t *insn)
179{
180#ifdef CONFIG_X86_64
181 if ((*insn & 0xf0) == 0x40)
182 return 1;
183#endif
184 return 0;
185}
186
187/*
188 * Returns non-zero if opcode is boostable.
189 * RIP-relative instructions are adjusted at copy time in 64-bit mode.
190 */
191static int __kprobes can_boost(kprobe_opcode_t *opcodes)
192{
193 kprobe_opcode_t opcode;
194 kprobe_opcode_t *orig_opcodes = opcodes;
195
196retry:
197 if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
198 return 0;
199 opcode = *(opcodes++);
200
201 /* 2nd-byte opcode */
202 if (opcode == 0x0f) {
203 if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
204 return 0;
205 return test_bit(*opcodes,
206 (unsigned long *)twobyte_is_boostable);
207 }
208
209 switch (opcode & 0xf0) {
210#ifdef CONFIG_X86_64
211 case 0x40:
212 goto retry; /* REX prefix is boostable */
213#endif
214 case 0x60:
215 if (0x63 < opcode && opcode < 0x67)
216 goto retry; /* prefixes */
217 /* can't boost Address-size override and bound */
218 return (opcode != 0x62 && opcode != 0x67);
219 case 0x70:
220 return 0; /* can't boost conditional jump */
221 case 0xc0:
222 /* can't boost software-interruptions */
223 return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf;
224 case 0xd0:
225 /* can boost AA* and XLAT */
226 return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7);
227 case 0xe0:
228 /* can boost in/out and absolute jmps */
229 return ((opcode & 0x04) || opcode == 0xea);
230 case 0xf0:
231 if ((opcode & 0x0c) == 0 && opcode != 0xf1)
232 goto retry; /* lock/rep(ne) prefix */
233 /* clear and set flags are boostable */
234 return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe));
235 default:
236 /* segment override prefixes are boostable */
237 if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e)
238 goto retry; /* prefixes */
239 /* CS override prefix and call are not boostable */
240 return (opcode != 0x2e && opcode != 0x9a);
241 }
242}
243
244/*
245 * Returns non-zero if opcode modifies the interrupt flag.
246 */
247static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
248{
249 switch (*insn) {
250 case 0xfa: /* cli */
251 case 0xfb: /* sti */
252 case 0xcf: /* iret/iretd */
253 case 0x9d: /* popf/popfd */
254 return 1;
255 }
256
257 /*
258 * On X86_64, 0x40-0x4f are REX prefixes, so we need to look
259 * at the next byte instead; of course, we must not recurse infinitely.
260 */
261 if (is_REX_prefix(insn))
262 return is_IF_modifier(++insn);
263
264 return 0;
265}
266
267/*
268 * Adjust the displacement if the instruction uses the %rip-relative
269 * addressing mode.
270 * The displacement is rewritten so that the copied instruction still
271 * references the same target as the original one.
272 * Only applicable to 64-bit x86.
273 */
274static void __kprobes fix_riprel(struct kprobe *p)
275{
276#ifdef CONFIG_X86_64
277 u8 *insn = p->ainsn.insn;
278 s64 disp;
279 int need_modrm;
280
281 /* Skip legacy instruction prefixes. */
282 while (1) {
283 switch (*insn) {
284 case 0x66:
285 case 0x67:
286 case 0x2e:
287 case 0x3e:
288 case 0x26:
289 case 0x64:
290 case 0x65:
291 case 0x36:
292 case 0xf0:
293 case 0xf3:
294 case 0xf2:
295 ++insn;
296 continue;
297 }
298 break;
299 }
300
301 /* Skip REX instruction prefix. */
302 if (is_REX_prefix(insn))
303 ++insn;
304
305 if (*insn == 0x0f) {
306 /* Two-byte opcode. */
307 ++insn;
308 need_modrm = test_bit(*insn,
309 (unsigned long *)twobyte_has_modrm);
310 } else
311 /* One-byte opcode. */
312 need_modrm = test_bit(*insn,
313 (unsigned long *)onebyte_has_modrm);
314
315 if (need_modrm) {
316 u8 modrm = *++insn;
317 if ((modrm & 0xc7) == 0x05) {
318 /* %rip+disp32 addressing mode */
319 /* Displacement follows ModRM byte. */
320 ++insn;
321 /*
322 * The copied instruction uses the %rip-relative
323 * addressing mode. Adjust the displacement for the
324 * difference between the original location of this
325 * instruction and the location of the copy that will
326 * actually be run. The tricky bit here is making sure
327 * that the sign extension happens correctly in this
328 * calculation, since we need a signed 32-bit result to
329 * be sign-extended to 64 bits when it's added to the
330 * %rip value and yield the same 64-bit result that the
331 * sign-extension of the original signed 32-bit
332 * displacement would have given.
333 */
334 disp = (u8 *) p->addr + *((s32 *) insn) -
335 (u8 *) p->ainsn.insn;
336 BUG_ON((s64) (s32) disp != disp); /* Sanity check. */
337 *(s32 *)insn = (s32) disp;
338 }
339 }
340#endif
341}
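
To make the displacement fix-up above concrete, here is a small stand-alone sketch of the same calculation; the addresses are hypothetical stand-ins for p->addr and p->ainsn.insn (illustrative only, not from this patch):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* hypothetical addresses for the original and the copied instruction */
	unsigned long orig_addr = 0xffffffff81000000UL;	/* p->addr */
	unsigned long copy_addr = 0xffffffff82000000UL;	/* p->ainsn.insn */
	int32_t orig_disp = 0x100;			/* disp32 in the original */

	/*
	 * The copy must still reference the same absolute target:
	 *   orig_addr + orig_disp == copy_addr + new_disp
	 */
	long disp = (long)(orig_addr - copy_addr) + orig_disp;

	if ((long)(int32_t)disp != disp)
		printf("target out of rel32 range, cannot relocate\n");
	else
		printf("new disp32 = %ld\n", disp);	/* -16776960 here */
	return 0;
}
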
342
343static void __kprobes arch_copy_kprobe(struct kprobe *p)
344{
345 memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
346
347 fix_riprel(p);
348
349 if (can_boost(p->addr))
350 p->ainsn.boostable = 0;
351 else
352 p->ainsn.boostable = -1;
353
354 p->opcode = *p->addr;
355}
356
357int __kprobes arch_prepare_kprobe(struct kprobe *p)
358{
359 /* insn: must be on special executable page on x86. */
360 p->ainsn.insn = get_insn_slot();
361 if (!p->ainsn.insn)
362 return -ENOMEM;
363 arch_copy_kprobe(p);
364 return 0;
365}
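
For context, arch_prepare_kprobe() is invoked from the generic register_kprobe() path. A minimal module sketch exercising it (the probed symbol "do_fork" is only an example; error handling is omitted):

#include <linux/module.h>
#include <linux/kprobes.h>

static int my_pre(struct kprobe *p, struct pt_regs *regs)
{
	printk(KERN_INFO "kprobe hit at %p\n", p->addr);
	return 0;			/* 0: go on to single-step the copy */
}

static struct kprobe kp = {
	.symbol_name	= "do_fork",	/* example target */
	.pre_handler	= my_pre,
};

static int __init kp_init(void)
{
	return register_kprobe(&kp);
}

static void __exit kp_exit(void)
{
	unregister_kprobe(&kp);
}

module_init(kp_init);
module_exit(kp_exit);
MODULE_LICENSE("GPL");
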
366
367void __kprobes arch_arm_kprobe(struct kprobe *p)
368{
369 text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
370}
371
372void __kprobes arch_disarm_kprobe(struct kprobe *p)
373{
374 text_poke(p->addr, &p->opcode, 1);
375}
376
377void __kprobes arch_remove_kprobe(struct kprobe *p)
378{
379 mutex_lock(&kprobe_mutex);
380 free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
381 mutex_unlock(&kprobe_mutex);
382}
383
384static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
385{
386 kcb->prev_kprobe.kp = kprobe_running();
387 kcb->prev_kprobe.status = kcb->kprobe_status;
388 kcb->prev_kprobe.old_flags = kcb->kprobe_old_flags;
389 kcb->prev_kprobe.saved_flags = kcb->kprobe_saved_flags;
390}
391
392static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
393{
394 __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
395 kcb->kprobe_status = kcb->prev_kprobe.status;
396 kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags;
397 kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags;
398}
399
400static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
401 struct kprobe_ctlblk *kcb)
402{
403 __get_cpu_var(current_kprobe) = p;
404 kcb->kprobe_saved_flags = kcb->kprobe_old_flags
405 = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
406 if (is_IF_modifier(p->ainsn.insn))
407 kcb->kprobe_saved_flags &= ~X86_EFLAGS_IF;
408}
409
410static void __kprobes clear_btf(void)
411{
412 if (test_thread_flag(TIF_DEBUGCTLMSR))
413 wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
414}
415
416static void __kprobes restore_btf(void)
417{
418 if (test_thread_flag(TIF_DEBUGCTLMSR))
419 wrmsrl(MSR_IA32_DEBUGCTLMSR, current->thread.debugctlmsr);
420}
421
422static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
423{
424 clear_btf();
425 regs->flags |= X86_EFLAGS_TF;
426 regs->flags &= ~X86_EFLAGS_IF;
427 /* single step inline if the instruction is an int3 */
428 if (p->opcode == BREAKPOINT_INSTRUCTION)
429 regs->ip = (unsigned long)p->addr;
430 else
431 regs->ip = (unsigned long)p->ainsn.insn;
432}
433
434/* Called with kretprobe_lock held */
435void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
436 struct pt_regs *regs)
437{
438 unsigned long *sara = stack_addr(regs);
439
440 ri->ret_addr = (kprobe_opcode_t *) *sara;
441
442 /* Replace the return addr with trampoline addr */
443 *sara = (unsigned long) &kretprobe_trampoline;
444}
445
446static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
447 struct kprobe_ctlblk *kcb)
448{
449#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM)
450 if (p->ainsn.boostable == 1 && !p->post_handler) {
451 /* Boost up -- we can execute copied instructions directly */
452 reset_current_kprobe();
453 regs->ip = (unsigned long)p->ainsn.insn;
454 preempt_enable_no_resched();
455 return;
456 }
457#endif
458 prepare_singlestep(p, regs);
459 kcb->kprobe_status = KPROBE_HIT_SS;
460}
461
462/*
463 * We have reentered the kprobe_handler(), since another probe was hit while
464 * within the handler. We save the original kprobes variables and just single
465 * step on the instruction of the new probe without calling any user handlers.
466 */
467static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
468 struct kprobe_ctlblk *kcb)
469{
470 switch (kcb->kprobe_status) {
471 case KPROBE_HIT_SSDONE:
472#ifdef CONFIG_X86_64
473 /* TODO: Provide re-entrancy from post_kprobes_handler() and
474 * avoid exception stack corruption while single-stepping on
475 * the instruction of the new probe.
476 */
477 arch_disarm_kprobe(p);
478 regs->ip = (unsigned long)p->addr;
479 reset_current_kprobe();
480 preempt_enable_no_resched();
481 break;
482#endif
483 case KPROBE_HIT_ACTIVE:
484 save_previous_kprobe(kcb);
485 set_current_kprobe(p, regs, kcb);
486 kprobes_inc_nmissed_count(p);
487 prepare_singlestep(p, regs);
488 kcb->kprobe_status = KPROBE_REENTER;
489 break;
490 case KPROBE_HIT_SS:
491 if (p == kprobe_running()) {
492 regs->flags &= ~TF_MASK;
493 regs->flags |= kcb->kprobe_saved_flags;
494 return 0;
495 } else {
496 /* A probe has been hit in the codepath leading up
497 * to, or just after, single-stepping of a probed
498 * instruction. This entire codepath should strictly
499 * reside in .kprobes.text section. Raise a warning
500 * to highlight this peculiar case.
501 */
502 }
503 default:
504 /* impossible cases */
505 WARN_ON(1);
506 return 0;
507 }
508
509 return 1;
510}
511
512/*
513 * Interrupts are disabled on entry as trap3 is an interrupt gate and they
514 * remain disabled throughout this function.
515 */
516static int __kprobes kprobe_handler(struct pt_regs *regs)
517{
518 kprobe_opcode_t *addr;
519 struct kprobe *p;
520 struct kprobe_ctlblk *kcb;
521
522 addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
523 if (*addr != BREAKPOINT_INSTRUCTION) {
524 /*
525 * The breakpoint instruction was removed right
526 * after we hit it. Another cpu has removed
527 * either a probepoint or a debugger breakpoint
528 * at this address. In either case, no further
529 * handling of this interrupt is appropriate.
530 * Back up over the (now missing) int3 and run
531 * the original instruction.
532 */
533 regs->ip = (unsigned long)addr;
534 return 1;
535 }
536
537 /*
538 * We don't want to be preempted for the entire
539 * duration of kprobe processing. We conditionally
540 * re-enable preemption at the end of this function,
541 * and also in reenter_kprobe() and setup_singlestep().
542 */
543 preempt_disable();
544
545 kcb = get_kprobe_ctlblk();
546 p = get_kprobe(addr);
547
548 if (p) {
549 if (kprobe_running()) {
550 if (reenter_kprobe(p, regs, kcb))
551 return 1;
552 } else {
553 set_current_kprobe(p, regs, kcb);
554 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
555
556 /*
557 * If we have no pre-handler or it returned 0, we
558 * continue with normal processing. If we have a
559 * pre-handler and it returned non-zero, it prepped
560 * for calling the break_handler below on re-entry
561 * for jprobe processing, so get out doing nothing
562 * more here.
563 */
564 if (!p->pre_handler || !p->pre_handler(p, regs))
565 setup_singlestep(p, regs, kcb);
566 return 1;
567 }
568 } else if (kprobe_running()) {
569 p = __get_cpu_var(current_kprobe);
570 if (p->break_handler && p->break_handler(p, regs)) {
571 setup_singlestep(p, regs, kcb);
572 return 1;
573 }
574 } /* else: not a kprobe fault; let the kernel handle it */
575
576 preempt_enable_no_resched();
577 return 0;
578}
579
580/*
581 * When a retprobed function returns, this code saves registers and
582 * calls trampoline_handler(), which in turn calls the kretprobe's handler.
583 */
584void __kprobes kretprobe_trampoline_holder(void)
585{
586 asm volatile (
587 ".global kretprobe_trampoline\n"
588 "kretprobe_trampoline: \n"
589#ifdef CONFIG_X86_64
590 /* We don't bother saving the ss register */
591 " pushq %rsp\n"
592 " pushfq\n"
593 /*
594 * Skip cs, ip, orig_ax.
595 * trampoline_handler() will plug in these values
596 */
597 " subq $24, %rsp\n"
598 " pushq %rdi\n"
599 " pushq %rsi\n"
600 " pushq %rdx\n"
601 " pushq %rcx\n"
602 " pushq %rax\n"
603 " pushq %r8\n"
604 " pushq %r9\n"
605 " pushq %r10\n"
606 " pushq %r11\n"
607 " pushq %rbx\n"
608 " pushq %rbp\n"
609 " pushq %r12\n"
610 " pushq %r13\n"
611 " pushq %r14\n"
612 " pushq %r15\n"
613 " movq %rsp, %rdi\n"
614 " call trampoline_handler\n"
615 /* Replace saved sp with true return address. */
616 " movq %rax, 152(%rsp)\n"
617 " popq %r15\n"
618 " popq %r14\n"
619 " popq %r13\n"
620 " popq %r12\n"
621 " popq %rbp\n"
622 " popq %rbx\n"
623 " popq %r11\n"
624 " popq %r10\n"
625 " popq %r9\n"
626 " popq %r8\n"
627 " popq %rax\n"
628 " popq %rcx\n"
629 " popq %rdx\n"
630 " popq %rsi\n"
631 " popq %rdi\n"
632 /* Skip orig_ax, ip, cs */
633 " addq $24, %rsp\n"
634 " popfq\n"
635#else
636 " pushf\n"
637 /*
638 * Skip cs, ip, orig_ax.
639 * trampoline_handler() will plug in these values
640 */
641 " subl $12, %esp\n"
642 " pushl %fs\n"
643 " pushl %ds\n"
644 " pushl %es\n"
645 " pushl %eax\n"
646 " pushl %ebp\n"
647 " pushl %edi\n"
648 " pushl %esi\n"
649 " pushl %edx\n"
650 " pushl %ecx\n"
651 " pushl %ebx\n"
652 " movl %esp, %eax\n"
653 " call trampoline_handler\n"
654 /* Move flags to cs */
655 " movl 52(%esp), %edx\n"
656 " movl %edx, 48(%esp)\n"
657 /* Replace saved flags with true return address. */
658 " movl %eax, 52(%esp)\n"
659 " popl %ebx\n"
660 " popl %ecx\n"
661 " popl %edx\n"
662 " popl %esi\n"
663 " popl %edi\n"
664 " popl %ebp\n"
665 " popl %eax\n"
666 /* Skip ip, orig_ax, es, ds, fs */
667 " addl $20, %esp\n"
668 " popf\n"
669#endif
670 " ret\n");
671}
672
673/*
674 * Called from kretprobe_trampoline
675 */
676void * __kprobes trampoline_handler(struct pt_regs *regs)
677{
678 struct kretprobe_instance *ri = NULL;
679 struct hlist_head *head, empty_rp;
680 struct hlist_node *node, *tmp;
681 unsigned long flags, orig_ret_address = 0;
682 unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
683
684 INIT_HLIST_HEAD(&empty_rp);
685 spin_lock_irqsave(&kretprobe_lock, flags);
686 head = kretprobe_inst_table_head(current);
687 /* fixup registers */
688#ifdef CONFIG_X86_64
689 regs->cs = __KERNEL_CS;
690#else
691 regs->cs = __KERNEL_CS | get_kernel_rpl();
692#endif
693 regs->ip = trampoline_address;
694 regs->orig_ax = ~0UL;
695
696 /*
697 * It is possible to have multiple instances associated with a given
698 * task either because multiple functions in the call path have
699 * return probes installed on them, and/or more than one
700 * return probe was registered for a target function.
701 *
702 * We can handle this because:
703 * - instances are always pushed into the head of the list
704 * - when multiple return probes are registered for the same
705 * function, the (chronologically) first instance's ret_addr
706 * will be the real return address, and all the rest will
707 * point to kretprobe_trampoline.
708 */
709 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
710 if (ri->task != current)
711 /* another task is sharing our hash bucket */
712 continue;
713
714 if (ri->rp && ri->rp->handler) {
715 __get_cpu_var(current_kprobe) = &ri->rp->kp;
716 get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
717 ri->rp->handler(ri, regs);
718 __get_cpu_var(current_kprobe) = NULL;
719 }
720
721 orig_ret_address = (unsigned long)ri->ret_addr;
722 recycle_rp_inst(ri, &empty_rp);
723
724 if (orig_ret_address != trampoline_address)
725 /*
726 * This is the real return address. Any other
727 * instances associated with this task are for
728 * other calls deeper on the call stack
729 */
730 break;
731 }
732
733 kretprobe_assert(ri, orig_ret_address, trampoline_address);
734
735 spin_unlock_irqrestore(&kretprobe_lock, flags);
736
737 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
738 hlist_del(&ri->hlist);
739 kfree(ri);
740 }
741 return (void *)orig_ret_address;
742}
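
trampoline_handler() above is what eventually runs handlers registered through the kretprobe API. A minimal sketch of such a consumer (the probed symbol "sys_open" is an arbitrary example, not taken from this patch):

#include <linux/module.h>
#include <linux/kprobes.h>

static int my_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	/* ri->ret_addr holds the real return address saved by arch_prepare_kretprobe() */
	printk(KERN_INFO "probed function returned to %p\n", ri->ret_addr);
	return 0;
}

static struct kretprobe my_kretprobe = {
	.handler	= my_ret_handler,
	.maxactive	= 20,			/* allow 20 concurrent instances */
	.kp.symbol_name	= "sys_open",		/* example target */
};

static int __init krp_init(void)
{
	return register_kretprobe(&my_kretprobe);
}

static void __exit krp_exit(void)
{
	unregister_kretprobe(&my_kretprobe);
}

module_init(krp_init);
module_exit(krp_exit);
MODULE_LICENSE("GPL");
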
743
744/*
745 * Called after single-stepping. p->addr is the address of the
746 * instruction whose first byte has been replaced by the "int 3"
747 * instruction. To avoid the SMP problems that can occur when we
748 * temporarily put back the original opcode to single-step, we
749 * single-stepped a copy of the instruction. The address of this
750 * copy is p->ainsn.insn.
751 *
752 * This function prepares to return from the post-single-step
753 * interrupt. We have to fix up the stack as follows:
754 *
755 * 0) Except in the case of absolute or indirect jump or call instructions,
756 * the new ip is relative to the copied instruction. We need to make
757 * it relative to the original instruction.
758 *
759 * 1) If the single-stepped instruction was pushfl, then the TF and IF
760 * flags are set in the just-pushed flags, and may need to be cleared.
761 *
762 * 2) If the single-stepped instruction was a call, the return address
763 * that is atop the stack is the address following the copied instruction.
764 * We need to make it the address following the original instruction.
765 *
766 * If this is the first time we've single-stepped the instruction at
767 * this probepoint, and the instruction is boostable, boost it: add a
768 * jump instruction after the copied instruction, that jumps to the next
769 * instruction after the probepoint.
770 */
771static void __kprobes resume_execution(struct kprobe *p,
772 struct pt_regs *regs, struct kprobe_ctlblk *kcb)
773{
774 unsigned long *tos = stack_addr(regs);
775 unsigned long copy_ip = (unsigned long)p->ainsn.insn;
776 unsigned long orig_ip = (unsigned long)p->addr;
777 kprobe_opcode_t *insn = p->ainsn.insn;
778
779 /*skip the REX prefix*/
780 if (is_REX_prefix(insn))
781 insn++;
782
783 regs->flags &= ~X86_EFLAGS_TF;
784 switch (*insn) {
785 case 0x9c: /* pushfl */
786 *tos &= ~(X86_EFLAGS_TF | X86_EFLAGS_IF);
787 *tos |= kcb->kprobe_old_flags;
788 break;
789 case 0xc2: /* iret/ret/lret */
790 case 0xc3:
791 case 0xca:
792 case 0xcb:
793 case 0xcf:
794 case 0xea: /* jmp absolute -- ip is correct */
795 /* ip is already adjusted, no more changes required */
796 p->ainsn.boostable = 1;
797 goto no_change;
798 case 0xe8: /* call relative - Fix return addr */
799 *tos = orig_ip + (*tos - copy_ip);
800 break;
801#ifdef CONFIG_X86_32
802 case 0x9a: /* call absolute -- same as call absolute, indirect */
803 *tos = orig_ip + (*tos - copy_ip);
804 goto no_change;
805#endif
806 case 0xff:
807 if ((insn[1] & 0x30) == 0x10) {
808 /*
809 * call absolute, indirect
810 * Fix return addr; ip is correct.
811 * But this is not boostable
812 */
813 *tos = orig_ip + (*tos - copy_ip);
814 goto no_change;
815 } else if (((insn[1] & 0x31) == 0x20) ||
816 ((insn[1] & 0x31) == 0x21)) {
817 /*
818 * jmp near and far, absolute indirect
819 * ip is correct. And this is boostable
820 */
821 p->ainsn.boostable = 1;
822 goto no_change;
823 }
824 default:
825 break;
826 }
827
828 if (p->ainsn.boostable == 0) {
829 if ((regs->ip > copy_ip) &&
830 (regs->ip - copy_ip) + 5 < MAX_INSN_SIZE) {
831 /*
832 * This instruction can be executed directly if it
833 * jumps back to the correct address.
834 */
835 set_jmp_op((void *)regs->ip,
836 (void *)orig_ip + (regs->ip - copy_ip));
837 p->ainsn.boostable = 1;
838 } else {
839 p->ainsn.boostable = -1;
840 }
841 }
842
843 regs->ip += orig_ip - copy_ip;
844
845no_change:
846 restore_btf();
847}
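
In the common (non-boosted, non-special) case the fix-up at the end of resume_execution() is the plain relocation regs->ip += orig_ip - copy_ip. A tiny arithmetic sketch with made-up addresses, for illustration only:

#include <stdio.h>

int main(void)
{
	unsigned long orig_ip = 0xc0100000UL;	/* hypothetical probed address */
	unsigned long copy_ip = 0xc0200000UL;	/* hypothetical instruction slot */
	unsigned long ip      = copy_ip + 3;	/* ip after stepping a 3-byte insn */

	ip += orig_ip - copy_ip;		/* relocate back into the original stream */
	printf("resume at %#lx\n", ip);		/* prints 0xc0100003 */
	return 0;
}
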
848
849/*
850 * Interrupts are disabled on entry as trap1 is an interrupt gate and they
851 * remain disabled throughout this function.
852 */
853static int __kprobes post_kprobe_handler(struct pt_regs *regs)
854{
855 struct kprobe *cur = kprobe_running();
856 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
857
858 if (!cur)
859 return 0;
860
861 if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
862 kcb->kprobe_status = KPROBE_HIT_SSDONE;
863 cur->post_handler(cur, regs, 0);
864 }
865
866 resume_execution(cur, regs, kcb);
867 regs->flags |= kcb->kprobe_saved_flags;
868 trace_hardirqs_fixup_flags(regs->flags);
869
870 /* Restore back the original saved kprobes variables and continue. */
871 if (kcb->kprobe_status == KPROBE_REENTER) {
872 restore_previous_kprobe(kcb);
873 goto out;
874 }
875 reset_current_kprobe();
876out:
877 preempt_enable_no_resched();
878
879 /*
880 * if somebody else is singlestepping across a probe point, flags
881 * will have TF set, in which case, continue the remaining processing
882 * of do_debug, as if this is not a probe hit.
883 */
884 if (regs->flags & X86_EFLAGS_TF)
885 return 0;
886
887 return 1;
888}
889
890int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
891{
892 struct kprobe *cur = kprobe_running();
893 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
894
895 switch (kcb->kprobe_status) {
896 case KPROBE_HIT_SS:
897 case KPROBE_REENTER:
898 /*
899 * We are here because the instruction being single
900 * stepped caused a page fault. We reset the current
901 * kprobe and the ip points back to the probe address
902 * and allow the page fault handler to continue as a
903 * normal page fault.
904 */
905 regs->ip = (unsigned long)cur->addr;
906 regs->flags |= kcb->kprobe_old_flags;
907 if (kcb->kprobe_status == KPROBE_REENTER)
908 restore_previous_kprobe(kcb);
909 else
910 reset_current_kprobe();
911 preempt_enable_no_resched();
912 break;
913 case KPROBE_HIT_ACTIVE:
914 case KPROBE_HIT_SSDONE:
915 /*
916 * We increment the nmissed count for accounting;
917 * we could also use the npre/npostfault counts to account
918 * for these specific fault cases.
919 */
920 kprobes_inc_nmissed_count(cur);
921
922 /*
923 * We come here because instructions in the pre/post
924 * handler caused the page_fault, this could happen
925 * if handler tries to access user space by
926 * copy_from_user(), get_user() etc. Let the
927 * user-specified handler try to fix it first.
928 */
929 if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
930 return 1;
931
932 /*
933 * In case the user-specified fault handler returned
934 * zero, try to fix up.
935 */
936 if (fixup_exception(regs))
937 return 1;
938
939 /*
940 * The fixup routine could not handle it;
941 * let do_page_fault() fix it.
942 */
943 break;
944 default:
945 break;
946 }
947 return 0;
948}
949
950/*
951 * Wrapper routine for handling exceptions.
952 */
953int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
954 unsigned long val, void *data)
955{
956 struct die_args *args = data;
957 int ret = NOTIFY_DONE;
958
959 if (args->regs && user_mode_vm(args->regs))
960 return ret;
961
962 switch (val) {
963 case DIE_INT3:
964 if (kprobe_handler(args->regs))
965 ret = NOTIFY_STOP;
966 break;
967 case DIE_DEBUG:
968 if (post_kprobe_handler(args->regs))
969 ret = NOTIFY_STOP;
970 break;
971 case DIE_GPF:
972 /*
973 * To be potentially processing a kprobe fault and to
974 * trust the result from kprobe_running(), we have to
975 * be non-preemptible.
976 */
977 if (!preemptible() && kprobe_running() &&
978 kprobe_fault_handler(args->regs, args->trapnr))
979 ret = NOTIFY_STOP;
980 break;
981 default:
982 break;
983 }
984 return ret;
985}
986
987int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
988{
989 struct jprobe *jp = container_of(p, struct jprobe, kp);
990 unsigned long addr;
991 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
992
993 kcb->jprobe_saved_regs = *regs;
994 kcb->jprobe_saved_sp = stack_addr(regs);
995 addr = (unsigned long)(kcb->jprobe_saved_sp);
996
997 /*
998 * As Linus pointed out, gcc assumes that the callee
999 * owns the argument space and could overwrite it, e.g.
1000 * tailcall optimization. So, to be absolutely safe
1001 * we also save and restore enough stack bytes to cover
1002 * the argument area.
1003 */
1004 memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr,
1005 MIN_STACK_SIZE(addr));
1006 regs->flags &= ~X86_EFLAGS_IF;
1007 trace_hardirqs_off();
1008 regs->ip = (unsigned long)(jp->entry);
1009 return 1;
1010}
1011
1012void __kprobes jprobe_return(void)
1013{
1014 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1015
1016 asm volatile (
1017#ifdef CONFIG_X86_64
1018 " xchg %%rbx,%%rsp \n"
1019#else
1020 " xchgl %%ebx,%%esp \n"
1021#endif
1022 " int3 \n"
1023 " .globl jprobe_return_end\n"
1024 " jprobe_return_end: \n"
1025 " nop \n"::"b"
1026 (kcb->jprobe_saved_sp):"memory");
1027}
1028
1029int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
1030{
1031 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1032 u8 *addr = (u8 *) (regs->ip - 1);
1033 struct jprobe *jp = container_of(p, struct jprobe, kp);
1034
1035 if ((addr > (u8 *) jprobe_return) &&
1036 (addr < (u8 *) jprobe_return_end)) {
1037 if (stack_addr(regs) != kcb->jprobe_saved_sp) {
1038 struct pt_regs *saved_regs = &kcb->jprobe_saved_regs;
1039 printk(KERN_ERR
1040 "current sp %p does not match saved sp %p\n",
1041 stack_addr(regs), kcb->jprobe_saved_sp);
1042 printk(KERN_ERR "Saved registers for jprobe %p\n", jp);
1043 show_registers(saved_regs);
1044 printk(KERN_ERR "Current registers\n");
1045 show_registers(regs);
1046 BUG();
1047 }
1048 *regs = kcb->jprobe_saved_regs;
1049 memcpy((kprobe_opcode_t *)(kcb->jprobe_saved_sp),
1050 kcb->jprobes_stack,
1051 MIN_STACK_SIZE(kcb->jprobe_saved_sp));
1052 preempt_enable_no_resched();
1053 return 1;
1054 }
1055 return 0;
1056}
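
setjmp_pre_handler(), jprobe_return() and longjmp_break_handler() together implement the jprobe entry/exit sequence. A minimal sketch of a jprobe client (the target function and its prototype are illustrative assumptions, not part of this patch):

#include <linux/module.h>
#include <linux/kprobes.h>

/* must mirror the probed function's prototype (example: do_fork) */
static long jdo_fork(unsigned long clone_flags, unsigned long stack_start,
		     struct pt_regs *regs, unsigned long stack_size,
		     int __user *parent_tidptr, int __user *child_tidptr)
{
	printk(KERN_INFO "do_fork(clone_flags=%lx)\n", clone_flags);
	jprobe_return();	/* mandatory: traps back into longjmp_break_handler() */
	return 0;		/* never reached */
}

static struct jprobe my_jprobe = {
	.entry		= JPROBE_ENTRY(jdo_fork),
	.kp.symbol_name	= "do_fork",	/* example target */
};

static int __init jp_init(void)
{
	return register_jprobe(&my_jprobe);
}

static void __exit jp_exit(void)
{
	unregister_jprobe(&my_jprobe);
}

module_init(jp_init);
module_exit(jp_exit);
MODULE_LICENSE("GPL");
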
1057
1058int __init arch_init_kprobes(void)
1059{
1060 return 0;
1061}
1062
1063int __kprobes arch_trampoline_kprobe(struct kprobe *p)
1064{
1065 return 0;
1066}
diff --git a/arch/x86/kernel/kprobes_32.c b/arch/x86/kernel/kprobes_32.c
deleted file mode 100644
index 3a020f79f82b..000000000000
--- a/arch/x86/kernel/kprobes_32.c
+++ /dev/null
@@ -1,756 +0,0 @@
1/*
2 * Kernel Probes (KProbes)
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2002, 2004
19 *
20 * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
21 * Probes initial implementation ( includes contributions from
22 * Rusty Russell).
23 * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
24 * interface to access function arguments.
25 * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston
26 * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
27 * <prasanna@in.ibm.com> added function-return probes.
28 */
29
30#include <linux/kprobes.h>
31#include <linux/ptrace.h>
32#include <linux/preempt.h>
33#include <linux/kdebug.h>
34#include <asm/cacheflush.h>
35#include <asm/desc.h>
36#include <asm/uaccess.h>
37#include <asm/alternative.h>
38
39void jprobe_return_end(void);
40
41DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
42DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
43
44struct kretprobe_blackpoint kretprobe_blacklist[] = {
45 {"__switch_to", }, /* This function switches only current task, but
46 doesn't switch kernel stack.*/
47 {NULL, NULL} /* Terminator */
48};
49const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
50
51/* insert a jmp code */
52static __always_inline void set_jmp_op(void *from, void *to)
53{
54 struct __arch_jmp_op {
55 char op;
56 long raddr;
57 } __attribute__((packed)) *jop;
58 jop = (struct __arch_jmp_op *)from;
59 jop->raddr = (long)(to) - ((long)(from) + 5);
60 jop->op = RELATIVEJUMP_INSTRUCTION;
61}
62
63/*
64 * returns non-zero if opcodes can be boosted.
65 */
66static __always_inline int can_boost(kprobe_opcode_t *opcodes)
67{
68#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \
69 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
70 (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
71 (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
72 (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
73 << (row % 32))
74 /*
75 * Undefined/reserved opcodes, conditional jump, Opcode Extension
76 * Groups, and some special opcodes cannot be boosted.
77 */
78 static const unsigned long twobyte_is_boostable[256 / 32] = {
79 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
80 /* ------------------------------- */
81 W(0x00, 0,0,1,1,0,0,1,0,1,1,0,0,0,0,0,0)| /* 00 */
82 W(0x10, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 10 */
83 W(0x20, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0)| /* 20 */
84 W(0x30, 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 30 */
85 W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 40 */
86 W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 50 */
87 W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1)| /* 60 */
88 W(0x70, 0,0,0,0,1,1,1,1,0,0,0,0,0,0,1,1), /* 70 */
89 W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 80 */
90 W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), /* 90 */
91 W(0xa0, 1,1,0,1,1,1,0,0,1,1,0,1,1,1,0,1)| /* a0 */
92 W(0xb0, 1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1), /* b0 */
93 W(0xc0, 1,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1)| /* c0 */
94 W(0xd0, 0,1,1,1,0,1,0,0,1,1,0,1,1,1,0,1), /* d0 */
95 W(0xe0, 0,1,1,0,0,1,0,0,1,1,0,1,1,1,0,1)| /* e0 */
96 W(0xf0, 0,1,1,1,0,1,0,0,1,1,1,0,1,1,1,0) /* f0 */
97 /* ------------------------------- */
98 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
99 };
100#undef W
101 kprobe_opcode_t opcode;
102 kprobe_opcode_t *orig_opcodes = opcodes;
103retry:
104 if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
105 return 0;
106 opcode = *(opcodes++);
107
108 /* 2nd-byte opcode */
109 if (opcode == 0x0f) {
110 if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
111 return 0;
112 return test_bit(*opcodes, twobyte_is_boostable);
113 }
114
115 switch (opcode & 0xf0) {
116 case 0x60:
117 if (0x63 < opcode && opcode < 0x67)
118 goto retry; /* prefixes */
119 /* can't boost Address-size override and bound */
120 return (opcode != 0x62 && opcode != 0x67);
121 case 0x70:
122 return 0; /* can't boost conditional jump */
123 case 0xc0:
124 /* can't boost software-interruptions */
125 return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf;
126 case 0xd0:
127 /* can boost AA* and XLAT */
128 return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7);
129 case 0xe0:
130 /* can boost in/out and absolute jmps */
131 return ((opcode & 0x04) || opcode == 0xea);
132 case 0xf0:
133 if ((opcode & 0x0c) == 0 && opcode != 0xf1)
134 goto retry; /* lock/rep(ne) prefix */
135 /* clear and set flags can be boosted */
136 return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe));
137 default:
138 if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e)
139 goto retry; /* prefixes */
140 /* can't boost CS override and call */
141 return (opcode != 0x2e && opcode != 0x9a);
142 }
143}
144
145/*
146 * returns non-zero if opcode modifies the interrupt flag.
147 */
148static int __kprobes is_IF_modifier(kprobe_opcode_t opcode)
149{
150 switch (opcode) {
151 case 0xfa: /* cli */
152 case 0xfb: /* sti */
153 case 0xcf: /* iret/iretd */
154 case 0x9d: /* popf/popfd */
155 return 1;
156 }
157 return 0;
158}
159
160int __kprobes arch_prepare_kprobe(struct kprobe *p)
161{
162 /* insn: must be on special executable page on i386. */
163 p->ainsn.insn = get_insn_slot();
164 if (!p->ainsn.insn)
165 return -ENOMEM;
166
167 memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
168 p->opcode = *p->addr;
169 if (can_boost(p->addr)) {
170 p->ainsn.boostable = 0;
171 } else {
172 p->ainsn.boostable = -1;
173 }
174 return 0;
175}
176
177void __kprobes arch_arm_kprobe(struct kprobe *p)
178{
179 text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
180}
181
182void __kprobes arch_disarm_kprobe(struct kprobe *p)
183{
184 text_poke(p->addr, &p->opcode, 1);
185}
186
187void __kprobes arch_remove_kprobe(struct kprobe *p)
188{
189 mutex_lock(&kprobe_mutex);
190 free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
191 mutex_unlock(&kprobe_mutex);
192}
193
194static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
195{
196 kcb->prev_kprobe.kp = kprobe_running();
197 kcb->prev_kprobe.status = kcb->kprobe_status;
198 kcb->prev_kprobe.old_eflags = kcb->kprobe_old_eflags;
199 kcb->prev_kprobe.saved_eflags = kcb->kprobe_saved_eflags;
200}
201
202static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
203{
204 __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
205 kcb->kprobe_status = kcb->prev_kprobe.status;
206 kcb->kprobe_old_eflags = kcb->prev_kprobe.old_eflags;
207 kcb->kprobe_saved_eflags = kcb->prev_kprobe.saved_eflags;
208}
209
210static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
211 struct kprobe_ctlblk *kcb)
212{
213 __get_cpu_var(current_kprobe) = p;
214 kcb->kprobe_saved_eflags = kcb->kprobe_old_eflags
215 = (regs->eflags & (TF_MASK | IF_MASK));
216 if (is_IF_modifier(p->opcode))
217 kcb->kprobe_saved_eflags &= ~IF_MASK;
218}
219
220static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
221{
222 regs->eflags |= TF_MASK;
223 regs->eflags &= ~IF_MASK;
224 /*single step inline if the instruction is an int3*/
225 if (p->opcode == BREAKPOINT_INSTRUCTION)
226 regs->eip = (unsigned long)p->addr;
227 else
228 regs->eip = (unsigned long)p->ainsn.insn;
229}
230
231/* Called with kretprobe_lock held */
232void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
233 struct pt_regs *regs)
234{
235 unsigned long *sara = (unsigned long *)&regs->esp;
236
237 ri->ret_addr = (kprobe_opcode_t *) *sara;
238
239 /* Replace the return addr with trampoline addr */
240 *sara = (unsigned long) &kretprobe_trampoline;
241}
242
243/*
244 * Interrupts are disabled on entry as trap3 is an interrupt gate and they
245 * remain disabled throughout this function.
246 */
247static int __kprobes kprobe_handler(struct pt_regs *regs)
248{
249 struct kprobe *p;
250 int ret = 0;
251 kprobe_opcode_t *addr;
252 struct kprobe_ctlblk *kcb;
253
254 addr = (kprobe_opcode_t *)(regs->eip - sizeof(kprobe_opcode_t));
255
256 /*
257 * We don't want to be preempted for the entire
258 * duration of kprobe processing
259 */
260 preempt_disable();
261 kcb = get_kprobe_ctlblk();
262
263 /* Check we're not actually recursing */
264 if (kprobe_running()) {
265 p = get_kprobe(addr);
266 if (p) {
267 if (kcb->kprobe_status == KPROBE_HIT_SS &&
268 *p->ainsn.insn == BREAKPOINT_INSTRUCTION) {
269 regs->eflags &= ~TF_MASK;
270 regs->eflags |= kcb->kprobe_saved_eflags;
271 goto no_kprobe;
272 }
273 /* We have reentered the kprobe_handler(), since
274 * another probe was hit while within the handler.
275 * Here we save the original kprobes variables and
276 * just single-step on the instruction of the new probe
277 * without calling any user handlers.
278 */
279 save_previous_kprobe(kcb);
280 set_current_kprobe(p, regs, kcb);
281 kprobes_inc_nmissed_count(p);
282 prepare_singlestep(p, regs);
283 kcb->kprobe_status = KPROBE_REENTER;
284 return 1;
285 } else {
286 if (*addr != BREAKPOINT_INSTRUCTION) {
287 /* The breakpoint instruction was removed by
288 * another cpu right after we hit, no further
289 * handling of this interrupt is appropriate
290 */
291 regs->eip -= sizeof(kprobe_opcode_t);
292 ret = 1;
293 goto no_kprobe;
294 }
295 p = __get_cpu_var(current_kprobe);
296 if (p->break_handler && p->break_handler(p, regs)) {
297 goto ss_probe;
298 }
299 }
300 goto no_kprobe;
301 }
302
303 p = get_kprobe(addr);
304 if (!p) {
305 if (*addr != BREAKPOINT_INSTRUCTION) {
306 /*
307 * The breakpoint instruction was removed right
308 * after we hit it. Another cpu has removed
309 * either a probepoint or a debugger breakpoint
310 * at this address. In either case, no further
311 * handling of this interrupt is appropriate.
312 * Back up over the (now missing) int3 and run
313 * the original instruction.
314 */
315 regs->eip -= sizeof(kprobe_opcode_t);
316 ret = 1;
317 }
318 /* Not one of ours: let kernel handle it */
319 goto no_kprobe;
320 }
321
322 set_current_kprobe(p, regs, kcb);
323 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
324
325 if (p->pre_handler && p->pre_handler(p, regs))
326 /* handler has already set things up, so skip ss setup */
327 return 1;
328
329ss_probe:
330#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM)
331 if (p->ainsn.boostable == 1 && !p->post_handler){
332 /* Boost up -- we can execute copied instructions directly */
333 reset_current_kprobe();
334 regs->eip = (unsigned long)p->ainsn.insn;
335 preempt_enable_no_resched();
336 return 1;
337 }
338#endif
339 prepare_singlestep(p, regs);
340 kcb->kprobe_status = KPROBE_HIT_SS;
341 return 1;
342
343no_kprobe:
344 preempt_enable_no_resched();
345 return ret;
346}
347
348/*
349 * For function-return probes, init_kprobes() establishes a probepoint
350 * here. When a retprobed function returns, this probe is hit and
351 * trampoline_probe_handler() runs, calling the kretprobe's handler.
352 */
353 void __kprobes kretprobe_trampoline_holder(void)
354 {
355 asm volatile ( ".global kretprobe_trampoline\n"
356 "kretprobe_trampoline: \n"
357 " pushf\n"
358 /* skip cs, eip, orig_eax */
359 " subl $12, %esp\n"
360 " pushl %fs\n"
361 " pushl %ds\n"
362 " pushl %es\n"
363 " pushl %eax\n"
364 " pushl %ebp\n"
365 " pushl %edi\n"
366 " pushl %esi\n"
367 " pushl %edx\n"
368 " pushl %ecx\n"
369 " pushl %ebx\n"
370 " movl %esp, %eax\n"
371 " call trampoline_handler\n"
372 /* move eflags to cs */
373 " movl 52(%esp), %edx\n"
374 " movl %edx, 48(%esp)\n"
375 /* save true return address on eflags */
376 " movl %eax, 52(%esp)\n"
377 " popl %ebx\n"
378 " popl %ecx\n"
379 " popl %edx\n"
380 " popl %esi\n"
381 " popl %edi\n"
382 " popl %ebp\n"
383 " popl %eax\n"
384 /* skip eip, orig_eax, es, ds, fs */
385 " addl $20, %esp\n"
386 " popf\n"
387 " ret\n");
388}
389
390/*
391 * Called from kretprobe_trampoline
392 */
393fastcall void *__kprobes trampoline_handler(struct pt_regs *regs)
394{
395 struct kretprobe_instance *ri = NULL;
396 struct hlist_head *head, empty_rp;
397 struct hlist_node *node, *tmp;
398 unsigned long flags, orig_ret_address = 0;
399 unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline;
400
401 INIT_HLIST_HEAD(&empty_rp);
402 spin_lock_irqsave(&kretprobe_lock, flags);
403 head = kretprobe_inst_table_head(current);
404 /* fixup registers */
405 regs->xcs = __KERNEL_CS | get_kernel_rpl();
406 regs->eip = trampoline_address;
407 regs->orig_eax = 0xffffffff;
408
409 /*
410 * It is possible to have multiple instances associated with a given
411 * task either because multiple functions in the call path
412 * have a return probe installed on them, and/or more than one
413 * return probe was registered for a target function.
414 *
415 * We can handle this because:
416 * - instances are always inserted at the head of the list
417 * - when multiple return probes are registered for the same
418 * function, the first instance's ret_addr will point to the
419 * real return address, and all the rest will point to
420 * kretprobe_trampoline
421 */
422 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
423 if (ri->task != current)
424 /* another task is sharing our hash bucket */
425 continue;
426
427 if (ri->rp && ri->rp->handler){
428 __get_cpu_var(current_kprobe) = &ri->rp->kp;
429 get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
430 ri->rp->handler(ri, regs);
431 __get_cpu_var(current_kprobe) = NULL;
432 }
433
434 orig_ret_address = (unsigned long)ri->ret_addr;
435 recycle_rp_inst(ri, &empty_rp);
436
437 if (orig_ret_address != trampoline_address)
438 /*
439 * This is the real return address. Any other
440 * instances associated with this task are for
441 * other calls deeper on the call stack
442 */
443 break;
444 }
445
446 kretprobe_assert(ri, orig_ret_address, trampoline_address);
447 spin_unlock_irqrestore(&kretprobe_lock, flags);
448
449 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
450 hlist_del(&ri->hlist);
451 kfree(ri);
452 }
453 return (void*)orig_ret_address;
454}
455
456/*
457 * Called after single-stepping. p->addr is the address of the
458 * instruction whose first byte has been replaced by the "int 3"
459 * instruction. To avoid the SMP problems that can occur when we
460 * temporarily put back the original opcode to single-step, we
461 * single-stepped a copy of the instruction. The address of this
462 * copy is p->ainsn.insn.
463 *
464 * This function prepares to return from the post-single-step
465 * interrupt. We have to fix up the stack as follows:
466 *
467 * 0) Except in the case of absolute or indirect jump or call instructions,
468 * the new eip is relative to the copied instruction. We need to make
469 * it relative to the original instruction.
470 *
471 * 1) If the single-stepped instruction was pushfl, then the TF and IF
472 * flags are set in the just-pushed eflags, and may need to be cleared.
473 *
474 * 2) If the single-stepped instruction was a call, the return address
475 * that is atop the stack is the address following the copied instruction.
476 * We need to make it the address following the original instruction.
477 *
478 * This function also checks instruction size for preparing direct execution.
479 */
480static void __kprobes resume_execution(struct kprobe *p,
481 struct pt_regs *regs, struct kprobe_ctlblk *kcb)
482{
483 unsigned long *tos = (unsigned long *)&regs->esp;
484 unsigned long copy_eip = (unsigned long)p->ainsn.insn;
485 unsigned long orig_eip = (unsigned long)p->addr;
486
487 regs->eflags &= ~TF_MASK;
488 switch (p->ainsn.insn[0]) {
489 case 0x9c: /* pushfl */
490 *tos &= ~(TF_MASK | IF_MASK);
491 *tos |= kcb->kprobe_old_eflags;
492 break;
493 case 0xc2: /* iret/ret/lret */
494 case 0xc3:
495 case 0xca:
496 case 0xcb:
497 case 0xcf:
498 case 0xea: /* jmp absolute -- eip is correct */
499 /* eip is already adjusted, no more changes required */
500 p->ainsn.boostable = 1;
501 goto no_change;
502 case 0xe8: /* call relative - Fix return addr */
503 *tos = orig_eip + (*tos - copy_eip);
504 break;
505 case 0x9a: /* call absolute -- same as call absolute, indirect */
506 *tos = orig_eip + (*tos - copy_eip);
507 goto no_change;
508 case 0xff:
509 if ((p->ainsn.insn[1] & 0x30) == 0x10) {
510 /*
511 * call absolute, indirect
512 * Fix return addr; eip is correct.
513 * But this is not boostable
514 */
515 *tos = orig_eip + (*tos - copy_eip);
516 goto no_change;
517 } else if (((p->ainsn.insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */
518 ((p->ainsn.insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */
519 /* eip is correct. And this is boostable */
520 p->ainsn.boostable = 1;
521 goto no_change;
522 }
523 default:
524 break;
525 }
526
527 if (p->ainsn.boostable == 0) {
528 if ((regs->eip > copy_eip) &&
529 (regs->eip - copy_eip) + 5 < MAX_INSN_SIZE) {
530 /*
531 * These instructions can be executed directly if it
532 * jumps back to correct address.
533 */
534 set_jmp_op((void *)regs->eip,
535 (void *)orig_eip + (regs->eip - copy_eip));
536 p->ainsn.boostable = 1;
537 } else {
538 p->ainsn.boostable = -1;
539 }
540 }
541
542 regs->eip = orig_eip + (regs->eip - copy_eip);
543
544no_change:
545 return;
546}
547
548/*
549 * Interrupts are disabled on entry as trap1 is an interrupt gate and they
550 * remain disabled throughout this function.
551 */
552static int __kprobes post_kprobe_handler(struct pt_regs *regs)
553{
554 struct kprobe *cur = kprobe_running();
555 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
556
557 if (!cur)
558 return 0;
559
560 if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
561 kcb->kprobe_status = KPROBE_HIT_SSDONE;
562 cur->post_handler(cur, regs, 0);
563 }
564
565 resume_execution(cur, regs, kcb);
566 regs->eflags |= kcb->kprobe_saved_eflags;
567 trace_hardirqs_fixup_flags(regs->eflags);
568
569 /*Restore back the original saved kprobes variables and continue. */
570 if (kcb->kprobe_status == KPROBE_REENTER) {
571 restore_previous_kprobe(kcb);
572 goto out;
573 }
574 reset_current_kprobe();
575out:
576 preempt_enable_no_resched();
577
578 /*
579 * if somebody else is singlestepping across a probe point, eflags
580 * will have TF set, in which case, continue the remaining processing
581 * of do_debug, as if this is not a probe hit.
582 */
583 if (regs->eflags & TF_MASK)
584 return 0;
585
586 return 1;
587}
588
589int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
590{
591 struct kprobe *cur = kprobe_running();
592 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
593
594 switch(kcb->kprobe_status) {
595 case KPROBE_HIT_SS:
596 case KPROBE_REENTER:
597 /*
598 * We are here because the instruction being single
599 * stepped caused a page fault. We reset the current
600 * kprobe and the eip points back to the probe address
601 * and allow the page fault handler to continue as a
602 * normal page fault.
603 */
604 regs->eip = (unsigned long)cur->addr;
605 regs->eflags |= kcb->kprobe_old_eflags;
606 if (kcb->kprobe_status == KPROBE_REENTER)
607 restore_previous_kprobe(kcb);
608 else
609 reset_current_kprobe();
610 preempt_enable_no_resched();
611 break;
612 case KPROBE_HIT_ACTIVE:
613 case KPROBE_HIT_SSDONE:
614 /*
615 * We increment the nmissed count for accounting,
616 * we can also use npre/npostfault count for accounting
617 * these specific fault cases.
618 */
619 kprobes_inc_nmissed_count(cur);
620
621 /*
622 * We come here because instructions in the pre/post
623 * handler caused the page_fault, this could happen
624 * if handler tries to access user space by
625 * copy_from_user(), get_user() etc. Let the
626 * user-specified handler try to fix it first.
627 */
628 if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
629 return 1;
630
631 /*
632 * In case the user-specified fault handler returned
633 * zero, try to fix up.
634 */
635 if (fixup_exception(regs))
636 return 1;
637
638 /*
639 * fixup_exception() could not handle it,
640 * Let do_page_fault() fix it.
641 */
642 break;
643 default:
644 break;
645 }
646 return 0;
647}
648
649/*
650 * Wrapper routine for handling exceptions.
651 */
652int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
653 unsigned long val, void *data)
654{
655 struct die_args *args = (struct die_args *)data;
656 int ret = NOTIFY_DONE;
657
658 if (args->regs && user_mode_vm(args->regs))
659 return ret;
660
661 switch (val) {
662 case DIE_INT3:
663 if (kprobe_handler(args->regs))
664 ret = NOTIFY_STOP;
665 break;
666 case DIE_DEBUG:
667 if (post_kprobe_handler(args->regs))
668 ret = NOTIFY_STOP;
669 break;
670 case DIE_GPF:
671 /* kprobe_running() needs smp_processor_id() */
672 preempt_disable();
673 if (kprobe_running() &&
674 kprobe_fault_handler(args->regs, args->trapnr))
675 ret = NOTIFY_STOP;
676 preempt_enable();
677 break;
678 default:
679 break;
680 }
681 return ret;
682}
683
684int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
685{
686 struct jprobe *jp = container_of(p, struct jprobe, kp);
687 unsigned long addr;
688 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
689
690 kcb->jprobe_saved_regs = *regs;
691 kcb->jprobe_saved_esp = &regs->esp;
692 addr = (unsigned long)(kcb->jprobe_saved_esp);
693
694 /*
695 * TBD: As Linus pointed out, gcc assumes that the callee
696 * owns the argument space and could overwrite it, e.g.
697 * tailcall optimization. So, to be absolutely safe
698 * we also save and restore enough stack bytes to cover
699 * the argument area.
700 */
701 memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr,
702 MIN_STACK_SIZE(addr));
703 regs->eflags &= ~IF_MASK;
704 trace_hardirqs_off();
705 regs->eip = (unsigned long)(jp->entry);
706 return 1;
707}
708
709void __kprobes jprobe_return(void)
710{
711 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
712
713 asm volatile (" xchgl %%ebx,%%esp \n"
714 " int3 \n"
715 " .globl jprobe_return_end \n"
716 " jprobe_return_end: \n"
717 " nop \n"::"b"
718 (kcb->jprobe_saved_esp):"memory");
719}
720
721int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
722{
723 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
724 u8 *addr = (u8 *) (regs->eip - 1);
725 unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_esp);
726 struct jprobe *jp = container_of(p, struct jprobe, kp);
727
728 if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) {
729 if (&regs->esp != kcb->jprobe_saved_esp) {
730 struct pt_regs *saved_regs = &kcb->jprobe_saved_regs;
731 printk("current esp %p does not match saved esp %p\n",
732 &regs->esp, kcb->jprobe_saved_esp);
733 printk("Saved registers for jprobe %p\n", jp);
734 show_registers(saved_regs);
735 printk("Current registers\n");
736 show_registers(regs);
737 BUG();
738 }
739 *regs = kcb->jprobe_saved_regs;
740 memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack,
741 MIN_STACK_SIZE(stack_addr));
742 preempt_enable_no_resched();
743 return 1;
744 }
745 return 0;
746}
747
748int __kprobes arch_trampoline_kprobe(struct kprobe *p)
749{
750 return 0;
751}
752
753int __init arch_init_kprobes(void)
754{
755 return 0;
756}
diff --git a/arch/x86/kernel/kprobes_64.c b/arch/x86/kernel/kprobes_64.c
deleted file mode 100644
index 5df19a9f9239..000000000000
--- a/arch/x86/kernel/kprobes_64.c
+++ /dev/null
@@ -1,749 +0,0 @@
1/*
2 * Kernel Probes (KProbes)
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2002, 2004
19 *
20 * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
21 * Probes initial implementation ( includes contributions from
22 * Rusty Russell).
23 * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
24 * interface to access function arguments.
25 * 2004-Oct Jim Keniston <kenistoj@us.ibm.com> and Prasanna S Panchamukhi
26 * <prasanna@in.ibm.com> adapted for x86_64
27 * 2005-Mar Roland McGrath <roland@redhat.com>
28 * Fixed to handle %rip-relative addressing mode correctly.
29 * 2005-May Rusty Lynch <rusty.lynch@intel.com>
30 * Added function return probes functionality
31 */
32
33#include <linux/kprobes.h>
34#include <linux/ptrace.h>
35#include <linux/string.h>
36#include <linux/slab.h>
37#include <linux/preempt.h>
38#include <linux/module.h>
39#include <linux/kdebug.h>
40
41#include <asm/pgtable.h>
42#include <asm/uaccess.h>
43#include <asm/alternative.h>
44
45void jprobe_return_end(void);
46static void __kprobes arch_copy_kprobe(struct kprobe *p);
47
48DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
49DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
50
51struct kretprobe_blackpoint kretprobe_blacklist[] = {
52 {"__switch_to", }, /* This function switches only current task, but
53 doesn't switch kernel stack.*/
54 {NULL, NULL} /* Terminator */
55};
56const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
57
58/*
59 * returns non-zero if opcode modifies the interrupt flag.
60 */
61static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
62{
63 switch (*insn) {
64 case 0xfa: /* cli */
65 case 0xfb: /* sti */
66 case 0xcf: /* iret/iretd */
67 case 0x9d: /* popf/popfd */
68 return 1;
69 }
70
71 if (*insn >= 0x40 && *insn <= 0x4f && *++insn == 0xcf)
72 return 1;
73 return 0;
74}
75
76int __kprobes arch_prepare_kprobe(struct kprobe *p)
77{
78 /* insn: must be on special executable page on x86_64. */
79 p->ainsn.insn = get_insn_slot();
80 if (!p->ainsn.insn) {
81 return -ENOMEM;
82 }
83 arch_copy_kprobe(p);
84 return 0;
85}
86
87/*
88 * Determine if the instruction uses the %rip-relative addressing mode.
89 * If it does, return the address of the 32-bit displacement word.
90 * If not, return null.
91 */
92static s32 __kprobes *is_riprel(u8 *insn)
93{
94#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \
95 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
96 (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
97 (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
98 (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
99 << (row % 64))
100 static const u64 onebyte_has_modrm[256 / 64] = {
101 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
102 /* ------------------------------- */
103 W(0x00, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 00 */
104 W(0x10, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 10 */
105 W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 20 */
106 W(0x30, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0), /* 30 */
107 W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */
108 W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 50 */
109 W(0x60, 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0)| /* 60 */
110 W(0x70, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 70 */
111 W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */
112 W(0x90, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 90 */
113 W(0xa0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* a0 */
114 W(0xb0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* b0 */
115 W(0xc0, 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0)| /* c0 */
116 W(0xd0, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* d0 */
117 W(0xe0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* e0 */
118 W(0xf0, 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1) /* f0 */
119 /* ------------------------------- */
120 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
121 };
122 static const u64 twobyte_has_modrm[256 / 64] = {
123 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
124 /* ------------------------------- */
125 W(0x00, 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1)| /* 0f */
126 W(0x10, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0)| /* 1f */
127 W(0x20, 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1)| /* 2f */
128 W(0x30, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 3f */
129 W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 4f */
130 W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 5f */
131 W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 6f */
132 W(0x70, 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1), /* 7f */
133 W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 8f */
134 W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 9f */
135 W(0xa0, 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1)| /* af */
136 W(0xb0, 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1), /* bf */
137 W(0xc0, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)| /* cf */
138 W(0xd0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* df */
139 W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* ef */
140 W(0xf0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0) /* ff */
141 /* ------------------------------- */
142 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
143 };
144#undef W
145 int need_modrm;
146
147 /* Skip legacy instruction prefixes. */
148 while (1) {
149 switch (*insn) {
150 case 0x66:
151 case 0x67:
152 case 0x2e:
153 case 0x3e:
154 case 0x26:
155 case 0x64:
156 case 0x65:
157 case 0x36:
158 case 0xf0:
159 case 0xf3:
160 case 0xf2:
161 ++insn;
162 continue;
163 }
164 break;
165 }
166
167 /* Skip REX instruction prefix. */
168 if ((*insn & 0xf0) == 0x40)
169 ++insn;
170
171 if (*insn == 0x0f) { /* Two-byte opcode. */
172 ++insn;
173 need_modrm = test_bit(*insn, twobyte_has_modrm);
174 } else { /* One-byte opcode. */
175 need_modrm = test_bit(*insn, onebyte_has_modrm);
176 }
177
178 if (need_modrm) {
179 u8 modrm = *++insn;
180 if ((modrm & 0xc7) == 0x05) { /* %rip+disp32 addressing mode */
181 /* Displacement follows ModRM byte. */
182 return (s32 *) ++insn;
183 }
184 }
185
186 /* No %rip-relative addressing mode here. */
187 return NULL;
188}
189
190static void __kprobes arch_copy_kprobe(struct kprobe *p)
191{
192 s32 *ripdisp;
193 memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE);
194 ripdisp = is_riprel(p->ainsn.insn);
195 if (ripdisp) {
196 /*
197 * The copied instruction uses the %rip-relative
198 * addressing mode. Adjust the displacement for the
199 * difference between the original location of this
200 * instruction and the location of the copy that will
201 * actually be run. The tricky bit here is making sure
202 * that the sign extension happens correctly in this
203 * calculation, since we need a signed 32-bit result to
204 * be sign-extended to 64 bits when it's added to the
205 * %rip value and yield the same 64-bit result that the
206 * sign-extension of the original signed 32-bit
207 * displacement would have given.
208 */
209 s64 disp = (u8 *) p->addr + *ripdisp - (u8 *) p->ainsn.insn;
210 BUG_ON((s64) (s32) disp != disp); /* Sanity check. */
211 *ripdisp = disp;
212 }
213 p->opcode = *p->addr;
214}
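
As a worked check of the displacement adjustment above, with made-up addresses (nothing below comes from the patch; it only mirrors the disp computation and the BUG_ON() sanity check):

    #include <stdint.h>
    #include <assert.h>

    int main(void)
    {
    	uint64_t addr = 0xffffffff80212340ULL;	/* p->addr (hypothetical) */
    	uint64_t copy = 0xffffffff88000000ULL;	/* p->ainsn.insn (hypothetical) */
    	int32_t orig_disp = 0x1000;		/* original %rip-relative displacement */

    	/* New displacement so the copy still reaches the original target;
    	 * the instruction length cancels out of the equation. */
    	int64_t disp = (int64_t)(addr + orig_disp - copy);

    	assert((int64_t)(int32_t)disp == disp);	/* the BUG_ON() sanity check */
    	return 0;
    }

The check matters because the insn slots can sit far from the probed code; if the rebased displacement no longer fits in a signed 32 bits, the copied instruction could not be patched safely.
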
215
216void __kprobes arch_arm_kprobe(struct kprobe *p)
217{
218 text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
219}
220
221void __kprobes arch_disarm_kprobe(struct kprobe *p)
222{
223 text_poke(p->addr, &p->opcode, 1);
224}
225
226void __kprobes arch_remove_kprobe(struct kprobe *p)
227{
228 mutex_lock(&kprobe_mutex);
229 free_insn_slot(p->ainsn.insn, 0);
230 mutex_unlock(&kprobe_mutex);
231}
232
233static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
234{
235 kcb->prev_kprobe.kp = kprobe_running();
236 kcb->prev_kprobe.status = kcb->kprobe_status;
237 kcb->prev_kprobe.old_rflags = kcb->kprobe_old_rflags;
238 kcb->prev_kprobe.saved_rflags = kcb->kprobe_saved_rflags;
239}
240
241static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
242{
243 __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
244 kcb->kprobe_status = kcb->prev_kprobe.status;
245 kcb->kprobe_old_rflags = kcb->prev_kprobe.old_rflags;
246 kcb->kprobe_saved_rflags = kcb->prev_kprobe.saved_rflags;
247}
248
249static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
250 struct kprobe_ctlblk *kcb)
251{
252 __get_cpu_var(current_kprobe) = p;
253 kcb->kprobe_saved_rflags = kcb->kprobe_old_rflags
254 = (regs->eflags & (TF_MASK | IF_MASK));
255 if (is_IF_modifier(p->ainsn.insn))
256 kcb->kprobe_saved_rflags &= ~IF_MASK;
257}
258
259static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
260{
261 regs->eflags |= TF_MASK;
262 regs->eflags &= ~IF_MASK;
 263	/* single-step inline if the instruction is an int3 */
264 if (p->opcode == BREAKPOINT_INSTRUCTION)
265 regs->rip = (unsigned long)p->addr;
266 else
267 regs->rip = (unsigned long)p->ainsn.insn;
268}
269
270/* Called with kretprobe_lock held */
271void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
272 struct pt_regs *regs)
273{
274 unsigned long *sara = (unsigned long *)regs->rsp;
275
276 ri->ret_addr = (kprobe_opcode_t *) *sara;
277 /* Replace the return addr with trampoline addr */
278 *sara = (unsigned long) &kretprobe_trampoline;
279}
280
281int __kprobes kprobe_handler(struct pt_regs *regs)
282{
283 struct kprobe *p;
284 int ret = 0;
285 kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t));
286 struct kprobe_ctlblk *kcb;
287
288 /*
289 * We don't want to be preempted for the entire
290 * duration of kprobe processing
291 */
292 preempt_disable();
293 kcb = get_kprobe_ctlblk();
294
295 /* Check we're not actually recursing */
296 if (kprobe_running()) {
297 p = get_kprobe(addr);
298 if (p) {
299 if (kcb->kprobe_status == KPROBE_HIT_SS &&
300 *p->ainsn.insn == BREAKPOINT_INSTRUCTION) {
301 regs->eflags &= ~TF_MASK;
302 regs->eflags |= kcb->kprobe_saved_rflags;
303 goto no_kprobe;
304 } else if (kcb->kprobe_status == KPROBE_HIT_SSDONE) {
305 /* TODO: Provide re-entrancy from
 306				 * post_kprobe_handler() and avoid exception
307 * stack corruption while single-stepping on
308 * the instruction of the new probe.
309 */
310 arch_disarm_kprobe(p);
311 regs->rip = (unsigned long)p->addr;
312 reset_current_kprobe();
313 ret = 1;
314 } else {
 315				/* We have re-entered kprobe_handler() because
 316				 * another probe was hit while we were inside
 317				 * the handler. Save the original kprobe
 318				 * variables and single-step the instruction
 319				 * of the new probe without calling any
 320				 * user handlers.
 321				 */
322 save_previous_kprobe(kcb);
323 set_current_kprobe(p, regs, kcb);
324 kprobes_inc_nmissed_count(p);
325 prepare_singlestep(p, regs);
326 kcb->kprobe_status = KPROBE_REENTER;
327 return 1;
328 }
329 } else {
330 if (*addr != BREAKPOINT_INSTRUCTION) {
 331			/* The breakpoint instruction was removed by
 332			 * another cpu right after we hit it; no further
 333			 * handling of this interrupt is appropriate.
 334			 */
335 regs->rip = (unsigned long)addr;
336 ret = 1;
337 goto no_kprobe;
338 }
339 p = __get_cpu_var(current_kprobe);
340 if (p->break_handler && p->break_handler(p, regs)) {
341 goto ss_probe;
342 }
343 }
344 goto no_kprobe;
345 }
346
347 p = get_kprobe(addr);
348 if (!p) {
349 if (*addr != BREAKPOINT_INSTRUCTION) {
350 /*
351 * The breakpoint instruction was removed right
352 * after we hit it. Another cpu has removed
353 * either a probepoint or a debugger breakpoint
354 * at this address. In either case, no further
355 * handling of this interrupt is appropriate.
356 * Back up over the (now missing) int3 and run
357 * the original instruction.
358 */
359 regs->rip = (unsigned long)addr;
360 ret = 1;
361 }
362 /* Not one of ours: let kernel handle it */
363 goto no_kprobe;
364 }
365
366 set_current_kprobe(p, regs, kcb);
367 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
368
369 if (p->pre_handler && p->pre_handler(p, regs))
370 /* handler has already set things up, so skip ss setup */
371 return 1;
372
373ss_probe:
374 prepare_singlestep(p, regs);
375 kcb->kprobe_status = KPROBE_HIT_SS;
376 return 1;
377
378no_kprobe:
379 preempt_enable_no_resched();
380 return ret;
381}
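
For context, this breakpoint path is what fires when a probe registered through the generic kprobes API is hit. A minimal module that would exercise it might look like the sketch below; the probed symbol and the message are examples only, not part of the patch:

    #include <linux/module.h>
    #include <linux/kprobes.h>

    static int my_pre(struct kprobe *p, struct pt_regs *regs)
    {
    	/* Runs from kprobe_handler() with preemption disabled. */
    	printk(KERN_INFO "kprobe hit at %p, rip=%lx\n", p->addr, regs->rip);
    	return 0;	/* let the normal single-step path continue */
    }

    static struct kprobe kp = {
    	.symbol_name	= "do_fork",	/* example target */
    	.pre_handler	= my_pre,
    };

    static int __init kp_init(void)
    {
    	return register_kprobe(&kp);
    }

    static void __exit kp_exit(void)
    {
    	unregister_kprobe(&kp);
    }

    module_init(kp_init);
    module_exit(kp_exit);
    MODULE_LICENSE("GPL");
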
382
383/*
384 * For function-return probes, init_kprobes() establishes a probepoint
385 * here. When a retprobed function returns, this probe is hit and
386 * trampoline_probe_handler() runs, calling the kretprobe's handler.
387 */
388 void kretprobe_trampoline_holder(void)
389 {
390 asm volatile ( ".global kretprobe_trampoline\n"
391 "kretprobe_trampoline: \n"
392 "nop\n");
393 }
394
395/*
396 * Called when we hit the probe point at kretprobe_trampoline
397 */
398int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
399{
400 struct kretprobe_instance *ri = NULL;
401 struct hlist_head *head, empty_rp;
402 struct hlist_node *node, *tmp;
403 unsigned long flags, orig_ret_address = 0;
 404	unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
405
406 INIT_HLIST_HEAD(&empty_rp);
407 spin_lock_irqsave(&kretprobe_lock, flags);
408 head = kretprobe_inst_table_head(current);
409
410 /*
411 * It is possible to have multiple instances associated with a given
 412	 * task either because multiple functions in the call path
 413	 * have a return probe installed on them, and/or more than
 414	 * one return probe was registered for a target function.
415 *
416 * We can handle this because:
417 * - instances are always inserted at the head of the list
418 * - when multiple return probes are registered for the same
419 * function, the first instance's ret_addr will point to the
420 * real return address, and all the rest will point to
421 * kretprobe_trampoline
422 */
423 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
424 if (ri->task != current)
425 /* another task is sharing our hash bucket */
426 continue;
427
428 if (ri->rp && ri->rp->handler)
429 ri->rp->handler(ri, regs);
430
431 orig_ret_address = (unsigned long)ri->ret_addr;
432 recycle_rp_inst(ri, &empty_rp);
433
434 if (orig_ret_address != trampoline_address)
435 /*
436 * This is the real return address. Any other
437 * instances associated with this task are for
438 * other calls deeper on the call stack
439 */
440 break;
441 }
442
443 kretprobe_assert(ri, orig_ret_address, trampoline_address);
444 regs->rip = orig_ret_address;
445
446 reset_current_kprobe();
447 spin_unlock_irqrestore(&kretprobe_lock, flags);
448 preempt_enable_no_resched();
449
450 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
451 hlist_del(&ri->hlist);
452 kfree(ri);
453 }
454 /*
455 * By returning a non-zero value, we are telling
456 * kprobe_handler() that we don't want the post_handler
457 * to run (and have re-enabled preemption)
458 */
459 return 1;
460}
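
A return probe that would be dispatched by this trampoline handler is registered roughly as in the sketch below; the target symbol and field values are illustrative:

    #include <linux/kprobes.h>

    static int my_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
    {
    	/* Invoked from trampoline_probe_handler(); ri->ret_addr holds the
    	 * real return address that regs->rip is restored to, and on x86_64
    	 * rax carries the probed function's return value. */
    	printk(KERN_INFO "return to %p, rax=%lx\n", ri->ret_addr, regs->rax);
    	return 0;
    }

    static struct kretprobe my_rp = {
    	.handler		= my_ret_handler,
    	.kp.symbol_name		= "sys_open",	/* example target */
    	.maxactive		= 16,	/* kretprobe_instance objects pre-allocated */
    };

    /* In module init: register_kretprobe(&my_rp); on exit: unregister_kretprobe(&my_rp). */
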
461
462/*
463 * Called after single-stepping. p->addr is the address of the
464 * instruction whose first byte has been replaced by the "int 3"
465 * instruction. To avoid the SMP problems that can occur when we
466 * temporarily put back the original opcode to single-step, we
467 * single-stepped a copy of the instruction. The address of this
468 * copy is p->ainsn.insn.
469 *
470 * This function prepares to return from the post-single-step
471 * interrupt. We have to fix up the stack as follows:
472 *
473 * 0) Except in the case of absolute or indirect jump or call instructions,
474 * the new rip is relative to the copied instruction. We need to make
475 * it relative to the original instruction.
476 *
477 * 1) If the single-stepped instruction was pushfl, then the TF and IF
478 * flags are set in the just-pushed eflags, and may need to be cleared.
479 *
480 * 2) If the single-stepped instruction was a call, the return address
481 * that is atop the stack is the address following the copied instruction.
482 * We need to make it the address following the original instruction.
483 */
484static void __kprobes resume_execution(struct kprobe *p,
485 struct pt_regs *regs, struct kprobe_ctlblk *kcb)
486{
487 unsigned long *tos = (unsigned long *)regs->rsp;
488 unsigned long copy_rip = (unsigned long)p->ainsn.insn;
489 unsigned long orig_rip = (unsigned long)p->addr;
490 kprobe_opcode_t *insn = p->ainsn.insn;
491
 492	/* Skip the REX prefix. */
493 if (*insn >= 0x40 && *insn <= 0x4f)
494 insn++;
495
496 regs->eflags &= ~TF_MASK;
497 switch (*insn) {
498 case 0x9c: /* pushfl */
499 *tos &= ~(TF_MASK | IF_MASK);
500 *tos |= kcb->kprobe_old_rflags;
501 break;
502 case 0xc2: /* iret/ret/lret */
503 case 0xc3:
504 case 0xca:
505 case 0xcb:
506 case 0xcf:
507 case 0xea: /* jmp absolute -- ip is correct */
508 /* ip is already adjusted, no more changes required */
509 goto no_change;
510 case 0xe8: /* call relative - Fix return addr */
511 *tos = orig_rip + (*tos - copy_rip);
512 break;
513 case 0xff:
514 if ((insn[1] & 0x30) == 0x10) {
515 /* call absolute, indirect */
516 /* Fix return addr; ip is correct. */
517 *tos = orig_rip + (*tos - copy_rip);
518 goto no_change;
519 } else if (((insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */
520 ((insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */
521 /* ip is correct. */
522 goto no_change;
523 }
524 default:
525 break;
526 }
527
528 regs->rip = orig_rip + (regs->rip - copy_rip);
529no_change:
530
531 return;
532}
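
Plugging hypothetical numbers into the call-relative branch (case 0xe8) makes the return-address fix-up concrete; the addresses below are invented for illustration:

    #include <stdio.h>

    int main(void)
    {
    	unsigned long orig_rip = 0xffffffff80212340UL;	/* p->addr: the probed call insn */
    	unsigned long copy_rip = 0xffffffff88000000UL;	/* p->ainsn.insn: the stepped copy */
    	unsigned long tos      = copy_rip + 5;	/* CPU pushed the address after the copy */

    	tos = orig_rip + (tos - copy_rip);	/* rebase: address after the original call */
    	printf("fixed return address: %#lx\n", tos);	/* prints orig_rip + 5 */
    	return 0;
    }
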
533
534int __kprobes post_kprobe_handler(struct pt_regs *regs)
535{
536 struct kprobe *cur = kprobe_running();
537 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
538
539 if (!cur)
540 return 0;
541
542 if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
543 kcb->kprobe_status = KPROBE_HIT_SSDONE;
544 cur->post_handler(cur, regs, 0);
545 }
546
547 resume_execution(cur, regs, kcb);
548 regs->eflags |= kcb->kprobe_saved_rflags;
549 trace_hardirqs_fixup_flags(regs->eflags);
550
551 /* Restore the original saved kprobes variables and continue. */
552 if (kcb->kprobe_status == KPROBE_REENTER) {
553 restore_previous_kprobe(kcb);
554 goto out;
555 }
556 reset_current_kprobe();
557out:
558 preempt_enable_no_resched();
559
560 /*
 561	 * If somebody else is single-stepping across a probe point, eflags
 562	 * will have TF set; in that case, continue the remaining processing
 563	 * of do_debug as if this were not a probe hit.
564 */
565 if (regs->eflags & TF_MASK)
566 return 0;
567
568 return 1;
569}
570
571int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
572{
573 struct kprobe *cur = kprobe_running();
574 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
575 const struct exception_table_entry *fixup;
576
577 switch(kcb->kprobe_status) {
578 case KPROBE_HIT_SS:
579 case KPROBE_REENTER:
580 /*
 581		 * We are here because the instruction being single-
 582		 * stepped caused a page fault. We reset the current
 583		 * kprobe, point rip back to the probe address and
 584		 * allow the page fault handler to continue as a
585 * normal page fault.
586 */
587 regs->rip = (unsigned long)cur->addr;
588 regs->eflags |= kcb->kprobe_old_rflags;
589 if (kcb->kprobe_status == KPROBE_REENTER)
590 restore_previous_kprobe(kcb);
591 else
592 reset_current_kprobe();
593 preempt_enable_no_resched();
594 break;
595 case KPROBE_HIT_ACTIVE:
596 case KPROBE_HIT_SSDONE:
597 /*
 598		 * We increment the nmissed count for accounting;
 599		 * we can also use the npre/npostfault counts for accounting
600 * these specific fault cases.
601 */
602 kprobes_inc_nmissed_count(cur);
603
604 /*
 605		 * We come here because an instruction in the pre/post
 606		 * handler caused the page fault; this could happen
 607		 * if the handler tries to access user space via
 608		 * copy_from_user(), get_user(), etc. Let the
609 * user-specified handler try to fix it first.
610 */
611 if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
612 return 1;
613
614 /*
615 * In case the user-specified fault handler returned
616 * zero, try to fix up.
617 */
618 fixup = search_exception_tables(regs->rip);
619 if (fixup) {
620 regs->rip = fixup->fixup;
621 return 1;
622 }
623
624 /*
 625		 * fixup() could not handle it;
 626		 * let do_page_fault() fix it.
627 */
628 break;
629 default:
630 break;
631 }
632 return 0;
633}
634
635/*
636 * Wrapper routine for handling exceptions.
637 */
638int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
639 unsigned long val, void *data)
640{
641 struct die_args *args = (struct die_args *)data;
642 int ret = NOTIFY_DONE;
643
644 if (args->regs && user_mode(args->regs))
645 return ret;
646
647 switch (val) {
648 case DIE_INT3:
649 if (kprobe_handler(args->regs))
650 ret = NOTIFY_STOP;
651 break;
652 case DIE_DEBUG:
653 if (post_kprobe_handler(args->regs))
654 ret = NOTIFY_STOP;
655 break;
656 case DIE_GPF:
657 /* kprobe_running() needs smp_processor_id() */
658 preempt_disable();
659 if (kprobe_running() &&
660 kprobe_fault_handler(args->regs, args->trapnr))
661 ret = NOTIFY_STOP;
662 preempt_enable();
663 break;
664 default:
665 break;
666 }
667 return ret;
668}
669
670int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
671{
672 struct jprobe *jp = container_of(p, struct jprobe, kp);
673 unsigned long addr;
674 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
675
676 kcb->jprobe_saved_regs = *regs;
677 kcb->jprobe_saved_rsp = (long *) regs->rsp;
678 addr = (unsigned long)(kcb->jprobe_saved_rsp);
679 /*
680 * As Linus pointed out, gcc assumes that the callee
681 * owns the argument space and could overwrite it, e.g.
682 * tailcall optimization. So, to be absolutely safe
683 * we also save and restore enough stack bytes to cover
684 * the argument area.
685 */
686 memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr,
687 MIN_STACK_SIZE(addr));
688 regs->eflags &= ~IF_MASK;
689 trace_hardirqs_off();
690 regs->rip = (unsigned long)(jp->entry);
691 return 1;
692}
693
694void __kprobes jprobe_return(void)
695{
696 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
697
698 asm volatile (" xchg %%rbx,%%rsp \n"
699 " int3 \n"
700 " .globl jprobe_return_end \n"
701 " jprobe_return_end: \n"
702 " nop \n"::"b"
703 (kcb->jprobe_saved_rsp):"memory");
704}
705
706int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
707{
708 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
709 u8 *addr = (u8 *) (regs->rip - 1);
710 unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_rsp);
711 struct jprobe *jp = container_of(p, struct jprobe, kp);
712
713 if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) {
714 if ((unsigned long *)regs->rsp != kcb->jprobe_saved_rsp) {
715 struct pt_regs *saved_regs = &kcb->jprobe_saved_regs;
716 printk("current rsp %p does not match saved rsp %p\n",
717 (long *)regs->rsp, kcb->jprobe_saved_rsp);
718 printk("Saved registers for jprobe %p\n", jp);
719 show_registers(saved_regs);
720 printk("Current registers\n");
721 show_registers(regs);
722 BUG();
723 }
724 *regs = kcb->jprobe_saved_regs;
725 memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack,
726 MIN_STACK_SIZE(stack_addr));
727 preempt_enable_no_resched();
728 return 1;
729 }
730 return 0;
731}
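
setjmp_pre_handler() and longjmp_break_handler() above implement jprobes. A hedged sketch of a jprobe registration against this kernel generation's API follows; the target function and its prototype are assumptions (do_fork's signature differs across versions) and the message is an example:

    #include <linux/kprobes.h>

    /* The jprobe handler mirrors the probed function's prototype and must
     * end with jprobe_return(), which triggers longjmp_break_handler(). */
    static long my_do_fork(unsigned long clone_flags, unsigned long stack_start,
    		       struct pt_regs *regs, unsigned long stack_size,
    		       int __user *parent_tidptr, int __user *child_tidptr)
    {
    	printk(KERN_INFO "do_fork: clone_flags=%lx\n", clone_flags);
    	jprobe_return();
    	return 0;	/* never reached */
    }

    static struct jprobe my_jp = {
    	.entry		= (kprobe_opcode_t *)my_do_fork,
    	.kp.symbol_name	= "do_fork",	/* example target */
    };

    /* register_jprobe(&my_jp) arms it; unregister_jprobe(&my_jp) removes it. */
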
732
733static struct kprobe trampoline_p = {
734 .addr = (kprobe_opcode_t *) &kretprobe_trampoline,
735 .pre_handler = trampoline_probe_handler
736};
737
738int __init arch_init_kprobes(void)
739{
740 return register_kprobe(&trampoline_p);
741}
742
743int __kprobes arch_trampoline_kprobe(struct kprobe *p)
744{
745 if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline)
746 return 1;
747
748 return 0;
749}
diff --git a/arch/x86/kernel/ldt_32.c b/arch/x86/kernel/ldt.c
index 9ff90a27c45f..8a7660c8394a 100644
--- a/arch/x86/kernel/ldt_32.c
+++ b/arch/x86/kernel/ldt.c
@@ -1,6 +1,9 @@
1/* 1/*
2 * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds 2 * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
3 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> 3 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
4 * Copyright (C) 2002 Andi Kleen
5 *
6 * This handles calls from both 32bit and 64bit mode.
4 */ 7 */
5 8
6#include <linux/errno.h> 9#include <linux/errno.h>
@@ -9,7 +12,6 @@
9#include <linux/mm.h> 12#include <linux/mm.h>
10#include <linux/smp.h> 13#include <linux/smp.h>
11#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
12#include <linux/slab.h>
13 15
14#include <asm/uaccess.h> 16#include <asm/uaccess.h>
15#include <asm/system.h> 17#include <asm/system.h>
@@ -17,7 +19,7 @@
17#include <asm/desc.h> 19#include <asm/desc.h>
18#include <asm/mmu_context.h> 20#include <asm/mmu_context.h>
19 21
20#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ 22#ifdef CONFIG_SMP
21static void flush_ldt(void *null) 23static void flush_ldt(void *null)
22{ 24{
23 if (current->active_mm) 25 if (current->active_mm)
@@ -27,26 +29,31 @@ static void flush_ldt(void *null)
27 29
28static int alloc_ldt(mm_context_t *pc, int mincount, int reload) 30static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
29{ 31{
30 void *oldldt; 32 void *oldldt, *newldt;
31 void *newldt;
32 int oldsize; 33 int oldsize;
33 34
34 if (mincount <= pc->size) 35 if (mincount <= pc->size)
35 return 0; 36 return 0;
36 oldsize = pc->size; 37 oldsize = pc->size;
37 mincount = (mincount+511)&(~511); 38 mincount = (mincount + 511) & (~511);
38 if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) 39 if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
39 newldt = vmalloc(mincount*LDT_ENTRY_SIZE); 40 newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
40 else 41 else
41 newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); 42 newldt = (void *)__get_free_page(GFP_KERNEL);
42 43
43 if (!newldt) 44 if (!newldt)
44 return -ENOMEM; 45 return -ENOMEM;
45 46
46 if (oldsize) 47 if (oldsize)
47 memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); 48 memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE);
48 oldldt = pc->ldt; 49 oldldt = pc->ldt;
49 memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); 50 memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
51 (mincount - oldsize) * LDT_ENTRY_SIZE);
52
53#ifdef CONFIG_X86_64
54 /* CHECKME: Do we really need this ? */
55 wmb();
56#endif
50 pc->ldt = newldt; 57 pc->ldt = newldt;
51 wmb(); 58 wmb();
52 pc->size = mincount; 59 pc->size = mincount;
@@ -55,6 +62,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
55 if (reload) { 62 if (reload) {
56#ifdef CONFIG_SMP 63#ifdef CONFIG_SMP
57 cpumask_t mask; 64 cpumask_t mask;
65
58 preempt_disable(); 66 preempt_disable();
59 load_LDT(pc); 67 load_LDT(pc);
60 mask = cpumask_of_cpu(smp_processor_id()); 68 mask = cpumask_of_cpu(smp_processor_id());
@@ -66,10 +74,10 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
66#endif 74#endif
67 } 75 }
68 if (oldsize) { 76 if (oldsize) {
69 if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) 77 if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
70 vfree(oldldt); 78 vfree(oldldt);
71 else 79 else
72 kfree(oldldt); 80 put_page(virt_to_page(oldldt));
73 } 81 }
74 return 0; 82 return 0;
75} 83}
@@ -77,9 +85,10 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
77static inline int copy_ldt(mm_context_t *new, mm_context_t *old) 85static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
78{ 86{
79 int err = alloc_ldt(new, old->size, 0); 87 int err = alloc_ldt(new, old->size, 0);
88
80 if (err < 0) 89 if (err < 0)
81 return err; 90 return err;
82 memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); 91 memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE);
83 return 0; 92 return 0;
84} 93}
85 94
@@ -89,7 +98,7 @@ static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
89 */ 98 */
90int init_new_context(struct task_struct *tsk, struct mm_struct *mm) 99int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
91{ 100{
92 struct mm_struct * old_mm; 101 struct mm_struct *old_mm;
93 int retval = 0; 102 int retval = 0;
94 103
95 mutex_init(&mm->context.lock); 104 mutex_init(&mm->context.lock);
@@ -105,33 +114,38 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
105 114
106/* 115/*
107 * No need to lock the MM as we are the last user 116 * No need to lock the MM as we are the last user
117 *
118 * 64bit: Don't touch the LDT register - we're already in the next thread.
108 */ 119 */
109void destroy_context(struct mm_struct *mm) 120void destroy_context(struct mm_struct *mm)
110{ 121{
111 if (mm->context.size) { 122 if (mm->context.size) {
123#ifdef CONFIG_X86_32
124 /* CHECKME: Can this ever happen ? */
112 if (mm == current->active_mm) 125 if (mm == current->active_mm)
113 clear_LDT(); 126 clear_LDT();
114 if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) 127#endif
128 if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
115 vfree(mm->context.ldt); 129 vfree(mm->context.ldt);
116 else 130 else
117 kfree(mm->context.ldt); 131 put_page(virt_to_page(mm->context.ldt));
118 mm->context.size = 0; 132 mm->context.size = 0;
119 } 133 }
120} 134}
121 135
122static int read_ldt(void __user * ptr, unsigned long bytecount) 136static int read_ldt(void __user *ptr, unsigned long bytecount)
123{ 137{
124 int err; 138 int err;
125 unsigned long size; 139 unsigned long size;
126 struct mm_struct * mm = current->mm; 140 struct mm_struct *mm = current->mm;
127 141
128 if (!mm->context.size) 142 if (!mm->context.size)
129 return 0; 143 return 0;
130 if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) 144 if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
131 bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; 145 bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
132 146
133 mutex_lock(&mm->context.lock); 147 mutex_lock(&mm->context.lock);
134 size = mm->context.size*LDT_ENTRY_SIZE; 148 size = mm->context.size * LDT_ENTRY_SIZE;
135 if (size > bytecount) 149 if (size > bytecount)
136 size = bytecount; 150 size = bytecount;
137 151
@@ -143,7 +157,7 @@ static int read_ldt(void __user * ptr, unsigned long bytecount)
143 goto error_return; 157 goto error_return;
144 if (size != bytecount) { 158 if (size != bytecount) {
145 /* zero-fill the rest */ 159 /* zero-fill the rest */
146 if (clear_user(ptr+size, bytecount-size) != 0) { 160 if (clear_user(ptr + size, bytecount - size) != 0) {
147 err = -EFAULT; 161 err = -EFAULT;
148 goto error_return; 162 goto error_return;
149 } 163 }
@@ -153,34 +167,32 @@ error_return:
153 return err; 167 return err;
154} 168}
155 169
156static int read_default_ldt(void __user * ptr, unsigned long bytecount) 170static int read_default_ldt(void __user *ptr, unsigned long bytecount)
157{ 171{
158 int err; 172 /* CHECKME: Can we use _one_ random number ? */
159 unsigned long size; 173#ifdef CONFIG_X86_32
160 174 unsigned long size = 5 * sizeof(struct desc_struct);
161 err = 0; 175#else
162 size = 5*sizeof(struct desc_struct); 176 unsigned long size = 128;
163 if (size > bytecount) 177#endif
164 size = bytecount; 178 if (bytecount > size)
165 179 bytecount = size;
166 err = size; 180 if (clear_user(ptr, bytecount))
167 if (clear_user(ptr, size)) 181 return -EFAULT;
168 err = -EFAULT; 182 return bytecount;
169
170 return err;
171} 183}
172 184
173static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) 185static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
174{ 186{
175 struct mm_struct * mm = current->mm; 187 struct mm_struct *mm = current->mm;
176 __u32 entry_1, entry_2; 188 struct desc_struct ldt;
177 int error; 189 int error;
178 struct user_desc ldt_info; 190 struct user_desc ldt_info;
179 191
180 error = -EINVAL; 192 error = -EINVAL;
181 if (bytecount != sizeof(ldt_info)) 193 if (bytecount != sizeof(ldt_info))
182 goto out; 194 goto out;
183 error = -EFAULT; 195 error = -EFAULT;
184 if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) 196 if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
185 goto out; 197 goto out;
186 198
@@ -196,28 +208,27 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
196 208
197 mutex_lock(&mm->context.lock); 209 mutex_lock(&mm->context.lock);
198 if (ldt_info.entry_number >= mm->context.size) { 210 if (ldt_info.entry_number >= mm->context.size) {
199 error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1); 211 error = alloc_ldt(&current->mm->context,
212 ldt_info.entry_number + 1, 1);
200 if (error < 0) 213 if (error < 0)
201 goto out_unlock; 214 goto out_unlock;
202 } 215 }
203 216
204 /* Allow LDTs to be cleared by the user. */ 217 /* Allow LDTs to be cleared by the user. */
205 if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { 218 if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
206 if (oldmode || LDT_empty(&ldt_info)) { 219 if (oldmode || LDT_empty(&ldt_info)) {
207 entry_1 = 0; 220 memset(&ldt, 0, sizeof(ldt));
208 entry_2 = 0;
209 goto install; 221 goto install;
210 } 222 }
211 } 223 }
212 224
213 entry_1 = LDT_entry_a(&ldt_info); 225 fill_ldt(&ldt, &ldt_info);
214 entry_2 = LDT_entry_b(&ldt_info);
215 if (oldmode) 226 if (oldmode)
216 entry_2 &= ~(1 << 20); 227 ldt.avl = 0;
217 228
218 /* Install the new entry ... */ 229 /* Install the new entry ... */
219install: 230install:
220 write_ldt_entry(mm->context.ldt, ldt_info.entry_number, entry_1, entry_2); 231 write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt);
221 error = 0; 232 error = 0;
222 233
223out_unlock: 234out_unlock:
@@ -226,7 +237,8 @@ out:
226 return error; 237 return error;
227} 238}
228 239
229asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) 240asmlinkage int sys_modify_ldt(int func, void __user *ptr,
241 unsigned long bytecount)
230{ 242{
231 int ret = -ENOSYS; 243 int ret = -ENOSYS;
232 244
diff --git a/arch/x86/kernel/ldt_64.c b/arch/x86/kernel/ldt_64.c
deleted file mode 100644
index 60e57abb8e90..000000000000
--- a/arch/x86/kernel/ldt_64.c
+++ /dev/null
@@ -1,250 +0,0 @@
1/*
2 * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
3 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
4 * Copyright (C) 2002 Andi Kleen
5 *
6 * This handles calls from both 32bit and 64bit mode.
7 */
8
9#include <linux/errno.h>
10#include <linux/sched.h>
11#include <linux/string.h>
12#include <linux/mm.h>
13#include <linux/smp.h>
14#include <linux/vmalloc.h>
15#include <linux/slab.h>
16
17#include <asm/uaccess.h>
18#include <asm/system.h>
19#include <asm/ldt.h>
20#include <asm/desc.h>
21#include <asm/proto.h>
22
23#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */
24static void flush_ldt(void *null)
25{
26 if (current->active_mm)
27 load_LDT(&current->active_mm->context);
28}
29#endif
30
31static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload)
32{
33 void *oldldt;
34 void *newldt;
35 unsigned oldsize;
36
37 if (mincount <= (unsigned)pc->size)
38 return 0;
39 oldsize = pc->size;
40 mincount = (mincount+511)&(~511);
41 if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
42 newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
43 else
44 newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
45
46 if (!newldt)
47 return -ENOMEM;
48
49 if (oldsize)
50 memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
51 oldldt = pc->ldt;
52 memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
53 wmb();
54 pc->ldt = newldt;
55 wmb();
56 pc->size = mincount;
57 wmb();
58 if (reload) {
59#ifdef CONFIG_SMP
60 cpumask_t mask;
61
62 preempt_disable();
63 mask = cpumask_of_cpu(smp_processor_id());
64 load_LDT(pc);
65 if (!cpus_equal(current->mm->cpu_vm_mask, mask))
66 smp_call_function(flush_ldt, NULL, 1, 1);
67 preempt_enable();
68#else
69 load_LDT(pc);
70#endif
71 }
72 if (oldsize) {
73 if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
74 vfree(oldldt);
75 else
76 kfree(oldldt);
77 }
78 return 0;
79}
80
81static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
82{
83 int err = alloc_ldt(new, old->size, 0);
84 if (err < 0)
85 return err;
86 memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
87 return 0;
88}
89
90/*
91 * we do not have to muck with descriptors here, that is
92 * done in switch_mm() as needed.
93 */
94int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
95{
96 struct mm_struct * old_mm;
97 int retval = 0;
98
99 mutex_init(&mm->context.lock);
100 mm->context.size = 0;
101 old_mm = current->mm;
102 if (old_mm && old_mm->context.size > 0) {
103 mutex_lock(&old_mm->context.lock);
104 retval = copy_ldt(&mm->context, &old_mm->context);
105 mutex_unlock(&old_mm->context.lock);
106 }
107 return retval;
108}
109
110/*
111 *
112 * Don't touch the LDT register - we're already in the next thread.
113 */
114void destroy_context(struct mm_struct *mm)
115{
116 if (mm->context.size) {
117 if ((unsigned)mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
118 vfree(mm->context.ldt);
119 else
120 kfree(mm->context.ldt);
121 mm->context.size = 0;
122 }
123}
124
125static int read_ldt(void __user * ptr, unsigned long bytecount)
126{
127 int err;
128 unsigned long size;
129 struct mm_struct * mm = current->mm;
130
131 if (!mm->context.size)
132 return 0;
133 if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
134 bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
135
136 mutex_lock(&mm->context.lock);
137 size = mm->context.size*LDT_ENTRY_SIZE;
138 if (size > bytecount)
139 size = bytecount;
140
141 err = 0;
142 if (copy_to_user(ptr, mm->context.ldt, size))
143 err = -EFAULT;
144 mutex_unlock(&mm->context.lock);
145 if (err < 0)
146 goto error_return;
147 if (size != bytecount) {
148 /* zero-fill the rest */
149 if (clear_user(ptr+size, bytecount-size) != 0) {
150 err = -EFAULT;
151 goto error_return;
152 }
153 }
154 return bytecount;
155error_return:
156 return err;
157}
158
159static int read_default_ldt(void __user * ptr, unsigned long bytecount)
160{
161 /* Arbitrary number */
162 /* x86-64 default LDT is all zeros */
163 if (bytecount > 128)
164 bytecount = 128;
165 if (clear_user(ptr, bytecount))
166 return -EFAULT;
167 return bytecount;
168}
169
170static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
171{
172 struct task_struct *me = current;
173 struct mm_struct * mm = me->mm;
174 __u32 entry_1, entry_2, *lp;
175 int error;
176 struct user_desc ldt_info;
177
178 error = -EINVAL;
179
180 if (bytecount != sizeof(ldt_info))
181 goto out;
182 error = -EFAULT;
183 if (copy_from_user(&ldt_info, ptr, bytecount))
184 goto out;
185
186 error = -EINVAL;
187 if (ldt_info.entry_number >= LDT_ENTRIES)
188 goto out;
189 if (ldt_info.contents == 3) {
190 if (oldmode)
191 goto out;
192 if (ldt_info.seg_not_present == 0)
193 goto out;
194 }
195
196 mutex_lock(&mm->context.lock);
197 if (ldt_info.entry_number >= (unsigned)mm->context.size) {
198 error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
199 if (error < 0)
200 goto out_unlock;
201 }
202
203 lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
204
205 /* Allow LDTs to be cleared by the user. */
206 if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
207 if (oldmode || LDT_empty(&ldt_info)) {
208 entry_1 = 0;
209 entry_2 = 0;
210 goto install;
211 }
212 }
213
214 entry_1 = LDT_entry_a(&ldt_info);
215 entry_2 = LDT_entry_b(&ldt_info);
216 if (oldmode)
217 entry_2 &= ~(1 << 20);
218
219 /* Install the new entry ... */
220install:
221 *lp = entry_1;
222 *(lp+1) = entry_2;
223 error = 0;
224
225out_unlock:
226 mutex_unlock(&mm->context.lock);
227out:
228 return error;
229}
230
231asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
232{
233 int ret = -ENOSYS;
234
235 switch (func) {
236 case 0:
237 ret = read_ldt(ptr, bytecount);
238 break;
239 case 1:
240 ret = write_ldt(ptr, bytecount, 1);
241 break;
242 case 2:
243 ret = read_default_ldt(ptr, bytecount);
244 break;
245 case 0x11:
246 ret = write_ldt(ptr, bytecount, 0);
247 break;
248 }
249 return ret;
250}
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 11b935f4f886..c1cfd60639d4 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -32,7 +32,7 @@ static u32 kexec_pte1[1024] PAGE_ALIGNED;
32 32
33static void set_idt(void *newidt, __u16 limit) 33static void set_idt(void *newidt, __u16 limit)
34{ 34{
35 struct Xgt_desc_struct curidt; 35 struct desc_ptr curidt;
36 36
37 /* ia32 supports unaliged loads & stores */ 37 /* ia32 supports unaliged loads & stores */
38 curidt.size = limit; 38 curidt.size = limit;
@@ -44,7 +44,7 @@ static void set_idt(void *newidt, __u16 limit)
44 44
45static void set_gdt(void *newgdt, __u16 limit) 45static void set_gdt(void *newgdt, __u16 limit)
46{ 46{
47 struct Xgt_desc_struct curgdt; 47 struct desc_ptr curgdt;
48 48
49 /* ia32 supports unaligned loads & stores */ 49 /* ia32 supports unaligned loads & stores */
50 curgdt.size = limit; 50 curgdt.size = limit;
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index aa3d2c8f7737..a1fef42f8cdb 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -234,10 +234,5 @@ NORET_TYPE void machine_kexec(struct kimage *image)
234void arch_crash_save_vmcoreinfo(void) 234void arch_crash_save_vmcoreinfo(void)
235{ 235{
236 VMCOREINFO_SYMBOL(init_level4_pgt); 236 VMCOREINFO_SYMBOL(init_level4_pgt);
237
238#ifdef CONFIG_ARCH_DISCONTIGMEM_ENABLE
239 VMCOREINFO_SYMBOL(node_data);
240 VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
241#endif
242} 237}
243 238
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 3960ab7e1497..219f86eb6123 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -63,6 +63,21 @@ static int __init mfgpt_disable(char *s)
63} 63}
64__setup("nomfgpt", mfgpt_disable); 64__setup("nomfgpt", mfgpt_disable);
65 65
66/* Reset the MFGPT timers. This is required by some broken BIOSes which already
67 * do the same and leave the system in an unstable state. TinyBIOS 0.98 is
68 * affected at least (0.99 is OK with MFGPT workaround left to off).
69 */
70static int __init mfgpt_fix(char *s)
71{
72 u32 val, dummy;
73
 74	/* The following undocumented bit resets the MFGPT timers */
75 val = 0xFF; dummy = 0;
76 wrmsr(0x5140002B, val, dummy);
77 return 1;
78}
79__setup("mfgptfix", mfgpt_fix);
80
66/* 81/*
67 * Check whether any MFGPTs are available for the kernel to use. In most 82 * Check whether any MFGPTs are available for the kernel to use. In most
68 * cases, firmware that uses AMD's VSA code will claim all timers during 83 * cases, firmware that uses AMD's VSA code will claim all timers during
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
index 40cfd5488719..6ff447f9fda7 100644
--- a/arch/x86/kernel/microcode.c
+++ b/arch/x86/kernel/microcode.c
@@ -244,8 +244,8 @@ static int microcode_sanity_check(void *mc)
244 return 0; 244 return 0;
245 /* check extended signature checksum */ 245 /* check extended signature checksum */
246 for (i = 0; i < ext_sigcount; i++) { 246 for (i = 0; i < ext_sigcount; i++) {
247 ext_sig = (struct extended_signature *)((void *)ext_header 247 ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
248 + EXT_HEADER_SIZE + EXT_SIGNATURE_SIZE * i); 248 EXT_SIGNATURE_SIZE * i;
249 sum = orig_sum 249 sum = orig_sum
250 - (mc_header->sig + mc_header->pf + mc_header->cksum) 250 - (mc_header->sig + mc_header->pf + mc_header->cksum)
251 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); 251 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
@@ -279,11 +279,9 @@ static int get_maching_microcode(void *mc, int cpu)
279 if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE) 279 if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
280 return 0; 280 return 0;
281 281
282 ext_header = (struct extended_sigtable *)(mc + 282 ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
283 get_datasize(mc_header) + MC_HEADER_SIZE);
284 ext_sigcount = ext_header->count; 283 ext_sigcount = ext_header->count;
285 ext_sig = (struct extended_signature *)((void *)ext_header 284 ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
286 + EXT_HEADER_SIZE);
287 for (i = 0; i < ext_sigcount; i++) { 285 for (i = 0; i < ext_sigcount; i++) {
288 if (microcode_update_match(cpu, mc_header, 286 if (microcode_update_match(cpu, mc_header,
289 ext_sig->sig, ext_sig->pf)) 287 ext_sig->sig, ext_sig->pf))
@@ -539,7 +537,7 @@ static int cpu_request_microcode(int cpu)
539 pr_debug("ucode data file %s load failed\n", name); 537 pr_debug("ucode data file %s load failed\n", name);
540 return error; 538 return error;
541 } 539 }
542 buf = (void *)firmware->data; 540 buf = firmware->data;
543 size = firmware->size; 541 size = firmware->size;
544 while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset)) 542 while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset))
545 > 0) { 543 > 0) {
diff --git a/arch/x86/kernel/mpparse_32.c b/arch/x86/kernel/mpparse_32.c
index 7a05a7f6099a..67009cdd5eca 100644
--- a/arch/x86/kernel/mpparse_32.c
+++ b/arch/x86/kernel/mpparse_32.c
@@ -68,7 +68,7 @@ unsigned int def_to_bigsmp = 0;
68/* Processor that is doing the boot up */ 68/* Processor that is doing the boot up */
69unsigned int boot_cpu_physical_apicid = -1U; 69unsigned int boot_cpu_physical_apicid = -1U;
70/* Internal processor count */ 70/* Internal processor count */
71unsigned int __cpuinitdata num_processors; 71unsigned int num_processors;
72 72
73/* Bitmask of physically existing CPUs */ 73/* Bitmask of physically existing CPUs */
74physid_mask_t phys_cpu_present_map; 74physid_mask_t phys_cpu_present_map;
@@ -258,7 +258,7 @@ static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
258 if (!(m->mpc_flags & MPC_APIC_USABLE)) 258 if (!(m->mpc_flags & MPC_APIC_USABLE))
259 return; 259 return;
260 260
261 printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n", 261 printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
262 m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); 262 m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
263 if (nr_ioapics >= MAX_IO_APICS) { 263 if (nr_ioapics >= MAX_IO_APICS) {
264 printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n", 264 printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
@@ -405,9 +405,9 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
405 405
406 mps_oem_check(mpc, oem, str); 406 mps_oem_check(mpc, oem, str);
407 407
408 printk("APIC at: 0x%lX\n",mpc->mpc_lapic); 408 printk("APIC at: 0x%X\n", mpc->mpc_lapic);
409 409
410 /* 410 /*
411 * Save the local APIC address (it might be non-default) -- but only 411 * Save the local APIC address (it might be non-default) -- but only
412 * if we're not using ACPI. 412 * if we're not using ACPI.
413 */ 413 */
@@ -721,7 +721,7 @@ static int __init smp_scan_config (unsigned long base, unsigned long length)
721 unsigned long *bp = phys_to_virt(base); 721 unsigned long *bp = phys_to_virt(base);
722 struct intel_mp_floating *mpf; 722 struct intel_mp_floating *mpf;
723 723
724 Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); 724 printk(KERN_INFO "Scan SMP from %p for %ld bytes.\n", bp,length);
725 if (sizeof(*mpf) != 16) 725 if (sizeof(*mpf) != 16)
726 printk("Error: MPF size\n"); 726 printk("Error: MPF size\n");
727 727
@@ -734,8 +734,8 @@ static int __init smp_scan_config (unsigned long base, unsigned long length)
734 || (mpf->mpf_specification == 4)) ) { 734 || (mpf->mpf_specification == 4)) ) {
735 735
736 smp_found_config = 1; 736 smp_found_config = 1;
737 printk(KERN_INFO "found SMP MP-table at %08lx\n", 737 printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
738 virt_to_phys(mpf)); 738 mpf, virt_to_phys(mpf));
739 reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE); 739 reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
740 if (mpf->mpf_physptr) { 740 if (mpf->mpf_physptr) {
741 /* 741 /*
@@ -918,14 +918,14 @@ void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
918 */ 918 */
919 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; 919 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
920 mp_ioapic_routing[idx].gsi_base = gsi_base; 920 mp_ioapic_routing[idx].gsi_base = gsi_base;
921 mp_ioapic_routing[idx].gsi_end = gsi_base + 921 mp_ioapic_routing[idx].gsi_end = gsi_base +
922 io_apic_get_redir_entries(idx); 922 io_apic_get_redir_entries(idx);
923 923
924 printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, " 924 printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
925 "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, 925 "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
926 mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, 926 mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
927 mp_ioapic_routing[idx].gsi_base, 927 mp_ioapic_routing[idx].gsi_base,
928 mp_ioapic_routing[idx].gsi_end); 928 mp_ioapic_routing[idx].gsi_end);
929} 929}
930 930
931void __init 931void __init
@@ -1041,15 +1041,16 @@ void __init mp_config_acpi_legacy_irqs (void)
1041} 1041}
1042 1042
1043#define MAX_GSI_NUM 4096 1043#define MAX_GSI_NUM 4096
1044#define IRQ_COMPRESSION_START 64
1044 1045
1045int mp_register_gsi(u32 gsi, int triggering, int polarity) 1046int mp_register_gsi(u32 gsi, int triggering, int polarity)
1046{ 1047{
1047 int ioapic = -1; 1048 int ioapic = -1;
1048 int ioapic_pin = 0; 1049 int ioapic_pin = 0;
1049 int idx, bit = 0; 1050 int idx, bit = 0;
1050 static int pci_irq = 16; 1051 static int pci_irq = IRQ_COMPRESSION_START;
1051 /* 1052 /*
1052 * Mapping between Global System Interrups, which 1053 * Mapping between Global System Interrupts, which
1053 * represent all possible interrupts, and IRQs 1054 * represent all possible interrupts, and IRQs
1054 * assigned to actual devices. 1055 * assigned to actual devices.
1055 */ 1056 */
@@ -1086,12 +1087,16 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
1086 if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) { 1087 if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
1087 Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", 1088 Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
1088 mp_ioapic_routing[ioapic].apic_id, ioapic_pin); 1089 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
1089 return gsi_to_irq[gsi]; 1090 return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
1090 } 1091 }
1091 1092
1092 mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit); 1093 mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
1093 1094
1094 if (triggering == ACPI_LEVEL_SENSITIVE) { 1095 /*
1096 * For GSI >= 64, use IRQ compression
1097 */
1098 if ((gsi >= IRQ_COMPRESSION_START)
1099 && (triggering == ACPI_LEVEL_SENSITIVE)) {
1095 /* 1100 /*
1096 * For PCI devices assign IRQs in order, avoiding gaps 1101 * For PCI devices assign IRQs in order, avoiding gaps
1097 * due to unused I/O APIC pins. 1102 * due to unused I/O APIC pins.
diff --git a/arch/x86/kernel/mpparse_64.c b/arch/x86/kernel/mpparse_64.c
index ef4aab123581..72ab1403fed7 100644
--- a/arch/x86/kernel/mpparse_64.c
+++ b/arch/x86/kernel/mpparse_64.c
@@ -60,14 +60,18 @@ unsigned int boot_cpu_id = -1U;
60EXPORT_SYMBOL(boot_cpu_id); 60EXPORT_SYMBOL(boot_cpu_id);
61 61
62/* Internal processor count */ 62/* Internal processor count */
63unsigned int num_processors __cpuinitdata = 0; 63unsigned int num_processors;
64 64
65unsigned disabled_cpus __cpuinitdata; 65unsigned disabled_cpus __cpuinitdata;
66 66
67/* Bitmask of physically existing CPUs */ 67/* Bitmask of physically existing CPUs */
68physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; 68physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
69 69
70u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; 70u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata
71 = { [0 ... NR_CPUS-1] = BAD_APICID };
72void *x86_bios_cpu_apicid_early_ptr;
73DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
74EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
71 75
72 76
73/* 77/*
@@ -118,24 +122,22 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
118 physid_set(m->mpc_apicid, phys_cpu_present_map); 122 physid_set(m->mpc_apicid, phys_cpu_present_map);
119 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { 123 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
120 /* 124 /*
121 * bios_cpu_apicid is required to have processors listed 125 * x86_bios_cpu_apicid is required to have processors listed
122 * in same order as logical cpu numbers. Hence the first 126 * in same order as logical cpu numbers. Hence the first
123 * entry is BSP, and so on. 127 * entry is BSP, and so on.
124 */ 128 */
125 cpu = 0; 129 cpu = 0;
126 } 130 }
127 bios_cpu_apicid[cpu] = m->mpc_apicid; 131 /* are we being called early in kernel startup? */
128 /* 132 if (x86_cpu_to_apicid_early_ptr) {
129 * We get called early in the the start_kernel initialization 133 u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr;
130 * process when the per_cpu data area is not yet setup, so we 134 u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
131 * use a static array that is removed after the per_cpu data 135
132 * area is created. 136 cpu_to_apicid[cpu] = m->mpc_apicid;
133 */ 137 bios_cpu_apicid[cpu] = m->mpc_apicid;
134 if (x86_cpu_to_apicid_ptr) {
135 u8 *x86_cpu_to_apicid = (u8 *)x86_cpu_to_apicid_ptr;
136 x86_cpu_to_apicid[cpu] = m->mpc_apicid;
137 } else { 138 } else {
138 per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid; 139 per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid;
140 per_cpu(x86_bios_cpu_apicid, cpu) = m->mpc_apicid;
139 } 141 }
140 142
141 cpu_set(cpu, cpu_possible_map); 143 cpu_set(cpu, cpu_possible_map);
diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c
index 4f4bfd3a88b6..edd413650b3b 100644
--- a/arch/x86/kernel/nmi_32.c
+++ b/arch/x86/kernel/nmi_32.c
@@ -51,13 +51,13 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
51 51
52static int endflag __initdata = 0; 52static int endflag __initdata = 0;
53 53
54#ifdef CONFIG_SMP
54/* The performance counters used by NMI_LOCAL_APIC don't trigger when 55/* The performance counters used by NMI_LOCAL_APIC don't trigger when
55 * the CPU is idle. To make sure the NMI watchdog really ticks on all 56 * the CPU is idle. To make sure the NMI watchdog really ticks on all
56 * CPUs during the test make them busy. 57 * CPUs during the test make them busy.
57 */ 58 */
58static __init void nmi_cpu_busy(void *data) 59static __init void nmi_cpu_busy(void *data)
59{ 60{
60#ifdef CONFIG_SMP
61 local_irq_enable_in_hardirq(); 61 local_irq_enable_in_hardirq();
62 /* Intentionally don't use cpu_relax here. This is 62 /* Intentionally don't use cpu_relax here. This is
63 to make sure that the performance counter really ticks, 63 to make sure that the performance counter really ticks,
@@ -67,8 +67,8 @@ static __init void nmi_cpu_busy(void *data)
67 care if they get somewhat less cycles. */ 67 care if they get somewhat less cycles. */
68 while (endflag == 0) 68 while (endflag == 0)
69 mb(); 69 mb();
70#endif
71} 70}
71#endif
72 72
73static int __init check_nmi_watchdog(void) 73static int __init check_nmi_watchdog(void)
74{ 74{
@@ -87,11 +87,13 @@ static int __init check_nmi_watchdog(void)
87 87
88 printk(KERN_INFO "Testing NMI watchdog ... "); 88 printk(KERN_INFO "Testing NMI watchdog ... ");
89 89
90#ifdef CONFIG_SMP
90 if (nmi_watchdog == NMI_LOCAL_APIC) 91 if (nmi_watchdog == NMI_LOCAL_APIC)
91 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); 92 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
93#endif
92 94
93 for_each_possible_cpu(cpu) 95 for_each_possible_cpu(cpu)
94 prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count; 96 prev_nmi_count[cpu] = nmi_count(cpu);
95 local_irq_enable(); 97 local_irq_enable();
96 mdelay((20*1000)/nmi_hz); // wait 20 ticks 98 mdelay((20*1000)/nmi_hz); // wait 20 ticks
97 99
@@ -237,10 +239,10 @@ void acpi_nmi_disable(void)
237 on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); 239 on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
238} 240}
239 241
240void setup_apic_nmi_watchdog (void *unused) 242void setup_apic_nmi_watchdog(void *unused)
241{ 243{
242 if (__get_cpu_var(wd_enabled)) 244 if (__get_cpu_var(wd_enabled))
243 return; 245 return;
244 246
245 /* cheap hack to support suspend/resume */ 247 /* cheap hack to support suspend/resume */
246 /* if cpu0 is not active neither should the other cpus */ 248 /* if cpu0 is not active neither should the other cpus */
@@ -329,7 +331,7 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
329 unsigned int sum; 331 unsigned int sum;
330 int touched = 0; 332 int touched = 0;
331 int cpu = smp_processor_id(); 333 int cpu = smp_processor_id();
332 int rc=0; 334 int rc = 0;
333 335
334 /* check for other users first */ 336 /* check for other users first */
335 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) 337 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c
index c3d1476b6a11..fb99484d21cf 100644
--- a/arch/x86/kernel/nmi_64.c
+++ b/arch/x86/kernel/nmi_64.c
@@ -39,7 +39,7 @@ static cpumask_t backtrace_mask = CPU_MASK_NONE;
39 * 0: the lapic NMI watchdog is disabled, but can be enabled 39 * 0: the lapic NMI watchdog is disabled, but can be enabled
40 */ 40 */
41atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ 41atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
42int panic_on_timeout; 42static int panic_on_timeout;
43 43
44unsigned int nmi_watchdog = NMI_DEFAULT; 44unsigned int nmi_watchdog = NMI_DEFAULT;
45static unsigned int nmi_hz = HZ; 45static unsigned int nmi_hz = HZ;
@@ -78,22 +78,22 @@ static __init void nmi_cpu_busy(void *data)
78} 78}
79#endif 79#endif
80 80
81int __init check_nmi_watchdog (void) 81int __init check_nmi_watchdog(void)
82{ 82{
83 int *counts; 83 int *prev_nmi_count;
84 int cpu; 84 int cpu;
85 85
86 if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED)) 86 if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED))
87 return 0; 87 return 0;
88 88
89 if (!atomic_read(&nmi_active)) 89 if (!atomic_read(&nmi_active))
90 return 0; 90 return 0;
91 91
92 counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); 92 prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
93 if (!counts) 93 if (!prev_nmi_count)
94 return -1; 94 return -1;
95 95
96 printk(KERN_INFO "testing NMI watchdog ... "); 96 printk(KERN_INFO "Testing NMI watchdog ... ");
97 97
98#ifdef CONFIG_SMP 98#ifdef CONFIG_SMP
99 if (nmi_watchdog == NMI_LOCAL_APIC) 99 if (nmi_watchdog == NMI_LOCAL_APIC)
@@ -101,30 +101,29 @@ int __init check_nmi_watchdog (void)
101#endif 101#endif
102 102
103 for (cpu = 0; cpu < NR_CPUS; cpu++) 103 for (cpu = 0; cpu < NR_CPUS; cpu++)
104 counts[cpu] = cpu_pda(cpu)->__nmi_count; 104 prev_nmi_count[cpu] = cpu_pda(cpu)->__nmi_count;
105 local_irq_enable(); 105 local_irq_enable();
106 mdelay((20*1000)/nmi_hz); // wait 20 ticks 106 mdelay((20*1000)/nmi_hz); // wait 20 ticks
107 107
108 for_each_online_cpu(cpu) { 108 for_each_online_cpu(cpu) {
109 if (!per_cpu(wd_enabled, cpu)) 109 if (!per_cpu(wd_enabled, cpu))
110 continue; 110 continue;
111 if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) { 111 if (cpu_pda(cpu)->__nmi_count - prev_nmi_count[cpu] <= 5) {
112 printk(KERN_WARNING "WARNING: CPU#%d: NMI " 112 printk(KERN_WARNING "WARNING: CPU#%d: NMI "
113 "appears to be stuck (%d->%d)!\n", 113 "appears to be stuck (%d->%d)!\n",
114 cpu, 114 cpu,
115 counts[cpu], 115 prev_nmi_count[cpu],
116 cpu_pda(cpu)->__nmi_count); 116 cpu_pda(cpu)->__nmi_count);
117 per_cpu(wd_enabled, cpu) = 0; 117 per_cpu(wd_enabled, cpu) = 0;
118 atomic_dec(&nmi_active); 118 atomic_dec(&nmi_active);
119 } 119 }
120 } 120 }
121 endflag = 1;
121 if (!atomic_read(&nmi_active)) { 122 if (!atomic_read(&nmi_active)) {
122 kfree(counts); 123 kfree(prev_nmi_count);
123 atomic_set(&nmi_active, -1); 124 atomic_set(&nmi_active, -1);
124 endflag = 1;
125 return -1; 125 return -1;
126 } 126 }
127 endflag = 1;
128 printk("OK.\n"); 127 printk("OK.\n");
129 128
130 /* now that we know it works we can reduce NMI frequency to 129 /* now that we know it works we can reduce NMI frequency to
@@ -132,11 +131,11 @@ int __init check_nmi_watchdog (void)
132 if (nmi_watchdog == NMI_LOCAL_APIC) 131 if (nmi_watchdog == NMI_LOCAL_APIC)
133 nmi_hz = lapic_adjust_nmi_hz(1); 132 nmi_hz = lapic_adjust_nmi_hz(1);
134 133
135 kfree(counts); 134 kfree(prev_nmi_count);
136 return 0; 135 return 0;
137} 136}
138 137
139int __init setup_nmi_watchdog(char *str) 138static int __init setup_nmi_watchdog(char *str)
140{ 139{
141 int nmi; 140 int nmi;
142 141
@@ -159,34 +158,6 @@ int __init setup_nmi_watchdog(char *str)
159 158
160__setup("nmi_watchdog=", setup_nmi_watchdog); 159__setup("nmi_watchdog=", setup_nmi_watchdog);
161 160
162
163static void __acpi_nmi_disable(void *__unused)
164{
165 apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
166}
167
168/*
169 * Disable timer based NMIs on all CPUs:
170 */
171void acpi_nmi_disable(void)
172{
173 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
174 on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
175}
176
177static void __acpi_nmi_enable(void *__unused)
178{
179 apic_write(APIC_LVT0, APIC_DM_NMI);
180}
181
182/*
183 * Enable timer based NMIs on all CPUs:
184 */
185void acpi_nmi_enable(void)
186{
187 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
188 on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
189}
190#ifdef CONFIG_PM 161#ifdef CONFIG_PM
191 162
192static int nmi_pm_active; /* nmi_active before suspend */ 163static int nmi_pm_active; /* nmi_active before suspend */
@@ -217,7 +188,7 @@ static struct sysdev_class nmi_sysclass = {
217}; 188};
218 189
219static struct sys_device device_lapic_nmi = { 190static struct sys_device device_lapic_nmi = {
220 .id = 0, 191 .id = 0,
221 .cls = &nmi_sysclass, 192 .cls = &nmi_sysclass,
222}; 193};
223 194
@@ -231,7 +202,7 @@ static int __init init_lapic_nmi_sysfs(void)
231 if (nmi_watchdog != NMI_LOCAL_APIC) 202 if (nmi_watchdog != NMI_LOCAL_APIC)
232 return 0; 203 return 0;
233 204
234 if ( atomic_read(&nmi_active) < 0 ) 205 if (atomic_read(&nmi_active) < 0)
235 return 0; 206 return 0;
236 207
237 error = sysdev_class_register(&nmi_sysclass); 208 error = sysdev_class_register(&nmi_sysclass);
@@ -244,9 +215,37 @@ late_initcall(init_lapic_nmi_sysfs);
244 215
245#endif /* CONFIG_PM */ 216#endif /* CONFIG_PM */
246 217
218static void __acpi_nmi_enable(void *__unused)
219{
220 apic_write(APIC_LVT0, APIC_DM_NMI);
221}
222
223/*
224 * Enable timer based NMIs on all CPUs:
225 */
226void acpi_nmi_enable(void)
227{
228 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
229 on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
230}
231
232static void __acpi_nmi_disable(void *__unused)
233{
234 apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
235}
236
237/*
238 * Disable timer based NMIs on all CPUs:
239 */
240void acpi_nmi_disable(void)
241{
242 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
243 on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
244}
245
247void setup_apic_nmi_watchdog(void *unused) 246void setup_apic_nmi_watchdog(void *unused)
248{ 247{
249 if (__get_cpu_var(wd_enabled) == 1) 248 if (__get_cpu_var(wd_enabled))
250 return; 249 return;
251 250
252 /* cheap hack to support suspend/resume */ 251 /* cheap hack to support suspend/resume */
@@ -311,8 +310,9 @@ void touch_nmi_watchdog(void)
311 } 310 }
312 } 311 }
313 312
314 touch_softlockup_watchdog(); 313 touch_softlockup_watchdog();
315} 314}
315EXPORT_SYMBOL(touch_nmi_watchdog);
316 316
317int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) 317int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
318{ 318{
@@ -479,4 +479,3 @@ void __trigger_all_cpu_backtrace(void)
479 479
480EXPORT_SYMBOL(nmi_active); 480EXPORT_SYMBOL(nmi_active);
481EXPORT_SYMBOL(nmi_watchdog); 481EXPORT_SYMBOL(nmi_watchdog);
482EXPORT_SYMBOL(touch_nmi_watchdog);
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index 9000d82c6dc0..e65281b1634b 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -82,7 +82,7 @@ static int __init numaq_tsc_disable(void)
82{ 82{
83 if (num_online_nodes() > 1) { 83 if (num_online_nodes() > 1) {
84 printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); 84 printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
85 tsc_disable = 1; 85 setup_clear_cpu_cap(X86_FEATURE_TSC);
86 } 86 }
87 return 0; 87 return 0;
88} 88}
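The switch from a tsc_disable flag to setup_clear_cpu_cap() works by knocking the TSC bit out of the boot CPU's capability bitmap, so every later cpu_has() style test sees the feature as absent. A minimal user-space sketch of that bookkeeping; the bit number, array size and helper names are invented for the illustration, not the kernel API:

#include <stdio.h>

#define NCAPINTS        8
#define FEATURE_TSC     4               /* made-up bit number for the sketch */

static unsigned cleared_caps[NCAPINTS];              /* remembered for APs   */
static unsigned cpu_caps[NCAPINTS] = { 0xffffffff }; /* boot CPU capabilities */

static void setup_clear_cap(int bit)
{
        cleared_caps[bit / 32] |= 1u << (bit % 32);
        cpu_caps[bit / 32] &= ~(1u << (bit % 32));
}

static int cpu_has(int bit)
{
        return (cpu_caps[bit / 32] >> (bit % 32)) & 1;
}

int main(void)
{
        printf("TSC before: %d\n", cpu_has(FEATURE_TSC));
        setup_clear_cap(FEATURE_TSC);
        printf("TSC after:  %d\n", cpu_has(FEATURE_TSC));
        return 0;
}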
diff --git a/arch/x86/kernel/paravirt_32.c b/arch/x86/kernel/paravirt.c
index f5000799f8ef..075962cc75ab 100644
--- a/arch/x86/kernel/paravirt_32.c
+++ b/arch/x86/kernel/paravirt.c
@@ -14,7 +14,10 @@
14 You should have received a copy of the GNU General Public License 14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software 15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17
18 2007 - x86_64 support added by Glauber de Oliveira Costa, Red Hat Inc
17*/ 19*/
20
18#include <linux/errno.h> 21#include <linux/errno.h>
19#include <linux/module.h> 22#include <linux/module.h>
20#include <linux/efi.h> 23#include <linux/efi.h>
@@ -55,59 +58,9 @@ char *memory_setup(void)
55 extern const char start_##ops##_##name[], end_##ops##_##name[]; \ 58 extern const char start_##ops##_##name[], end_##ops##_##name[]; \
56 asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") 59 asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
57 60
58DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
59DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
60DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
61DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
62DEF_NATIVE(pv_cpu_ops, iret, "iret");
63DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit");
64DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
65DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
66DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
67DEF_NATIVE(pv_cpu_ops, clts, "clts");
68DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
69
70/* Undefined instruction for dealing with missing ops pointers. */ 61/* Undefined instruction for dealing with missing ops pointers. */
71static const unsigned char ud2a[] = { 0x0f, 0x0b }; 62static const unsigned char ud2a[] = { 0x0f, 0x0b };
72 63
73static unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
74 unsigned long addr, unsigned len)
75{
76 const unsigned char *start, *end;
77 unsigned ret;
78
79 switch(type) {
80#define SITE(ops, x) \
81 case PARAVIRT_PATCH(ops.x): \
82 start = start_##ops##_##x; \
83 end = end_##ops##_##x; \
84 goto patch_site
85
86 SITE(pv_irq_ops, irq_disable);
87 SITE(pv_irq_ops, irq_enable);
88 SITE(pv_irq_ops, restore_fl);
89 SITE(pv_irq_ops, save_fl);
90 SITE(pv_cpu_ops, iret);
91 SITE(pv_cpu_ops, irq_enable_sysexit);
92 SITE(pv_mmu_ops, read_cr2);
93 SITE(pv_mmu_ops, read_cr3);
94 SITE(pv_mmu_ops, write_cr3);
95 SITE(pv_cpu_ops, clts);
96 SITE(pv_cpu_ops, read_tsc);
97#undef SITE
98
99 patch_site:
100 ret = paravirt_patch_insns(ibuf, len, start, end);
101 break;
102
103 default:
104 ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
105 break;
106 }
107
108 return ret;
109}
110
111unsigned paravirt_patch_nop(void) 64unsigned paravirt_patch_nop(void)
112{ 65{
113 return 0; 66 return 0;
@@ -186,7 +139,7 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
186 /* If the operation is a nop, then nop the callsite */ 139 /* If the operation is a nop, then nop the callsite */
187 ret = paravirt_patch_nop(); 140 ret = paravirt_patch_nop();
188 else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || 141 else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
189 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit)) 142 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret))
190 /* If operation requires a jmp, then jmp */ 143 /* If operation requires a jmp, then jmp */
191 ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len); 144 ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len);
192 else 145 else
@@ -237,7 +190,7 @@ static void native_flush_tlb_single(unsigned long addr)
237 190
238/* These are in entry.S */ 191/* These are in entry.S */
239extern void native_iret(void); 192extern void native_iret(void);
240extern void native_irq_enable_sysexit(void); 193extern void native_irq_enable_syscall_ret(void);
241 194
242static int __init print_banner(void) 195static int __init print_banner(void)
243{ 196{
@@ -285,18 +238,18 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA
285 238
286static inline void enter_lazy(enum paravirt_lazy_mode mode) 239static inline void enter_lazy(enum paravirt_lazy_mode mode)
287{ 240{
288 BUG_ON(x86_read_percpu(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); 241 BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
289 BUG_ON(preemptible()); 242 BUG_ON(preemptible());
290 243
291 x86_write_percpu(paravirt_lazy_mode, mode); 244 __get_cpu_var(paravirt_lazy_mode) = mode;
292} 245}
293 246
294void paravirt_leave_lazy(enum paravirt_lazy_mode mode) 247void paravirt_leave_lazy(enum paravirt_lazy_mode mode)
295{ 248{
296 BUG_ON(x86_read_percpu(paravirt_lazy_mode) != mode); 249 BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode);
297 BUG_ON(preemptible()); 250 BUG_ON(preemptible());
298 251
299 x86_write_percpu(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); 252 __get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE;
300} 253}
301 254
302void paravirt_enter_lazy_mmu(void) 255void paravirt_enter_lazy_mmu(void)
@@ -321,7 +274,7 @@ void paravirt_leave_lazy_cpu(void)
321 274
322enum paravirt_lazy_mode paravirt_get_lazy_mode(void) 275enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
323{ 276{
324 return x86_read_percpu(paravirt_lazy_mode); 277 return __get_cpu_var(paravirt_lazy_mode);
325} 278}
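The x86_read_percpu()/x86_write_percpu() accessors are 32-bit only, so the unified paravirt.c falls back to __get_cpu_var() for the lazy-mode state. The enter/leave pairing that the BUG_ON() checks enforce can be sketched in user space with a thread-local variable and assert(); names here are illustrative only:

#include <assert.h>
#include <stdio.h>

enum lazy_mode { LAZY_NONE, LAZY_MMU, LAZY_CPU };

/* __thread stands in for the per-CPU variable in this sketch */
static __thread enum lazy_mode cur_mode = LAZY_NONE;

static void enter_lazy(enum lazy_mode mode)
{
        assert(cur_mode == LAZY_NONE);   /* mirrors the BUG_ON() in enter_lazy() */
        cur_mode = mode;
}

static void leave_lazy(enum lazy_mode mode)
{
        assert(cur_mode == mode);        /* mirrors the BUG_ON() in paravirt_leave_lazy() */
        cur_mode = LAZY_NONE;
}

int main(void)
{
        enter_lazy(LAZY_MMU);
        printf("lazy mode %d\n", cur_mode);
        leave_lazy(LAZY_MMU);
        printf("lazy mode %d\n", cur_mode);
        return 0;
}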
326 279
327struct pv_info pv_info = { 280struct pv_info pv_info = {
@@ -366,11 +319,16 @@ struct pv_cpu_ops pv_cpu_ops = {
366 .read_cr4 = native_read_cr4, 319 .read_cr4 = native_read_cr4,
367 .read_cr4_safe = native_read_cr4_safe, 320 .read_cr4_safe = native_read_cr4_safe,
368 .write_cr4 = native_write_cr4, 321 .write_cr4 = native_write_cr4,
322#ifdef CONFIG_X86_64
323 .read_cr8 = native_read_cr8,
324 .write_cr8 = native_write_cr8,
325#endif
369 .wbinvd = native_wbinvd, 326 .wbinvd = native_wbinvd,
370 .read_msr = native_read_msr_safe, 327 .read_msr = native_read_msr_safe,
371 .write_msr = native_write_msr_safe, 328 .write_msr = native_write_msr_safe,
372 .read_tsc = native_read_tsc, 329 .read_tsc = native_read_tsc,
373 .read_pmc = native_read_pmc, 330 .read_pmc = native_read_pmc,
331 .read_tscp = native_read_tscp,
374 .load_tr_desc = native_load_tr_desc, 332 .load_tr_desc = native_load_tr_desc,
375 .set_ldt = native_set_ldt, 333 .set_ldt = native_set_ldt,
376 .load_gdt = native_load_gdt, 334 .load_gdt = native_load_gdt,
@@ -379,13 +337,14 @@ struct pv_cpu_ops pv_cpu_ops = {
379 .store_idt = native_store_idt, 337 .store_idt = native_store_idt,
380 .store_tr = native_store_tr, 338 .store_tr = native_store_tr,
381 .load_tls = native_load_tls, 339 .load_tls = native_load_tls,
382 .write_ldt_entry = write_dt_entry, 340 .write_ldt_entry = native_write_ldt_entry,
383 .write_gdt_entry = write_dt_entry, 341 .write_gdt_entry = native_write_gdt_entry,
384 .write_idt_entry = write_dt_entry, 342 .write_idt_entry = native_write_idt_entry,
385 .load_esp0 = native_load_esp0, 343 .load_sp0 = native_load_sp0,
386 344
387 .irq_enable_sysexit = native_irq_enable_sysexit, 345 .irq_enable_syscall_ret = native_irq_enable_syscall_ret,
388 .iret = native_iret, 346 .iret = native_iret,
347 .swapgs = native_swapgs,
389 348
390 .set_iopl_mask = native_set_iopl_mask, 349 .set_iopl_mask = native_set_iopl_mask,
391 .io_delay = native_io_delay, 350 .io_delay = native_io_delay,
@@ -408,8 +367,10 @@ struct pv_apic_ops pv_apic_ops = {
408}; 367};
409 368
410struct pv_mmu_ops pv_mmu_ops = { 369struct pv_mmu_ops pv_mmu_ops = {
370#ifndef CONFIG_X86_64
411 .pagetable_setup_start = native_pagetable_setup_start, 371 .pagetable_setup_start = native_pagetable_setup_start,
412 .pagetable_setup_done = native_pagetable_setup_done, 372 .pagetable_setup_done = native_pagetable_setup_done,
373#endif
413 374
414 .read_cr2 = native_read_cr2, 375 .read_cr2 = native_read_cr2,
415 .write_cr2 = native_write_cr2, 376 .write_cr2 = native_write_cr2,
@@ -437,16 +398,23 @@ struct pv_mmu_ops pv_mmu_ops = {
437 .kmap_atomic_pte = kmap_atomic, 398 .kmap_atomic_pte = kmap_atomic,
438#endif 399#endif
439 400
401#if PAGETABLE_LEVELS >= 3
440#ifdef CONFIG_X86_PAE 402#ifdef CONFIG_X86_PAE
441 .set_pte_atomic = native_set_pte_atomic, 403 .set_pte_atomic = native_set_pte_atomic,
442 .set_pte_present = native_set_pte_present, 404 .set_pte_present = native_set_pte_present,
443 .set_pud = native_set_pud,
444 .pte_clear = native_pte_clear, 405 .pte_clear = native_pte_clear,
445 .pmd_clear = native_pmd_clear, 406 .pmd_clear = native_pmd_clear,
446 407#endif
408 .set_pud = native_set_pud,
447 .pmd_val = native_pmd_val, 409 .pmd_val = native_pmd_val,
448 .make_pmd = native_make_pmd, 410 .make_pmd = native_make_pmd,
411
412#if PAGETABLE_LEVELS == 4
413 .pud_val = native_pud_val,
414 .make_pud = native_make_pud,
415 .set_pgd = native_set_pgd,
449#endif 416#endif
417#endif /* PAGETABLE_LEVELS >= 3 */
450 418
451 .pte_val = native_pte_val, 419 .pte_val = native_pte_val,
452 .pgd_val = native_pgd_val, 420 .pgd_val = native_pgd_val,
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
new file mode 100644
index 000000000000..82fc5fcab4f4
--- /dev/null
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -0,0 +1,49 @@
1#include <asm/paravirt.h>
2
3DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
4DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
5DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
6DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
7DEF_NATIVE(pv_cpu_ops, iret, "iret");
8DEF_NATIVE(pv_cpu_ops, irq_enable_syscall_ret, "sti; sysexit");
9DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
10DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
11DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
12DEF_NATIVE(pv_cpu_ops, clts, "clts");
13DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
14
15unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
16 unsigned long addr, unsigned len)
17{
18 const unsigned char *start, *end;
19 unsigned ret;
20
21#define PATCH_SITE(ops, x) \
22 case PARAVIRT_PATCH(ops.x): \
23 start = start_##ops##_##x; \
24 end = end_##ops##_##x; \
25 goto patch_site
26 switch(type) {
27 PATCH_SITE(pv_irq_ops, irq_disable);
28 PATCH_SITE(pv_irq_ops, irq_enable);
29 PATCH_SITE(pv_irq_ops, restore_fl);
30 PATCH_SITE(pv_irq_ops, save_fl);
31 PATCH_SITE(pv_cpu_ops, iret);
32 PATCH_SITE(pv_cpu_ops, irq_enable_syscall_ret);
33 PATCH_SITE(pv_mmu_ops, read_cr2);
34 PATCH_SITE(pv_mmu_ops, read_cr3);
35 PATCH_SITE(pv_mmu_ops, write_cr3);
36 PATCH_SITE(pv_cpu_ops, clts);
37 PATCH_SITE(pv_cpu_ops, read_tsc);
38
39 patch_site:
40 ret = paravirt_patch_insns(ibuf, len, start, end);
41 break;
42
43 default:
44 ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
45 break;
46 }
47#undef PATCH_SITE
48 return ret;
49}
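DEF_NATIVE() brackets a snippet of native code between start_*/end_* symbols so native_patch() can later copy exactly those bytes over an indirect paravirt call site. A rough user-space approximation of the bracketing trick, assembling the snippets into .rodata just to measure them; this compiles with GCC on x86 but is only a sketch, not the kernel macro:

#include <stdio.h>

#define DEF_NATIVE(name, code)                                  \
        extern const char start_##name[], end_##name[];         \
        asm(".pushsection .rodata\n"                            \
            "start_" #name ": " code "\n"                       \
            "end_" #name ":\n"                                  \
            ".popsection")

DEF_NATIVE(cli_snippet, "cli");
DEF_NATIVE(sti_sysexit, "sti; sysexit");

int main(void)
{
        /* the symbol difference is the length available for inline patching */
        printf("cli is %zu byte(s), sti;sysexit is %zu bytes\n",
               (size_t)(end_cli_snippet - start_cli_snippet),
               (size_t)(end_sti_sysexit - start_sti_sysexit));
        return 0;
}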
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
new file mode 100644
index 000000000000..7d904e138d7e
--- /dev/null
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -0,0 +1,57 @@
1#include <asm/paravirt.h>
2#include <asm/asm-offsets.h>
3#include <linux/stringify.h>
4
5DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
6DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
7DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq");
8DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
9DEF_NATIVE(pv_cpu_ops, iret, "iretq");
10DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
11DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
12DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
13DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
14DEF_NATIVE(pv_cpu_ops, clts, "clts");
15DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
16
 17/* these three instructions give us more control over how we return from a syscall */
18DEF_NATIVE(pv_cpu_ops, irq_enable_syscall_ret, "movq %gs:" __stringify(pda_oldrsp) ", %rsp; swapgs; sysretq;");
19DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
20
21unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
22 unsigned long addr, unsigned len)
23{
24 const unsigned char *start, *end;
25 unsigned ret;
26
27#define PATCH_SITE(ops, x) \
28 case PARAVIRT_PATCH(ops.x): \
29 start = start_##ops##_##x; \
30 end = end_##ops##_##x; \
31 goto patch_site
32 switch(type) {
33 PATCH_SITE(pv_irq_ops, restore_fl);
34 PATCH_SITE(pv_irq_ops, save_fl);
35 PATCH_SITE(pv_irq_ops, irq_enable);
36 PATCH_SITE(pv_irq_ops, irq_disable);
37 PATCH_SITE(pv_cpu_ops, iret);
38 PATCH_SITE(pv_cpu_ops, irq_enable_syscall_ret);
39 PATCH_SITE(pv_cpu_ops, swapgs);
40 PATCH_SITE(pv_mmu_ops, read_cr2);
41 PATCH_SITE(pv_mmu_ops, read_cr3);
42 PATCH_SITE(pv_mmu_ops, write_cr3);
43 PATCH_SITE(pv_cpu_ops, clts);
44 PATCH_SITE(pv_mmu_ops, flush_tlb_single);
45 PATCH_SITE(pv_cpu_ops, wbinvd);
46
47 patch_site:
48 ret = paravirt_patch_insns(ibuf, len, start, end);
49 break;
50
51 default:
52 ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
53 break;
54 }
55#undef PATCH_SITE
56 return ret;
57}
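Both new files use the same PATCH_SITE dispatch: every case loads the snippet bounds and jumps to one shared patch_site label, and anything not listed falls through to paravirt_patch_default(). A toy, self-contained version of that control flow; the snippet table, lengths and function names are invented for the sketch:

#include <stdio.h>
#include <string.h>

/* stand-ins for the start_*/end_* symbol pairs */
static const char *snippets[] = {
        [0] = "\xfa",                   /* cli */
        [1] = "\xfb",                   /* sti */
};
static const unsigned snippet_len[] = { 1, 1 };

/* copy-if-it-fits helper, same shape as paravirt_patch_insns() */
static unsigned patch_insns(void *ibuf, unsigned len,
                            const char *start, unsigned insn_len)
{
        if (insn_len > len)
                return len;             /* too big: caller keeps the call */
        memcpy(ibuf, start, insn_len);
        return insn_len;
}

static unsigned native_patch_demo(unsigned type, void *ibuf, unsigned len)
{
        const char *start = NULL;
        unsigned insn_len = 0, ret;

#define PATCH_SITE(idx)                         \
        case idx:                               \
                start = snippets[idx];          \
                insn_len = snippet_len[idx];    \
                goto patch_site

        switch (type) {
        PATCH_SITE(0);
        PATCH_SITE(1);

        patch_site:
                ret = patch_insns(ibuf, len, start, insn_len);
                break;
        default:
                ret = 0;                /* unknown op: leave the call site alone */
                break;
        }
#undef PATCH_SITE
        return ret;
}

int main(void)
{
        unsigned char buf[8] = { 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90 };
        unsigned n = native_patch_demo(0, buf, sizeof(buf));

        printf("patched %u byte(s), first byte %#x\n", n, buf[0]);
        return 0;
}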
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 6bf1f716909d..21f34db2c03c 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -30,7 +30,6 @@
30#include <linux/spinlock.h> 30#include <linux/spinlock.h>
31#include <linux/string.h> 31#include <linux/string.h>
32#include <linux/dma-mapping.h> 32#include <linux/dma-mapping.h>
33#include <linux/init.h>
34#include <linux/bitops.h> 33#include <linux/bitops.h>
35#include <linux/pci_ids.h> 34#include <linux/pci_ids.h>
36#include <linux/pci.h> 35#include <linux/pci.h>
@@ -183,7 +182,7 @@ static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
183 182
184/* enable this to stress test the chip's TCE cache */ 183/* enable this to stress test the chip's TCE cache */
185#ifdef CONFIG_IOMMU_DEBUG 184#ifdef CONFIG_IOMMU_DEBUG
186int debugging __read_mostly = 1; 185static int debugging = 1;
187 186
188static inline unsigned long verify_bit_range(unsigned long* bitmap, 187static inline unsigned long verify_bit_range(unsigned long* bitmap,
189 int expected, unsigned long start, unsigned long end) 188 int expected, unsigned long start, unsigned long end)
@@ -202,7 +201,7 @@ static inline unsigned long verify_bit_range(unsigned long* bitmap,
202 return ~0UL; 201 return ~0UL;
203} 202}
204#else /* debugging is disabled */ 203#else /* debugging is disabled */
205int debugging __read_mostly = 0; 204static int debugging;
206 205
207static inline unsigned long verify_bit_range(unsigned long* bitmap, 206static inline unsigned long verify_bit_range(unsigned long* bitmap,
208 int expected, unsigned long start, unsigned long end) 207 int expected, unsigned long start, unsigned long end)
diff --git a/arch/x86/kernel/pci-dma_64.c b/arch/x86/kernel/pci-dma_64.c
index 5552d23d23c2..a82473d192a3 100644
--- a/arch/x86/kernel/pci-dma_64.c
+++ b/arch/x86/kernel/pci-dma_64.c
@@ -13,7 +13,6 @@
13#include <asm/calgary.h> 13#include <asm/calgary.h>
14 14
15int iommu_merge __read_mostly = 0; 15int iommu_merge __read_mostly = 0;
16EXPORT_SYMBOL(iommu_merge);
17 16
18dma_addr_t bad_dma_address __read_mostly; 17dma_addr_t bad_dma_address __read_mostly;
19EXPORT_SYMBOL(bad_dma_address); 18EXPORT_SYMBOL(bad_dma_address);
@@ -230,7 +229,7 @@ EXPORT_SYMBOL(dma_set_mask);
230 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter 229 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
231 * documentation. 230 * documentation.
232 */ 231 */
233__init int iommu_setup(char *p) 232static __init int iommu_setup(char *p)
234{ 233{
235 iommu_merge = 1; 234 iommu_merge = 1;
236 235
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 06bcba536045..4d5cc7181982 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -1,12 +1,12 @@
1/* 1/*
2 * Dynamic DMA mapping support for AMD Hammer. 2 * Dynamic DMA mapping support for AMD Hammer.
3 * 3 *
4 * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI. 4 * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI.
5 * This allows to use PCI devices that only support 32bit addresses on systems 5 * This allows to use PCI devices that only support 32bit addresses on systems
6 * with more than 4GB. 6 * with more than 4GB.
7 * 7 *
8 * See Documentation/DMA-mapping.txt for the interface specification. 8 * See Documentation/DMA-mapping.txt for the interface specification.
9 * 9 *
10 * Copyright 2002 Andi Kleen, SuSE Labs. 10 * Copyright 2002 Andi Kleen, SuSE Labs.
11 * Subject to the GNU General Public License v2 only. 11 * Subject to the GNU General Public License v2 only.
12 */ 12 */
@@ -37,23 +37,26 @@
37#include <asm/k8.h> 37#include <asm/k8.h>
38 38
39static unsigned long iommu_bus_base; /* GART remapping area (physical) */ 39static unsigned long iommu_bus_base; /* GART remapping area (physical) */
40static unsigned long iommu_size; /* size of remapping area bytes */ 40static unsigned long iommu_size; /* size of remapping area bytes */
41static unsigned long iommu_pages; /* .. and in pages */ 41static unsigned long iommu_pages; /* .. and in pages */
42 42
43static u32 *iommu_gatt_base; /* Remapping table */ 43static u32 *iommu_gatt_base; /* Remapping table */
44 44
45/* If this is disabled the IOMMU will use an optimized flushing strategy 45/*
46 of only flushing when an mapping is reused. With it true the GART is flushed 46 * If this is disabled the IOMMU will use an optimized flushing strategy
47 for every mapping. Problem is that doing the lazy flush seems to trigger 47 * of only flushing when an mapping is reused. With it true the GART is
48 bugs with some popular PCI cards, in particular 3ware (but has been also 48 * flushed for every mapping. Problem is that doing the lazy flush seems
49 also seen with Qlogic at least). */ 49 * to trigger bugs with some popular PCI cards, in particular 3ware (but
 50 * has also been seen with Qlogic at least).
51 */
50int iommu_fullflush = 1; 52int iommu_fullflush = 1;
51 53
52/* Allocation bitmap for the remapping area */ 54/* Allocation bitmap for the remapping area: */
53static DEFINE_SPINLOCK(iommu_bitmap_lock); 55static DEFINE_SPINLOCK(iommu_bitmap_lock);
54static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */ 56/* Guarded by iommu_bitmap_lock: */
57static unsigned long *iommu_gart_bitmap;
55 58
56static u32 gart_unmapped_entry; 59static u32 gart_unmapped_entry;
57 60
58#define GPTE_VALID 1 61#define GPTE_VALID 1
59#define GPTE_COHERENT 2 62#define GPTE_COHERENT 2
@@ -61,10 +64,10 @@ static u32 gart_unmapped_entry;
61 (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) 64 (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
62#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) 65#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28))
63 66
64#define to_pages(addr,size) \ 67#define to_pages(addr, size) \
65 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) 68 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
66 69
67#define EMERGENCY_PAGES 32 /* = 128KB */ 70#define EMERGENCY_PAGES 32 /* = 128KB */
68 71
69#ifdef CONFIG_AGP 72#ifdef CONFIG_AGP
70#define AGPEXTERN extern 73#define AGPEXTERN extern
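Two of the helpers touched in this hunk are easy to sanity-check outside the kernel: to_pages() rounds an (offset, size) pair up to whole pages, and GPTE_ENCODE()/GPTE_DECODE() pack a greater-than-32-bit physical address into a 32-bit GART entry and back. A quick user-space check; PAGE_SHIFT and the sample addresses are chosen only for the demo:

#include <stdio.h>

typedef unsigned long long u64;

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))
#define round_up(x, y) (((x) + (y) - 1) & ~((unsigned long)(y) - 1))

#define to_pages(addr, size) \
        (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)

#define GPTE_VALID    1
#define GPTE_COHERENT 2
#define GPTE_ENCODE(x) \
        (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28))

int main(void)
{
        u64 phys = 0x123456000ULL;      /* a 36-bit physical page address */
        unsigned gpte = GPTE_ENCODE(phys);

        /* 5000 bytes starting 100 bytes into a page span two pages */
        printf("pages = %lu\n", to_pages(0x1064UL, 5000UL));
        printf("gpte  = %#x, decoded = %#llx\n", gpte, GPTE_DECODE(gpte));
        return 0;
}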
@@ -77,130 +80,152 @@ AGPEXTERN int agp_memory_reserved;
77AGPEXTERN __u32 *agp_gatt_table; 80AGPEXTERN __u32 *agp_gatt_table;
78 81
79static unsigned long next_bit; /* protected by iommu_bitmap_lock */ 82static unsigned long next_bit; /* protected by iommu_bitmap_lock */
80static int need_flush; /* global flush state. set for each gart wrap */ 83static int need_flush; /* global flush state. set for each gart wrap */
81 84
82static unsigned long alloc_iommu(int size) 85static unsigned long alloc_iommu(int size)
83{ 86{
84 unsigned long offset, flags; 87 unsigned long offset, flags;
85 88
86 spin_lock_irqsave(&iommu_bitmap_lock, flags); 89 spin_lock_irqsave(&iommu_bitmap_lock, flags);
87 offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size); 90 offset = find_next_zero_string(iommu_gart_bitmap, next_bit,
91 iommu_pages, size);
88 if (offset == -1) { 92 if (offset == -1) {
89 need_flush = 1; 93 need_flush = 1;
90 offset = find_next_zero_string(iommu_gart_bitmap,0,iommu_pages,size); 94 offset = find_next_zero_string(iommu_gart_bitmap, 0,
95 iommu_pages, size);
91 } 96 }
92 if (offset != -1) { 97 if (offset != -1) {
93 set_bit_string(iommu_gart_bitmap, offset, size); 98 set_bit_string(iommu_gart_bitmap, offset, size);
94 next_bit = offset+size; 99 next_bit = offset+size;
95 if (next_bit >= iommu_pages) { 100 if (next_bit >= iommu_pages) {
96 next_bit = 0; 101 next_bit = 0;
97 need_flush = 1; 102 need_flush = 1;
98 } 103 }
99 } 104 }
100 if (iommu_fullflush) 105 if (iommu_fullflush)
101 need_flush = 1; 106 need_flush = 1;
102 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 107 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
108
103 return offset; 109 return offset;
104} 110}
105 111
106static void free_iommu(unsigned long offset, int size) 112static void free_iommu(unsigned long offset, int size)
107{ 113{
108 unsigned long flags; 114 unsigned long flags;
115
109 spin_lock_irqsave(&iommu_bitmap_lock, flags); 116 spin_lock_irqsave(&iommu_bitmap_lock, flags);
110 __clear_bit_string(iommu_gart_bitmap, offset, size); 117 __clear_bit_string(iommu_gart_bitmap, offset, size);
111 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 118 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
112} 119}
113 120
114/* 121/*
115 * Use global flush state to avoid races with multiple flushers. 122 * Use global flush state to avoid races with multiple flushers.
116 */ 123 */
117static void flush_gart(void) 124static void flush_gart(void)
118{ 125{
119 unsigned long flags; 126 unsigned long flags;
127
120 spin_lock_irqsave(&iommu_bitmap_lock, flags); 128 spin_lock_irqsave(&iommu_bitmap_lock, flags);
121 if (need_flush) { 129 if (need_flush) {
122 k8_flush_garts(); 130 k8_flush_garts();
123 need_flush = 0; 131 need_flush = 0;
124 } 132 }
125 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 133 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
126} 134}
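alloc_iommu() is a next-fit search over the GART bitmap: it scans from the last cursor position, and only when it has to wrap back to the start does it set need_flush, since that is the moment previously used (and possibly still cached) entries can be handed out again. A stand-alone sketch of that policy using a byte-per-page bitmap; the sizes and helper names are invented, the kernel uses find_next_zero_string() and set_bit_string():

#include <stdio.h>

#define IOMMU_PAGES 64

static unsigned char bitmap[IOMMU_PAGES];   /* 0 = free, 1 = mapped */
static unsigned long next_bit;              /* next-fit cursor */
static int need_flush;

static long find_zero_run(unsigned long start, int size)
{
        int run = 0;

        for (unsigned long i = start; i < IOMMU_PAGES; i++) {
                run = bitmap[i] ? 0 : run + 1;
                if (run == size)
                        return i - size + 1;
        }
        return -1;
}

static long alloc_demo(int size)
{
        long offset = find_zero_run(next_bit, size);

        if (offset == -1) {
                need_flush = 1;              /* wrapped: old entries may be reused */
                offset = find_zero_run(0, size);
        }
        if (offset != -1) {
                for (int i = 0; i < size; i++)
                        bitmap[offset + i] = 1;
                next_bit = offset + size;
                if (next_bit >= IOMMU_PAGES) {
                        next_bit = 0;
                        need_flush = 1;
                }
        }
        return offset;
}

static void free_demo(long offset, int size)
{
        for (int i = 0; i < size; i++)
                bitmap[offset + i] = 0;
}

int main(void)
{
        long a = alloc_demo(60);            /* fills pages 0..59 */
        free_demo(a, 8);                    /* free the first 8 again */
        printf("second alloc at %ld, need_flush=%d\n", alloc_demo(8), need_flush);
        return 0;
}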
127 135
128#ifdef CONFIG_IOMMU_LEAK 136#ifdef CONFIG_IOMMU_LEAK
129 137
130#define SET_LEAK(x) if (iommu_leak_tab) \ 138#define SET_LEAK(x) \
131 iommu_leak_tab[x] = __builtin_return_address(0); 139 do { \
132#define CLEAR_LEAK(x) if (iommu_leak_tab) \ 140 if (iommu_leak_tab) \
133 iommu_leak_tab[x] = NULL; 141 iommu_leak_tab[x] = __builtin_return_address(0);\
142 } while (0)
143
144#define CLEAR_LEAK(x) \
145 do { \
146 if (iommu_leak_tab) \
147 iommu_leak_tab[x] = NULL; \
148 } while (0)
134 149
135/* Debugging aid for drivers that don't free their IOMMU tables */ 150/* Debugging aid for drivers that don't free their IOMMU tables */
136static void **iommu_leak_tab; 151static void **iommu_leak_tab;
137static int leak_trace; 152static int leak_trace;
138static int iommu_leak_pages = 20; 153static int iommu_leak_pages = 20;
154
139static void dump_leak(void) 155static void dump_leak(void)
140{ 156{
141 int i; 157 int i;
142 static int dump; 158 static int dump;
143 if (dump || !iommu_leak_tab) return; 159
160 if (dump || !iommu_leak_tab)
161 return;
144 dump = 1; 162 dump = 1;
145 show_stack(NULL,NULL); 163 show_stack(NULL, NULL);
146 /* Very crude. dump some from the end of the table too */ 164
147 printk("Dumping %d pages from end of IOMMU:\n", iommu_leak_pages); 165 /* Very crude. dump some from the end of the table too */
148 for (i = 0; i < iommu_leak_pages; i+=2) { 166 printk(KERN_DEBUG "Dumping %d pages from end of IOMMU:\n",
149 printk("%lu: ", iommu_pages-i); 167 iommu_leak_pages);
150 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i]); 168 for (i = 0; i < iommu_leak_pages; i += 2) {
151 printk("%c", (i+1)%2 == 0 ? '\n' : ' '); 169 printk(KERN_DEBUG "%lu: ", iommu_pages-i);
152 } 170 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], 0);
153 printk("\n"); 171 printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' ');
172 }
173 printk(KERN_DEBUG "\n");
154} 174}
155#else 175#else
156#define SET_LEAK(x) 176# define SET_LEAK(x)
157#define CLEAR_LEAK(x) 177# define CLEAR_LEAK(x)
158#endif 178#endif
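SET_LEAK()/CLEAR_LEAK() are rewritten as do { ... } while (0) blocks, the usual way to make a multi-statement macro behave as a single statement; the old bare-if form pairs badly with a following else. A small illustration of why the wrapper matters, with leak_tab reduced to a local stand-in:

#include <stdio.h>

static void *slots[4];
static void **leak_tab = slots;

/*
 * With the do/while wrapper the macro is one statement, so
 * "if (x) SET_LEAK(0); else ..." parses the way it reads.  Without it,
 * the semicolon after the macro would terminate the if and orphan the else.
 */
#define SET_LEAK(x)                                                     \
        do {                                                            \
                if (leak_tab)                                           \
                        leak_tab[x] = __builtin_return_address(0);      \
        } while (0)

int main(void)
{
        int recording = 1;

        if (recording)
                SET_LEAK(0);
        else
                printf("not recording\n");

        printf("slot 0 is %s\n", leak_tab[0] ? "set" : "empty");
        return 0;
}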
159 179
160static void iommu_full(struct device *dev, size_t size, int dir) 180static void iommu_full(struct device *dev, size_t size, int dir)
161{ 181{
162 /* 182 /*
163 * Ran out of IOMMU space for this operation. This is very bad. 183 * Ran out of IOMMU space for this operation. This is very bad.
164 * Unfortunately the drivers cannot handle this operation properly. 184 * Unfortunately the drivers cannot handle this operation properly.
165 * Return some non mapped prereserved space in the aperture and 185 * Return some non mapped prereserved space in the aperture and
166 * let the Northbridge deal with it. This will result in garbage 186 * let the Northbridge deal with it. This will result in garbage
167 * in the IO operation. When the size exceeds the prereserved space 187 * in the IO operation. When the size exceeds the prereserved space
168 * memory corruption will occur or random memory will be DMAed 188 * memory corruption will occur or random memory will be DMAed
169 * out. Hopefully no network devices use single mappings that big. 189 * out. Hopefully no network devices use single mappings that big.
170 */ 190 */
171 191
172 printk(KERN_ERR 192 printk(KERN_ERR
173 "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n", 193 "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n",
174 size, dev->bus_id); 194 size, dev->bus_id);
175 195
176 if (size > PAGE_SIZE*EMERGENCY_PAGES) { 196 if (size > PAGE_SIZE*EMERGENCY_PAGES) {
177 if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) 197 if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
178 panic("PCI-DMA: Memory would be corrupted\n"); 198 panic("PCI-DMA: Memory would be corrupted\n");
179 if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) 199 if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL)
180 panic(KERN_ERR "PCI-DMA: Random memory would be DMAed\n"); 200 panic(KERN_ERR
181 } 201 "PCI-DMA: Random memory would be DMAed\n");
182 202 }
183#ifdef CONFIG_IOMMU_LEAK 203#ifdef CONFIG_IOMMU_LEAK
184 dump_leak(); 204 dump_leak();
185#endif 205#endif
186} 206}
187 207
188static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) 208static inline int
189{ 209need_iommu(struct device *dev, unsigned long addr, size_t size)
210{
190 u64 mask = *dev->dma_mask; 211 u64 mask = *dev->dma_mask;
191 int high = addr + size > mask; 212 int high = addr + size > mask;
192 int mmu = high; 213 int mmu = high;
193 if (force_iommu) 214
194 mmu = 1; 215 if (force_iommu)
195 return mmu; 216 mmu = 1;
217
218 return mmu;
196} 219}
197 220
198static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) 221static inline int
199{ 222nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
223{
200 u64 mask = *dev->dma_mask; 224 u64 mask = *dev->dma_mask;
201 int high = addr + size > mask; 225 int high = addr + size > mask;
202 int mmu = high; 226 int mmu = high;
203 return mmu; 227
228 return mmu;
204} 229}
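need_iommu() and nonforced_iommu() reduce to one question: does the physical range end above the device's DMA mask? A trivial check of that comparison for a 32-bit-only device; the addresses are arbitrary demo values:

#include <stdio.h>

static int needs_remap(unsigned long long dma_mask,
                       unsigned long long addr, unsigned long long size)
{
        return addr + size > dma_mask;   /* same test as need_iommu() */
}

int main(void)
{
        unsigned long long mask32 = 0xffffffffULL;   /* 32-bit-only device */

        printf("below 4G: %d\n", needs_remap(mask32, 0x7ff00000ULL, 4096));
        printf("above 4G: %d\n", needs_remap(mask32, 0x100000000ULL, 4096));
        return 0;
}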
205 230
206/* Map a single continuous physical area into the IOMMU. 231/* Map a single continuous physical area into the IOMMU.
@@ -208,13 +233,14 @@ static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t
208 */ 233 */
209static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, 234static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
210 size_t size, int dir) 235 size_t size, int dir)
211{ 236{
212 unsigned long npages = to_pages(phys_mem, size); 237 unsigned long npages = to_pages(phys_mem, size);
213 unsigned long iommu_page = alloc_iommu(npages); 238 unsigned long iommu_page = alloc_iommu(npages);
214 int i; 239 int i;
240
215 if (iommu_page == -1) { 241 if (iommu_page == -1) {
216 if (!nonforced_iommu(dev, phys_mem, size)) 242 if (!nonforced_iommu(dev, phys_mem, size))
217 return phys_mem; 243 return phys_mem;
218 if (panic_on_overflow) 244 if (panic_on_overflow)
219 panic("dma_map_area overflow %lu bytes\n", size); 245 panic("dma_map_area overflow %lu bytes\n", size);
220 iommu_full(dev, size, dir); 246 iommu_full(dev, size, dir);
@@ -229,35 +255,39 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
229 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); 255 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
230} 256}
231 257
232static dma_addr_t gart_map_simple(struct device *dev, char *buf, 258static dma_addr_t
233 size_t size, int dir) 259gart_map_simple(struct device *dev, char *buf, size_t size, int dir)
234{ 260{
235 dma_addr_t map = dma_map_area(dev, virt_to_bus(buf), size, dir); 261 dma_addr_t map = dma_map_area(dev, virt_to_bus(buf), size, dir);
262
236 flush_gart(); 263 flush_gart();
264
237 return map; 265 return map;
238} 266}
239 267
240/* Map a single area into the IOMMU */ 268/* Map a single area into the IOMMU */
241static dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir) 269static dma_addr_t
270gart_map_single(struct device *dev, void *addr, size_t size, int dir)
242{ 271{
243 unsigned long phys_mem, bus; 272 unsigned long phys_mem, bus;
244 273
245 if (!dev) 274 if (!dev)
246 dev = &fallback_dev; 275 dev = &fallback_dev;
247 276
248 phys_mem = virt_to_phys(addr); 277 phys_mem = virt_to_phys(addr);
249 if (!need_iommu(dev, phys_mem, size)) 278 if (!need_iommu(dev, phys_mem, size))
250 return phys_mem; 279 return phys_mem;
251 280
252 bus = gart_map_simple(dev, addr, size, dir); 281 bus = gart_map_simple(dev, addr, size, dir);
253 return bus; 282
283 return bus;
254} 284}
255 285
256/* 286/*
257 * Free a DMA mapping. 287 * Free a DMA mapping.
258 */ 288 */
259static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, 289static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
260 size_t size, int direction) 290 size_t size, int direction)
261{ 291{
262 unsigned long iommu_page; 292 unsigned long iommu_page;
263 int npages; 293 int npages;
@@ -266,6 +296,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
266 if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE || 296 if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE ||
267 dma_addr >= iommu_bus_base + iommu_size) 297 dma_addr >= iommu_bus_base + iommu_size)
268 return; 298 return;
299
269 iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; 300 iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
270 npages = to_pages(dma_addr, size); 301 npages = to_pages(dma_addr, size);
271 for (i = 0; i < npages; i++) { 302 for (i = 0; i < npages; i++) {
@@ -278,7 +309,8 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
278/* 309/*
279 * Wrapper for pci_unmap_single working with scatterlists. 310 * Wrapper for pci_unmap_single working with scatterlists.
280 */ 311 */
281static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) 312static void
313gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
282{ 314{
283 struct scatterlist *s; 315 struct scatterlist *s;
284 int i; 316 int i;
@@ -303,12 +335,13 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
303 335
304 for_each_sg(sg, s, nents, i) { 336 for_each_sg(sg, s, nents, i) {
305 unsigned long addr = sg_phys(s); 337 unsigned long addr = sg_phys(s);
306 if (nonforced_iommu(dev, addr, s->length)) { 338
339 if (nonforced_iommu(dev, addr, s->length)) {
307 addr = dma_map_area(dev, addr, s->length, dir); 340 addr = dma_map_area(dev, addr, s->length, dir);
308 if (addr == bad_dma_address) { 341 if (addr == bad_dma_address) {
309 if (i > 0) 342 if (i > 0)
310 gart_unmap_sg(dev, sg, i, dir); 343 gart_unmap_sg(dev, sg, i, dir);
311 nents = 0; 344 nents = 0;
312 sg[0].dma_length = 0; 345 sg[0].dma_length = 0;
313 break; 346 break;
314 } 347 }
@@ -317,15 +350,16 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
317 s->dma_length = s->length; 350 s->dma_length = s->length;
318 } 351 }
319 flush_gart(); 352 flush_gart();
353
320 return nents; 354 return nents;
321} 355}
322 356
323/* Map multiple scatterlist entries continuous into the first. */ 357/* Map multiple scatterlist entries continuous into the first. */
324static int __dma_map_cont(struct scatterlist *start, int nelems, 358static int __dma_map_cont(struct scatterlist *start, int nelems,
325 struct scatterlist *sout, unsigned long pages) 359 struct scatterlist *sout, unsigned long pages)
326{ 360{
327 unsigned long iommu_start = alloc_iommu(pages); 361 unsigned long iommu_start = alloc_iommu(pages);
328 unsigned long iommu_page = iommu_start; 362 unsigned long iommu_page = iommu_start;
329 struct scatterlist *s; 363 struct scatterlist *s;
330 int i; 364 int i;
331 365
@@ -335,32 +369,33 @@ static int __dma_map_cont(struct scatterlist *start, int nelems,
335 for_each_sg(start, s, nelems, i) { 369 for_each_sg(start, s, nelems, i) {
336 unsigned long pages, addr; 370 unsigned long pages, addr;
337 unsigned long phys_addr = s->dma_address; 371 unsigned long phys_addr = s->dma_address;
338 372
339 BUG_ON(s != start && s->offset); 373 BUG_ON(s != start && s->offset);
340 if (s == start) { 374 if (s == start) {
341 sout->dma_address = iommu_bus_base; 375 sout->dma_address = iommu_bus_base;
342 sout->dma_address += iommu_page*PAGE_SIZE + s->offset; 376 sout->dma_address += iommu_page*PAGE_SIZE + s->offset;
343 sout->dma_length = s->length; 377 sout->dma_length = s->length;
344 } else { 378 } else {
345 sout->dma_length += s->length; 379 sout->dma_length += s->length;
346 } 380 }
347 381
348 addr = phys_addr; 382 addr = phys_addr;
349 pages = to_pages(s->offset, s->length); 383 pages = to_pages(s->offset, s->length);
350 while (pages--) { 384 while (pages--) {
351 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); 385 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
352 SET_LEAK(iommu_page); 386 SET_LEAK(iommu_page);
353 addr += PAGE_SIZE; 387 addr += PAGE_SIZE;
354 iommu_page++; 388 iommu_page++;
355 } 389 }
356 } 390 }
357 BUG_ON(iommu_page - iommu_start != pages); 391 BUG_ON(iommu_page - iommu_start != pages);
392
358 return 0; 393 return 0;
359} 394}
360 395
361static inline int dma_map_cont(struct scatterlist *start, int nelems, 396static inline int
362 struct scatterlist *sout, 397dma_map_cont(struct scatterlist *start, int nelems, struct scatterlist *sout,
363 unsigned long pages, int need) 398 unsigned long pages, int need)
364{ 399{
365 if (!need) { 400 if (!need) {
366 BUG_ON(nelems != 1); 401 BUG_ON(nelems != 1);
@@ -370,22 +405,19 @@ static inline int dma_map_cont(struct scatterlist *start, int nelems,
370 } 405 }
371 return __dma_map_cont(start, nelems, sout, pages); 406 return __dma_map_cont(start, nelems, sout, pages);
372} 407}
373 408
374/* 409/*
375 * DMA map all entries in a scatterlist. 410 * DMA map all entries in a scatterlist.
376 * Merge chunks that have page aligned sizes into a continuous mapping. 411 * Merge chunks that have page aligned sizes into a continuous mapping.
377 */ 412 */
378static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, 413static int
379 int dir) 414gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
380{ 415{
381 int i;
382 int out;
383 int start;
384 unsigned long pages = 0;
385 int need = 0, nextneed;
386 struct scatterlist *s, *ps, *start_sg, *sgmap; 416 struct scatterlist *s, *ps, *start_sg, *sgmap;
417 int need = 0, nextneed, i, out, start;
418 unsigned long pages = 0;
387 419
388 if (nents == 0) 420 if (nents == 0)
389 return 0; 421 return 0;
390 422
391 if (!dev) 423 if (!dev)
@@ -397,15 +429,19 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
397 ps = NULL; /* shut up gcc */ 429 ps = NULL; /* shut up gcc */
398 for_each_sg(sg, s, nents, i) { 430 for_each_sg(sg, s, nents, i) {
399 dma_addr_t addr = sg_phys(s); 431 dma_addr_t addr = sg_phys(s);
432
400 s->dma_address = addr; 433 s->dma_address = addr;
401 BUG_ON(s->length == 0); 434 BUG_ON(s->length == 0);
402 435
403 nextneed = need_iommu(dev, addr, s->length); 436 nextneed = need_iommu(dev, addr, s->length);
404 437
405 /* Handle the previous not yet processed entries */ 438 /* Handle the previous not yet processed entries */
406 if (i > start) { 439 if (i > start) {
407 /* Can only merge when the last chunk ends on a page 440 /*
408 boundary and the new one doesn't have an offset. */ 441 * Can only merge when the last chunk ends on a
442 * page boundary and the new one doesn't have an
443 * offset.
444 */
409 if (!iommu_merge || !nextneed || !need || s->offset || 445 if (!iommu_merge || !nextneed || !need || s->offset ||
410 (ps->offset + ps->length) % PAGE_SIZE) { 446 (ps->offset + ps->length) % PAGE_SIZE) {
411 if (dma_map_cont(start_sg, i - start, sgmap, 447 if (dma_map_cont(start_sg, i - start, sgmap,
@@ -436,6 +472,7 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
436error: 472error:
437 flush_gart(); 473 flush_gart();
438 gart_unmap_sg(dev, sg, out, dir); 474 gart_unmap_sg(dev, sg, out, dir);
475
439 /* When it was forced or merged try again in a dumb way */ 476 /* When it was forced or merged try again in a dumb way */
440 if (force_iommu || iommu_merge) { 477 if (force_iommu || iommu_merge) {
441 out = dma_map_sg_nonforce(dev, sg, nents, dir); 478 out = dma_map_sg_nonforce(dev, sg, nents, dir);
@@ -444,64 +481,68 @@ error:
444 } 481 }
445 if (panic_on_overflow) 482 if (panic_on_overflow)
446 panic("dma_map_sg: overflow on %lu pages\n", pages); 483 panic("dma_map_sg: overflow on %lu pages\n", pages);
484
447 iommu_full(dev, pages << PAGE_SHIFT, dir); 485 iommu_full(dev, pages << PAGE_SHIFT, dir);
448 for_each_sg(sg, s, nents, i) 486 for_each_sg(sg, s, nents, i)
449 s->dma_address = bad_dma_address; 487 s->dma_address = bad_dma_address;
450 return 0; 488 return 0;
451} 489}
452 490
453static int no_agp; 491static int no_agp;
454 492
455static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) 493static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
456{ 494{
457 unsigned long a; 495 unsigned long a;
458 if (!iommu_size) { 496
459 iommu_size = aper_size; 497 if (!iommu_size) {
460 if (!no_agp) 498 iommu_size = aper_size;
461 iommu_size /= 2; 499 if (!no_agp)
462 } 500 iommu_size /= 2;
463 501 }
464 a = aper + iommu_size; 502
503 a = aper + iommu_size;
465 iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a; 504 iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a;
466 505
467 if (iommu_size < 64*1024*1024) 506 if (iommu_size < 64*1024*1024) {
468 printk(KERN_WARNING 507 printk(KERN_WARNING
469 "PCI-DMA: Warning: Small IOMMU %luMB. Consider increasing the AGP aperture in BIOS\n",iommu_size>>20); 508 "PCI-DMA: Warning: Small IOMMU %luMB."
470 509 " Consider increasing the AGP aperture in BIOS\n",
510 iommu_size >> 20);
511 }
512
471 return iommu_size; 513 return iommu_size;
472} 514}
473 515
474static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) 516static __init unsigned read_aperture(struct pci_dev *dev, u32 *size)
475{ 517{
476 unsigned aper_size = 0, aper_base_32; 518 unsigned aper_size = 0, aper_base_32, aper_order;
477 u64 aper_base; 519 u64 aper_base;
478 unsigned aper_order;
479 520
480 pci_read_config_dword(dev, 0x94, &aper_base_32); 521 pci_read_config_dword(dev, 0x94, &aper_base_32);
481 pci_read_config_dword(dev, 0x90, &aper_order); 522 pci_read_config_dword(dev, 0x90, &aper_order);
482 aper_order = (aper_order >> 1) & 7; 523 aper_order = (aper_order >> 1) & 7;
483 524
484 aper_base = aper_base_32 & 0x7fff; 525 aper_base = aper_base_32 & 0x7fff;
485 aper_base <<= 25; 526 aper_base <<= 25;
486 527
487 aper_size = (32 * 1024 * 1024) << aper_order; 528 aper_size = (32 * 1024 * 1024) << aper_order;
488 if (aper_base + aper_size > 0x100000000UL || !aper_size) 529 if (aper_base + aper_size > 0x100000000UL || !aper_size)
489 aper_base = 0; 530 aper_base = 0;
490 531
491 *size = aper_size; 532 *size = aper_size;
492 return aper_base; 533 return aper_base;
493} 534}
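read_aperture() reconstructs the aperture from two northbridge config registers: bits 14:0 of offset 0x94 hold the base shifted right by 25, and bits 3:1 of offset 0x90 give the order applied to a 32MB minimum size. Worked through with made-up register values:

#include <stdio.h>

int main(void)
{
        /* made-up raw values as read from config offsets 0x94 and 0x90 */
        unsigned aper_base_32  = 0x00000040;  /* base >> 25 in bits 14:0 */
        unsigned aper_order_rg = 0x00000006;  /* order in bits 3:1       */

        unsigned order = (aper_order_rg >> 1) & 7;
        unsigned long long base = (unsigned long long)(aper_base_32 & 0x7fff) << 25;
        unsigned long long size = (32ULL << 20) << order;

        printf("aperture base %#llx, size %lluMB\n", base, size >> 20);
        return 0;
}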
494 535
495/* 536/*
496 * Private Northbridge GATT initialization in case we cannot use the 537 * Private Northbridge GATT initialization in case we cannot use the
497 * AGP driver for some reason. 538 * AGP driver for some reason.
498 */ 539 */
499static __init int init_k8_gatt(struct agp_kern_info *info) 540static __init int init_k8_gatt(struct agp_kern_info *info)
500{ 541{
542 unsigned aper_size, gatt_size, new_aper_size;
543 unsigned aper_base, new_aper_base;
501 struct pci_dev *dev; 544 struct pci_dev *dev;
502 void *gatt; 545 void *gatt;
503 unsigned aper_base, new_aper_base;
504 unsigned aper_size, gatt_size, new_aper_size;
505 int i; 546 int i;
506 547
507 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); 548 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
@@ -509,75 +550,75 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
509 dev = NULL; 550 dev = NULL;
510 for (i = 0; i < num_k8_northbridges; i++) { 551 for (i = 0; i < num_k8_northbridges; i++) {
511 dev = k8_northbridges[i]; 552 dev = k8_northbridges[i];
512 new_aper_base = read_aperture(dev, &new_aper_size); 553 new_aper_base = read_aperture(dev, &new_aper_size);
513 if (!new_aper_base) 554 if (!new_aper_base)
514 goto nommu; 555 goto nommu;
515 556
516 if (!aper_base) { 557 if (!aper_base) {
517 aper_size = new_aper_size; 558 aper_size = new_aper_size;
518 aper_base = new_aper_base; 559 aper_base = new_aper_base;
519 } 560 }
520 if (aper_size != new_aper_size || aper_base != new_aper_base) 561 if (aper_size != new_aper_size || aper_base != new_aper_base)
521 goto nommu; 562 goto nommu;
522 } 563 }
523 if (!aper_base) 564 if (!aper_base)
524 goto nommu; 565 goto nommu;
525 info->aper_base = aper_base; 566 info->aper_base = aper_base;
526 info->aper_size = aper_size>>20; 567 info->aper_size = aper_size >> 20;
527 568
528 gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); 569 gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32);
529 gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); 570 gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size));
530 if (!gatt) 571 if (!gatt)
531 panic("Cannot allocate GATT table"); 572 panic("Cannot allocate GATT table");
532 if (change_page_attr_addr((unsigned long)gatt, gatt_size >> PAGE_SHIFT, PAGE_KERNEL_NOCACHE)) 573 if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT))
533 panic("Could not set GART PTEs to uncacheable pages"); 574 panic("Could not set GART PTEs to uncacheable pages");
534 global_flush_tlb();
535 575
536 memset(gatt, 0, gatt_size); 576 memset(gatt, 0, gatt_size);
537 agp_gatt_table = gatt; 577 agp_gatt_table = gatt;
538 578
539 for (i = 0; i < num_k8_northbridges; i++) { 579 for (i = 0; i < num_k8_northbridges; i++) {
540 u32 ctl; 580 u32 gatt_reg;
541 u32 gatt_reg; 581 u32 ctl;
542 582
543 dev = k8_northbridges[i]; 583 dev = k8_northbridges[i];
544 gatt_reg = __pa(gatt) >> 12; 584 gatt_reg = __pa(gatt) >> 12;
545 gatt_reg <<= 4; 585 gatt_reg <<= 4;
546 pci_write_config_dword(dev, 0x98, gatt_reg); 586 pci_write_config_dword(dev, 0x98, gatt_reg);
547 pci_read_config_dword(dev, 0x90, &ctl); 587 pci_read_config_dword(dev, 0x90, &ctl);
548 588
549 ctl |= 1; 589 ctl |= 1;
550 ctl &= ~((1<<4) | (1<<5)); 590 ctl &= ~((1<<4) | (1<<5));
551 591
552 pci_write_config_dword(dev, 0x90, ctl); 592 pci_write_config_dword(dev, 0x90, ctl);
553 } 593 }
554 flush_gart(); 594 flush_gart();
555 595
556 printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10); 596 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n",
597 aper_base, aper_size>>10);
557 return 0; 598 return 0;
558 599
559 nommu: 600 nommu:
560 /* Should not happen anymore */ 601 /* Should not happen anymore */
561 printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n" 602 printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n"
562 KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.\n"); 603 KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.\n");
563 return -1; 604 return -1;
564} 605}
565 606
566extern int agp_amd64_init(void); 607extern int agp_amd64_init(void);
567 608
568static const struct dma_mapping_ops gart_dma_ops = { 609static const struct dma_mapping_ops gart_dma_ops = {
569 .mapping_error = NULL, 610 .mapping_error = NULL,
570 .map_single = gart_map_single, 611 .map_single = gart_map_single,
571 .map_simple = gart_map_simple, 612 .map_simple = gart_map_simple,
572 .unmap_single = gart_unmap_single, 613 .unmap_single = gart_unmap_single,
573 .sync_single_for_cpu = NULL, 614 .sync_single_for_cpu = NULL,
574 .sync_single_for_device = NULL, 615 .sync_single_for_device = NULL,
575 .sync_single_range_for_cpu = NULL, 616 .sync_single_range_for_cpu = NULL,
576 .sync_single_range_for_device = NULL, 617 .sync_single_range_for_device = NULL,
577 .sync_sg_for_cpu = NULL, 618 .sync_sg_for_cpu = NULL,
578 .sync_sg_for_device = NULL, 619 .sync_sg_for_device = NULL,
579 .map_sg = gart_map_sg, 620 .map_sg = gart_map_sg,
580 .unmap_sg = gart_unmap_sg, 621 .unmap_sg = gart_unmap_sg,
581}; 622};
582 623
583void gart_iommu_shutdown(void) 624void gart_iommu_shutdown(void)
@@ -588,23 +629,23 @@ void gart_iommu_shutdown(void)
588 if (no_agp && (dma_ops != &gart_dma_ops)) 629 if (no_agp && (dma_ops != &gart_dma_ops))
589 return; 630 return;
590 631
591 for (i = 0; i < num_k8_northbridges; i++) { 632 for (i = 0; i < num_k8_northbridges; i++) {
592 u32 ctl; 633 u32 ctl;
593 634
594 dev = k8_northbridges[i]; 635 dev = k8_northbridges[i];
595 pci_read_config_dword(dev, 0x90, &ctl); 636 pci_read_config_dword(dev, 0x90, &ctl);
596 637
597 ctl &= ~1; 638 ctl &= ~1;
598 639
599 pci_write_config_dword(dev, 0x90, ctl); 640 pci_write_config_dword(dev, 0x90, ctl);
600 } 641 }
601} 642}
602 643
603void __init gart_iommu_init(void) 644void __init gart_iommu_init(void)
604{ 645{
605 struct agp_kern_info info; 646 struct agp_kern_info info;
606 unsigned long aper_size;
607 unsigned long iommu_start; 647 unsigned long iommu_start;
648 unsigned long aper_size;
608 unsigned long scratch; 649 unsigned long scratch;
609 long i; 650 long i;
610 651
@@ -614,14 +655,14 @@ void __init gart_iommu_init(void)
614 } 655 }
615 656
616#ifndef CONFIG_AGP_AMD64 657#ifndef CONFIG_AGP_AMD64
617 no_agp = 1; 658 no_agp = 1;
618#else 659#else
619 /* Makefile puts PCI initialization via subsys_initcall first. */ 660 /* Makefile puts PCI initialization via subsys_initcall first. */
620 /* Add other K8 AGP bridge drivers here */ 661 /* Add other K8 AGP bridge drivers here */
621 no_agp = no_agp || 662 no_agp = no_agp ||
622 (agp_amd64_init() < 0) || 663 (agp_amd64_init() < 0) ||
623 (agp_copy_info(agp_bridge, &info) < 0); 664 (agp_copy_info(agp_bridge, &info) < 0);
624#endif 665#endif
625 666
626 if (swiotlb) 667 if (swiotlb)
627 return; 668 return;
@@ -643,77 +684,78 @@ void __init gart_iommu_init(void)
643 } 684 }
644 685
645 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); 686 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n");
646 aper_size = info.aper_size * 1024 * 1024; 687 aper_size = info.aper_size * 1024 * 1024;
647 iommu_size = check_iommu_size(info.aper_base, aper_size); 688 iommu_size = check_iommu_size(info.aper_base, aper_size);
648 iommu_pages = iommu_size >> PAGE_SHIFT; 689 iommu_pages = iommu_size >> PAGE_SHIFT;
649 690
650 iommu_gart_bitmap = (void*)__get_free_pages(GFP_KERNEL, 691 iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL,
651 get_order(iommu_pages/8)); 692 get_order(iommu_pages/8));
652 if (!iommu_gart_bitmap) 693 if (!iommu_gart_bitmap)
653 panic("Cannot allocate iommu bitmap\n"); 694 panic("Cannot allocate iommu bitmap\n");
654 memset(iommu_gart_bitmap, 0, iommu_pages/8); 695 memset(iommu_gart_bitmap, 0, iommu_pages/8);
655 696
656#ifdef CONFIG_IOMMU_LEAK 697#ifdef CONFIG_IOMMU_LEAK
657 if (leak_trace) { 698 if (leak_trace) {
658 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, 699 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL,
659 get_order(iommu_pages*sizeof(void *))); 700 get_order(iommu_pages*sizeof(void *)));
660 if (iommu_leak_tab) 701 if (iommu_leak_tab)
661 memset(iommu_leak_tab, 0, iommu_pages * 8); 702 memset(iommu_leak_tab, 0, iommu_pages * 8);
662 else 703 else
663 printk("PCI-DMA: Cannot allocate leak trace area\n"); 704 printk(KERN_DEBUG
664 } 705 "PCI-DMA: Cannot allocate leak trace area\n");
706 }
665#endif 707#endif
666 708
667 /* 709 /*
668 * Out of IOMMU space handling. 710 * Out of IOMMU space handling.
669 * Reserve some invalid pages at the beginning of the GART. 711 * Reserve some invalid pages at the beginning of the GART.
670 */ 712 */
671 set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); 713 set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
672 714
673 agp_memory_reserved = iommu_size; 715 agp_memory_reserved = iommu_size;
674 printk(KERN_INFO 716 printk(KERN_INFO
675 "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", 717 "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
676 iommu_size>>20); 718 iommu_size >> 20);
677 719
678 iommu_start = aper_size - iommu_size; 720 iommu_start = aper_size - iommu_size;
679 iommu_bus_base = info.aper_base + iommu_start; 721 iommu_bus_base = info.aper_base + iommu_start;
680 bad_dma_address = iommu_bus_base; 722 bad_dma_address = iommu_bus_base;
681 iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); 723 iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
682 724
683 /* 725 /*
684 * Unmap the IOMMU part of the GART. The alias of the page is 726 * Unmap the IOMMU part of the GART. The alias of the page is
685 * always mapped with cache enabled and there is no full cache 727 * always mapped with cache enabled and there is no full cache
686 * coherency across the GART remapping. The unmapping avoids 728 * coherency across the GART remapping. The unmapping avoids
687 * automatic prefetches from the CPU allocating cache lines in 729 * automatic prefetches from the CPU allocating cache lines in
688 * there. All CPU accesses are done via the direct mapping to 730 * there. All CPU accesses are done via the direct mapping to
689 * the backing memory. The GART address is only used by PCI 731 * the backing memory. The GART address is only used by PCI
690 * devices. 732 * devices.
691 */ 733 */
692 clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size); 734 clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size);
693 735
694 /* 736 /*
695 * Try to workaround a bug (thanks to BenH) 737 * Try to workaround a bug (thanks to BenH)
696 * Set unmapped entries to a scratch page instead of 0. 738 * Set unmapped entries to a scratch page instead of 0.
697 * Any prefetches that hit unmapped entries won't get an bus abort 739 * Any prefetches that hit unmapped entries won't get an bus abort
698 * then. 740 * then.
699 */ 741 */
700 scratch = get_zeroed_page(GFP_KERNEL); 742 scratch = get_zeroed_page(GFP_KERNEL);
701 if (!scratch) 743 if (!scratch)
702 panic("Cannot allocate iommu scratch page"); 744 panic("Cannot allocate iommu scratch page");
703 gart_unmapped_entry = GPTE_ENCODE(__pa(scratch)); 745 gart_unmapped_entry = GPTE_ENCODE(__pa(scratch));
704 for (i = EMERGENCY_PAGES; i < iommu_pages; i++) 746 for (i = EMERGENCY_PAGES; i < iommu_pages; i++)
705 iommu_gatt_base[i] = gart_unmapped_entry; 747 iommu_gatt_base[i] = gart_unmapped_entry;
706 748
707 flush_gart(); 749 flush_gart();
708 dma_ops = &gart_dma_ops; 750 dma_ops = &gart_dma_ops;
709} 751}
710 752
711void __init gart_parse_options(char *p) 753void __init gart_parse_options(char *p)
712{ 754{
713 int arg; 755 int arg;
714 756
715#ifdef CONFIG_IOMMU_LEAK 757#ifdef CONFIG_IOMMU_LEAK
716 if (!strncmp(p,"leak",4)) { 758 if (!strncmp(p, "leak", 4)) {
717 leak_trace = 1; 759 leak_trace = 1;
718 p += 4; 760 p += 4;
719 if (*p == '=') ++p; 761 if (*p == '=') ++p;
@@ -723,18 +765,18 @@ void __init gart_parse_options(char *p)
723#endif 765#endif
724 if (isdigit(*p) && get_option(&p, &arg)) 766 if (isdigit(*p) && get_option(&p, &arg))
725 iommu_size = arg; 767 iommu_size = arg;
726 if (!strncmp(p, "fullflush",8)) 768 if (!strncmp(p, "fullflush", 8))
727 iommu_fullflush = 1; 769 iommu_fullflush = 1;
728 if (!strncmp(p, "nofullflush",11)) 770 if (!strncmp(p, "nofullflush", 11))
729 iommu_fullflush = 0; 771 iommu_fullflush = 0;
730 if (!strncmp(p,"noagp",5)) 772 if (!strncmp(p, "noagp", 5))
731 no_agp = 1; 773 no_agp = 1;
732 if (!strncmp(p, "noaperture",10)) 774 if (!strncmp(p, "noaperture", 10))
733 fix_aperture = 0; 775 fix_aperture = 0;
734 /* duplicated from pci-dma.c */ 776 /* duplicated from pci-dma.c */
735 if (!strncmp(p,"force",5)) 777 if (!strncmp(p, "force", 5))
736 gart_iommu_aperture_allowed = 1; 778 gart_iommu_aperture_allowed = 1;
737 if (!strncmp(p,"allowed",7)) 779 if (!strncmp(p, "allowed", 7))
738 gart_iommu_aperture_allowed = 1; 780 gart_iommu_aperture_allowed = 1;
739 if (!strncmp(p, "memaper", 7)) { 781 if (!strncmp(p, "memaper", 7)) {
740 fallback_aper_force = 1; 782 fallback_aper_force = 1;
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 102866d729a5..82a0a674a003 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -10,7 +10,6 @@
10#include <asm/dma.h> 10#include <asm/dma.h>
11 11
12int swiotlb __read_mostly; 12int swiotlb __read_mostly;
13EXPORT_SYMBOL(swiotlb);
14 13
15const struct dma_mapping_ops swiotlb_dma_ops = { 14const struct dma_mapping_ops swiotlb_dma_ops = {
16 .mapping_error = swiotlb_dma_mapping_error, 15 .mapping_error = swiotlb_dma_mapping_error,
diff --git a/arch/x86/kernel/pmtimer_64.c b/arch/x86/kernel/pmtimer_64.c
index ae8f91214f15..b112406f1996 100644
--- a/arch/x86/kernel/pmtimer_64.c
+++ b/arch/x86/kernel/pmtimer_64.c
@@ -19,13 +19,13 @@
19#include <linux/time.h> 19#include <linux/time.h>
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/cpumask.h> 21#include <linux/cpumask.h>
22#include <linux/acpi_pmtmr.h>
23
22#include <asm/io.h> 24#include <asm/io.h>
23#include <asm/proto.h> 25#include <asm/proto.h>
24#include <asm/msr.h> 26#include <asm/msr.h>
25#include <asm/vsyscall.h> 27#include <asm/vsyscall.h>
26 28
27#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */
28
29static inline u32 cyc2us(u32 cycles) 29static inline u32 cyc2us(u32 cycles)
30{ 30{
31 /* The Power Management Timer ticks at 3.579545 ticks per microsecond. 31 /* The Power Management Timer ticks at 3.579545 ticks per microsecond.
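Pulling in <linux/acpi_pmtmr.h> drops the local ACPI_PM_MASK copy; the rest of the file still converts 3.579545MHz PM-timer ticks to microseconds. One common integer approximation of that division, checked in user space; the *286>>10 constant is an assumption here, chosen because 286/1024 is roughly 1/3.58:

#include <stdio.h>

static unsigned cyc2us_approx(unsigned cycles)
{
        return (cycles * 286) >> 10;    /* 286/1024 ~= 1/3.5795 */
}

int main(void)
{
        /* one second worth of 3.579545MHz PM timer ticks */
        printf("%u us (exact: 1000000)\n", cyc2us_approx(3579545));
        return 0;
}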
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 46d391d49de8..968371ab223a 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -55,6 +55,7 @@
55 55
56#include <asm/tlbflush.h> 56#include <asm/tlbflush.h>
57#include <asm/cpu.h> 57#include <asm/cpu.h>
58#include <asm/kdebug.h>
58 59
59asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 60asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
60 61
@@ -74,7 +75,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number);
74 */ 75 */
75unsigned long thread_saved_pc(struct task_struct *tsk) 76unsigned long thread_saved_pc(struct task_struct *tsk)
76{ 77{
77 return ((unsigned long *)tsk->thread.esp)[3]; 78 return ((unsigned long *)tsk->thread.sp)[3];
78} 79}
79 80
80/* 81/*
@@ -113,10 +114,19 @@ void default_idle(void)
113 smp_mb(); 114 smp_mb();
114 115
115 local_irq_disable(); 116 local_irq_disable();
116 if (!need_resched()) 117 if (!need_resched()) {
118 ktime_t t0, t1;
119 u64 t0n, t1n;
120
121 t0 = ktime_get();
122 t0n = ktime_to_ns(t0);
117 safe_halt(); /* enables interrupts racelessly */ 123 safe_halt(); /* enables interrupts racelessly */
118 else 124 local_irq_disable();
119 local_irq_enable(); 125 t1 = ktime_get();
126 t1n = ktime_to_ns(t1);
127 sched_clock_idle_wakeup_event(t1n - t0n);
128 }
129 local_irq_enable();
120 current_thread_info()->status |= TS_POLLING; 130 current_thread_info()->status |= TS_POLLING;
121 } else { 131 } else {
122 /* loop is done by the caller */ 132 /* loop is done by the caller */
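default_idle() now brackets the halt with ktime_get() and reports the slept interval to sched_clock_idle_wakeup_event(), so the scheduler clock can account for time spent halted. The same t0/t1 bracketing pattern, approximated in user space around a blocking call instead of safe_halt():

#include <stdio.h>
#include <time.h>

static long long ts_to_ns(struct timespec t)
{
        return (long long)t.tv_sec * 1000000000LL + t.tv_nsec;
}

int main(void)
{
        struct timespec t0, t1;
        struct timespec nap = { 0, 10 * 1000 * 1000 };  /* ~10ms */

        clock_gettime(CLOCK_MONOTONIC, &t0);
        nanosleep(&nap, NULL);          /* stand-in for halting until an interrupt */
        clock_gettime(CLOCK_MONOTONIC, &t1);

        printf("idle for %lld ns\n", ts_to_ns(t1) - ts_to_ns(t0));
        return 0;
}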
@@ -132,7 +142,7 @@ EXPORT_SYMBOL(default_idle);
132 * to poll the ->work.need_resched flag instead of waiting for the 142 * to poll the ->work.need_resched flag instead of waiting for the
133 * cross-CPU IPI to arrive. Use this option with caution. 143 * cross-CPU IPI to arrive. Use this option with caution.
134 */ 144 */
135static void poll_idle (void) 145static void poll_idle(void)
136{ 146{
137 cpu_relax(); 147 cpu_relax();
138} 148}
@@ -188,6 +198,9 @@ void cpu_idle(void)
188 rmb(); 198 rmb();
189 idle = pm_idle; 199 idle = pm_idle;
190 200
201 if (rcu_pending(cpu))
202 rcu_check_callbacks(cpu, 0);
203
191 if (!idle) 204 if (!idle)
192 idle = default_idle; 205 idle = default_idle;
193 206
@@ -255,13 +268,13 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
255 * New with Core Duo processors, MWAIT can take some hints based on CPU 268 * New with Core Duo processors, MWAIT can take some hints based on CPU
256 * capability. 269 * capability.
257 */ 270 */
258void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) 271void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
259{ 272{
260 if (!need_resched()) { 273 if (!need_resched()) {
261 __monitor((void *)&current_thread_info()->flags, 0, 0); 274 __monitor((void *)&current_thread_info()->flags, 0, 0);
262 smp_mb(); 275 smp_mb();
263 if (!need_resched()) 276 if (!need_resched())
264 __mwait(eax, ecx); 277 __mwait(ax, cx);
265 } 278 }
266} 279}
267 280
@@ -272,19 +285,37 @@ static void mwait_idle(void)
272 mwait_idle_with_hints(0, 0); 285 mwait_idle_with_hints(0, 0);
273} 286}
274 287
288static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
289{
290 if (force_mwait)
291 return 1;
292 /* Any C1 states supported? */
293 return c->cpuid_level >= 5 && ((cpuid_edx(5) >> 4) & 0xf) > 0;
294}
295
275void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) 296void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
276{ 297{
277 if (cpu_has(c, X86_FEATURE_MWAIT)) { 298 static int selected;
278 printk("monitor/mwait feature present.\n"); 299
300 if (selected)
301 return;
302#ifdef CONFIG_X86_SMP
303 if (pm_idle == poll_idle && smp_num_siblings > 1) {
304 printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
305 " performance may degrade.\n");
306 }
307#endif
308 if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
279 /* 309 /*
280 * Skip, if setup has overridden idle. 310 * Skip, if setup has overridden idle.
281 * One CPU supports mwait => All CPUs supports mwait 311 * One CPU supports mwait => All CPUs supports mwait
282 */ 312 */
283 if (!pm_idle) { 313 if (!pm_idle) {
284 printk("using mwait in idle threads.\n"); 314 printk(KERN_INFO "using mwait in idle threads.\n");
285 pm_idle = mwait_idle; 315 pm_idle = mwait_idle;
286 } 316 }
287 } 317 }
318 selected = 1;
288} 319}
289 320
290static int __init idle_setup(char *str) 321static int __init idle_setup(char *str)
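
mwait_usable() above keys off CPUID leaf 5: EDX bits 7:4 enumerate how many C1 sub-states MWAIT supports, and force_mwait ("idle=mwait") overrides the check. A small user-space probe of the same bits, assuming GCC's <cpuid.h>; __get_cpuid() already returns 0 when the CPU's highest basic leaf is below 5, which mirrors the cpuid_level >= 5 test.

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            /* Returns 0 if leaf 5 is beyond the CPU's highest basic leaf. */
            if (!__get_cpuid(5, &eax, &ebx, &ecx, &edx)) {
                    printf("CPUID leaf 5 not available\n");
                    return 1;
            }

            /* EDX[7:4] = number of C1 sub-states reachable through MWAIT. */
            printf("C1 MWAIT sub-states: %u\n", (edx >> 4) & 0xf);
            return 0;
    }
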
@@ -292,10 +323,6 @@ static int __init idle_setup(char *str)
292 if (!strcmp(str, "poll")) { 323 if (!strcmp(str, "poll")) {
293 printk("using polling idle threads.\n"); 324 printk("using polling idle threads.\n");
294 pm_idle = poll_idle; 325 pm_idle = poll_idle;
295#ifdef CONFIG_X86_SMP
296 if (smp_num_siblings > 1)
297 printk("WARNING: polling idle and HT enabled, performance may degrade.\n");
298#endif
299 } else if (!strcmp(str, "mwait")) 326 } else if (!strcmp(str, "mwait"))
300 force_mwait = 1; 327 force_mwait = 1;
301 else 328 else
@@ -310,15 +337,15 @@ void __show_registers(struct pt_regs *regs, int all)
310{ 337{
311 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; 338 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
312 unsigned long d0, d1, d2, d3, d6, d7; 339 unsigned long d0, d1, d2, d3, d6, d7;
313 unsigned long esp; 340 unsigned long sp;
314 unsigned short ss, gs; 341 unsigned short ss, gs;
315 342
316 if (user_mode_vm(regs)) { 343 if (user_mode_vm(regs)) {
317 esp = regs->esp; 344 sp = regs->sp;
318 ss = regs->xss & 0xffff; 345 ss = regs->ss & 0xffff;
319 savesegment(gs, gs); 346 savesegment(gs, gs);
320 } else { 347 } else {
321 esp = (unsigned long) (&regs->esp); 348 sp = (unsigned long) (&regs->sp);
322 savesegment(ss, ss); 349 savesegment(ss, ss);
323 savesegment(gs, gs); 350 savesegment(gs, gs);
324 } 351 }
@@ -331,17 +358,17 @@ void __show_registers(struct pt_regs *regs, int all)
331 init_utsname()->version); 358 init_utsname()->version);
332 359
333 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", 360 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
334 0xffff & regs->xcs, regs->eip, regs->eflags, 361 0xffff & regs->cs, regs->ip, regs->flags,
335 smp_processor_id()); 362 smp_processor_id());
336 print_symbol("EIP is at %s\n", regs->eip); 363 print_symbol("EIP is at %s\n", regs->ip);
337 364
338 printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", 365 printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
339 regs->eax, regs->ebx, regs->ecx, regs->edx); 366 regs->ax, regs->bx, regs->cx, regs->dx);
340 printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", 367 printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
341 regs->esi, regs->edi, regs->ebp, esp); 368 regs->si, regs->di, regs->bp, sp);
342 printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", 369 printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
343 regs->xds & 0xffff, regs->xes & 0xffff, 370 regs->ds & 0xffff, regs->es & 0xffff,
344 regs->xfs & 0xffff, gs, ss); 371 regs->fs & 0xffff, gs, ss);
345 372
346 if (!all) 373 if (!all)
347 return; 374 return;
@@ -369,12 +396,12 @@ void __show_registers(struct pt_regs *regs, int all)
369void show_regs(struct pt_regs *regs) 396void show_regs(struct pt_regs *regs)
370{ 397{
371 __show_registers(regs, 1); 398 __show_registers(regs, 1);
372 show_trace(NULL, regs, &regs->esp); 399 show_trace(NULL, regs, &regs->sp, regs->bp);
373} 400}
374 401
375/* 402/*
376 * This gets run with %ebx containing the 403 * This gets run with %bx containing the
377 * function to call, and %edx containing 404 * function to call, and %dx containing
378 * the "args". 405 * the "args".
379 */ 406 */
380extern void kernel_thread_helper(void); 407extern void kernel_thread_helper(void);
@@ -388,16 +415,16 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
388 415
389 memset(&regs, 0, sizeof(regs)); 416 memset(&regs, 0, sizeof(regs));
390 417
391 regs.ebx = (unsigned long) fn; 418 regs.bx = (unsigned long) fn;
392 regs.edx = (unsigned long) arg; 419 regs.dx = (unsigned long) arg;
393 420
394 regs.xds = __USER_DS; 421 regs.ds = __USER_DS;
395 regs.xes = __USER_DS; 422 regs.es = __USER_DS;
396 regs.xfs = __KERNEL_PERCPU; 423 regs.fs = __KERNEL_PERCPU;
397 regs.orig_eax = -1; 424 regs.orig_ax = -1;
398 regs.eip = (unsigned long) kernel_thread_helper; 425 regs.ip = (unsigned long) kernel_thread_helper;
399 regs.xcs = __KERNEL_CS | get_kernel_rpl(); 426 regs.cs = __KERNEL_CS | get_kernel_rpl();
400 regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; 427 regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
401 428
402 /* Ok, create the new process.. */ 429 /* Ok, create the new process.. */
403 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL); 430 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
@@ -435,7 +462,12 @@ void flush_thread(void)
435{ 462{
436 struct task_struct *tsk = current; 463 struct task_struct *tsk = current;
437 464
438 memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); 465 tsk->thread.debugreg0 = 0;
466 tsk->thread.debugreg1 = 0;
467 tsk->thread.debugreg2 = 0;
468 tsk->thread.debugreg3 = 0;
469 tsk->thread.debugreg6 = 0;
470 tsk->thread.debugreg7 = 0;
439 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); 471 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
440 clear_tsk_thread_flag(tsk, TIF_DEBUG); 472 clear_tsk_thread_flag(tsk, TIF_DEBUG);
441 /* 473 /*
@@ -460,7 +492,7 @@ void prepare_to_copy(struct task_struct *tsk)
460 unlazy_fpu(tsk); 492 unlazy_fpu(tsk);
461} 493}
462 494
463int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, 495int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
464 unsigned long unused, 496 unsigned long unused,
465 struct task_struct * p, struct pt_regs * regs) 497 struct task_struct * p, struct pt_regs * regs)
466{ 498{
@@ -470,15 +502,15 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
470 502
471 childregs = task_pt_regs(p); 503 childregs = task_pt_regs(p);
472 *childregs = *regs; 504 *childregs = *regs;
473 childregs->eax = 0; 505 childregs->ax = 0;
474 childregs->esp = esp; 506 childregs->sp = sp;
475 507
476 p->thread.esp = (unsigned long) childregs; 508 p->thread.sp = (unsigned long) childregs;
477 p->thread.esp0 = (unsigned long) (childregs+1); 509 p->thread.sp0 = (unsigned long) (childregs+1);
478 510
479 p->thread.eip = (unsigned long) ret_from_fork; 511 p->thread.ip = (unsigned long) ret_from_fork;
480 512
481 savesegment(gs,p->thread.gs); 513 savesegment(gs, p->thread.gs);
482 514
483 tsk = current; 515 tsk = current;
484 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { 516 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
@@ -491,32 +523,15 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
491 set_tsk_thread_flag(p, TIF_IO_BITMAP); 523 set_tsk_thread_flag(p, TIF_IO_BITMAP);
492 } 524 }
493 525
526 err = 0;
527
494 /* 528 /*
495 * Set a new TLS for the child thread? 529 * Set a new TLS for the child thread?
496 */ 530 */
497 if (clone_flags & CLONE_SETTLS) { 531 if (clone_flags & CLONE_SETTLS)
498 struct desc_struct *desc; 532 err = do_set_thread_area(p, -1,
499 struct user_desc info; 533 (struct user_desc __user *)childregs->si, 0);
500 int idx;
501
502 err = -EFAULT;
503 if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
504 goto out;
505 err = -EINVAL;
506 if (LDT_empty(&info))
507 goto out;
508
509 idx = info.entry_number;
510 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
511 goto out;
512
513 desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
514 desc->a = LDT_entry_a(&info);
515 desc->b = LDT_entry_b(&info);
516 }
517 534
518 err = 0;
519 out:
520 if (err && p->thread.io_bitmap_ptr) { 535 if (err && p->thread.io_bitmap_ptr) {
521 kfree(p->thread.io_bitmap_ptr); 536 kfree(p->thread.io_bitmap_ptr);
522 p->thread.io_bitmap_max = 0; 537 p->thread.io_bitmap_max = 0;
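
The open-coded descriptor setup for CLONE_SETTLS is replaced by a call to do_set_thread_area(), the helper that also backs the set_thread_area() system call (its old inline form, get_free_idx() and all, is removed further down in this file). For context, a sketch of how user space drives the same interface, assuming a 32-bit x86 process and the raw syscall since glibc has no wrapper; entry_number = -1 asks the kernel to pick a free TLS slot.

    #include <asm/ldt.h>            /* struct user_desc */
    #include <stdio.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
            static char tls_block[256];
            struct user_desc ud;

            memset(&ud, 0, sizeof(ud));
            ud.entry_number = -1;                   /* kernel picks a free GDT TLS slot */
            ud.base_addr = (unsigned long)tls_block;
            ud.limit = sizeof(tls_block) - 1;
            ud.seg_32bit = 1;
            ud.useable = 1;

            if (syscall(SYS_set_thread_area, &ud) != 0) {
                    perror("set_thread_area");
                    return 1;
            }
            printf("TLS descriptor installed in entry %u\n", ud.entry_number);
            return 0;
    }
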
@@ -529,62 +544,52 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
529 */ 544 */
530void dump_thread(struct pt_regs * regs, struct user * dump) 545void dump_thread(struct pt_regs * regs, struct user * dump)
531{ 546{
532 int i; 547 u16 gs;
533 548
534/* changed the size calculations - should hopefully work better. lbt */ 549/* changed the size calculations - should hopefully work better. lbt */
535 dump->magic = CMAGIC; 550 dump->magic = CMAGIC;
536 dump->start_code = 0; 551 dump->start_code = 0;
537 dump->start_stack = regs->esp & ~(PAGE_SIZE - 1); 552 dump->start_stack = regs->sp & ~(PAGE_SIZE - 1);
538 dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; 553 dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
539 dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; 554 dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
540 dump->u_dsize -= dump->u_tsize; 555 dump->u_dsize -= dump->u_tsize;
541 dump->u_ssize = 0; 556 dump->u_ssize = 0;
542 for (i = 0; i < 8; i++) 557 dump->u_debugreg[0] = current->thread.debugreg0;
543 dump->u_debugreg[i] = current->thread.debugreg[i]; 558 dump->u_debugreg[1] = current->thread.debugreg1;
559 dump->u_debugreg[2] = current->thread.debugreg2;
560 dump->u_debugreg[3] = current->thread.debugreg3;
561 dump->u_debugreg[4] = 0;
562 dump->u_debugreg[5] = 0;
563 dump->u_debugreg[6] = current->thread.debugreg6;
564 dump->u_debugreg[7] = current->thread.debugreg7;
544 565
545 if (dump->start_stack < TASK_SIZE) 566 if (dump->start_stack < TASK_SIZE)
546 dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT; 567 dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
547 568
548 dump->regs.ebx = regs->ebx; 569 dump->regs.bx = regs->bx;
549 dump->regs.ecx = regs->ecx; 570 dump->regs.cx = regs->cx;
550 dump->regs.edx = regs->edx; 571 dump->regs.dx = regs->dx;
551 dump->regs.esi = regs->esi; 572 dump->regs.si = regs->si;
552 dump->regs.edi = regs->edi; 573 dump->regs.di = regs->di;
553 dump->regs.ebp = regs->ebp; 574 dump->regs.bp = regs->bp;
554 dump->regs.eax = regs->eax; 575 dump->regs.ax = regs->ax;
555 dump->regs.ds = regs->xds; 576 dump->regs.ds = (u16)regs->ds;
556 dump->regs.es = regs->xes; 577 dump->regs.es = (u16)regs->es;
557 dump->regs.fs = regs->xfs; 578 dump->regs.fs = (u16)regs->fs;
558 savesegment(gs,dump->regs.gs); 579 savesegment(gs,gs);
559 dump->regs.orig_eax = regs->orig_eax; 580 dump->regs.orig_ax = regs->orig_ax;
560 dump->regs.eip = regs->eip; 581 dump->regs.ip = regs->ip;
561 dump->regs.cs = regs->xcs; 582 dump->regs.cs = (u16)regs->cs;
562 dump->regs.eflags = regs->eflags; 583 dump->regs.flags = regs->flags;
563 dump->regs.esp = regs->esp; 584 dump->regs.sp = regs->sp;
564 dump->regs.ss = regs->xss; 585 dump->regs.ss = (u16)regs->ss;
565 586
566 dump->u_fpvalid = dump_fpu (regs, &dump->i387); 587 dump->u_fpvalid = dump_fpu (regs, &dump->i387);
567} 588}
568EXPORT_SYMBOL(dump_thread); 589EXPORT_SYMBOL(dump_thread);
569 590
570/*
571 * Capture the user space registers if the task is not running (in user space)
572 */
573int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
574{
575 struct pt_regs ptregs = *task_pt_regs(tsk);
576 ptregs.xcs &= 0xffff;
577 ptregs.xds &= 0xffff;
578 ptregs.xes &= 0xffff;
579 ptregs.xss &= 0xffff;
580
581 elf_core_copy_regs(regs, &ptregs);
582
583 return 1;
584}
585
586#ifdef CONFIG_SECCOMP 591#ifdef CONFIG_SECCOMP
587void hard_disable_TSC(void) 592static void hard_disable_TSC(void)
588{ 593{
589 write_cr4(read_cr4() | X86_CR4_TSD); 594 write_cr4(read_cr4() | X86_CR4_TSD);
590} 595}
@@ -599,7 +604,7 @@ void disable_TSC(void)
599 hard_disable_TSC(); 604 hard_disable_TSC();
600 preempt_enable(); 605 preempt_enable();
601} 606}
602void hard_enable_TSC(void) 607static void hard_enable_TSC(void)
603{ 608{
604 write_cr4(read_cr4() & ~X86_CR4_TSD); 609 write_cr4(read_cr4() & ~X86_CR4_TSD);
605} 610}
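
hard_disable_TSC()/hard_enable_TSC() toggle CR4.TSD, which makes RDTSC a privileged instruction; the surrounding seccomp code uses that to fault a sandboxed task that tries to read the TSC. Later kernels expose the same per-thread switch to user space as prctl(PR_SET_TSC); the sketch below assumes such a kernel and is not part of this patch.

    #include <stdio.h>
    #include <sys/prctl.h>

    static unsigned long long rdtsc(void)
    {
            unsigned int lo, hi;

            __asm__ volatile("rdtsc" : "=a"(lo), "=d"(hi));
            return ((unsigned long long)hi << 32) | lo;
    }

    int main(void)
    {
            printf("TSC while RDTSC is allowed: %llu\n", rdtsc());

            /* Ask for CR4.TSD behaviour: from now on RDTSC raises SIGSEGV. */
            if (prctl(PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0) != 0)
                    perror("prctl(PR_SET_TSC)");
            else
                    printf("RDTSC disabled; calling rdtsc() again would fault\n");
            return 0;
    }
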
@@ -609,18 +614,32 @@ static noinline void
609__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, 614__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
610 struct tss_struct *tss) 615 struct tss_struct *tss)
611{ 616{
612 struct thread_struct *next; 617 struct thread_struct *prev, *next;
618 unsigned long debugctl;
613 619
620 prev = &prev_p->thread;
614 next = &next_p->thread; 621 next = &next_p->thread;
615 622
623 debugctl = prev->debugctlmsr;
624 if (next->ds_area_msr != prev->ds_area_msr) {
625 /* we clear debugctl to make sure DS
626 * is not in use when we change it */
627 debugctl = 0;
628 wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
629 wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
630 }
631
632 if (next->debugctlmsr != debugctl)
633 wrmsr(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr, 0);
634
616 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { 635 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
617 set_debugreg(next->debugreg[0], 0); 636 set_debugreg(next->debugreg0, 0);
618 set_debugreg(next->debugreg[1], 1); 637 set_debugreg(next->debugreg1, 1);
619 set_debugreg(next->debugreg[2], 2); 638 set_debugreg(next->debugreg2, 2);
620 set_debugreg(next->debugreg[3], 3); 639 set_debugreg(next->debugreg3, 3);
621 /* no 4 and 5 */ 640 /* no 4 and 5 */
622 set_debugreg(next->debugreg[6], 6); 641 set_debugreg(next->debugreg6, 6);
623 set_debugreg(next->debugreg[7], 7); 642 set_debugreg(next->debugreg7, 7);
624 } 643 }
625 644
626#ifdef CONFIG_SECCOMP 645#ifdef CONFIG_SECCOMP
@@ -634,6 +653,13 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
634 } 653 }
635#endif 654#endif
636 655
656 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
657 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
658
659 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
660 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
661
662
637 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { 663 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
638 /* 664 /*
639 * Disable the bitmap via an invalid offset. We still cache 665 * Disable the bitmap via an invalid offset. We still cache
@@ -687,11 +713,11 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
687 * More important, however, is the fact that this allows us much 713 * More important, however, is the fact that this allows us much
688 * more flexibility. 714 * more flexibility.
689 * 715 *
690 * The return value (in %eax) will be the "prev" task after 716 * The return value (in %ax) will be the "prev" task after
691 * the task-switch, and shows up in ret_from_fork in entry.S, 717 * the task-switch, and shows up in ret_from_fork in entry.S,
692 * for example. 718 * for example.
693 */ 719 */
694struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) 720struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
695{ 721{
696 struct thread_struct *prev = &prev_p->thread, 722 struct thread_struct *prev = &prev_p->thread,
697 *next = &next_p->thread; 723 *next = &next_p->thread;
@@ -710,7 +736,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
710 /* 736 /*
711 * Reload esp0. 737 * Reload esp0.
712 */ 738 */
713 load_esp0(tss, next); 739 load_sp0(tss, next);
714 740
715 /* 741 /*
716 * Save away %gs. No need to save %fs, as it was saved on the 742 * Save away %gs. No need to save %fs, as it was saved on the
@@ -774,7 +800,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
774 800
775asmlinkage int sys_fork(struct pt_regs regs) 801asmlinkage int sys_fork(struct pt_regs regs)
776{ 802{
777 return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL); 803 return do_fork(SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
778} 804}
779 805
780asmlinkage int sys_clone(struct pt_regs regs) 806asmlinkage int sys_clone(struct pt_regs regs)
@@ -783,12 +809,12 @@ asmlinkage int sys_clone(struct pt_regs regs)
783 unsigned long newsp; 809 unsigned long newsp;
784 int __user *parent_tidptr, *child_tidptr; 810 int __user *parent_tidptr, *child_tidptr;
785 811
786 clone_flags = regs.ebx; 812 clone_flags = regs.bx;
787 newsp = regs.ecx; 813 newsp = regs.cx;
788 parent_tidptr = (int __user *)regs.edx; 814 parent_tidptr = (int __user *)regs.dx;
789 child_tidptr = (int __user *)regs.edi; 815 child_tidptr = (int __user *)regs.di;
790 if (!newsp) 816 if (!newsp)
791 newsp = regs.esp; 817 newsp = regs.sp;
792 return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr); 818 return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
793} 819}
794 820
@@ -804,7 +830,7 @@ asmlinkage int sys_clone(struct pt_regs regs)
804 */ 830 */
805asmlinkage int sys_vfork(struct pt_regs regs) 831asmlinkage int sys_vfork(struct pt_regs regs)
806{ 832{
807 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL); 833 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
808} 834}
809 835
810/* 836/*
@@ -815,18 +841,15 @@ asmlinkage int sys_execve(struct pt_regs regs)
815 int error; 841 int error;
816 char * filename; 842 char * filename;
817 843
818 filename = getname((char __user *) regs.ebx); 844 filename = getname((char __user *) regs.bx);
819 error = PTR_ERR(filename); 845 error = PTR_ERR(filename);
820 if (IS_ERR(filename)) 846 if (IS_ERR(filename))
821 goto out; 847 goto out;
822 error = do_execve(filename, 848 error = do_execve(filename,
823 (char __user * __user *) regs.ecx, 849 (char __user * __user *) regs.cx,
824 (char __user * __user *) regs.edx, 850 (char __user * __user *) regs.dx,
825 &regs); 851 &regs);
826 if (error == 0) { 852 if (error == 0) {
827 task_lock(current);
828 current->ptrace &= ~PT_DTRACE;
829 task_unlock(current);
830 /* Make sure we don't return using sysenter.. */ 853 /* Make sure we don't return using sysenter.. */
831 set_thread_flag(TIF_IRET); 854 set_thread_flag(TIF_IRET);
832 } 855 }
@@ -840,145 +863,37 @@ out:
840 863
841unsigned long get_wchan(struct task_struct *p) 864unsigned long get_wchan(struct task_struct *p)
842{ 865{
843 unsigned long ebp, esp, eip; 866 unsigned long bp, sp, ip;
844 unsigned long stack_page; 867 unsigned long stack_page;
845 int count = 0; 868 int count = 0;
846 if (!p || p == current || p->state == TASK_RUNNING) 869 if (!p || p == current || p->state == TASK_RUNNING)
847 return 0; 870 return 0;
848 stack_page = (unsigned long)task_stack_page(p); 871 stack_page = (unsigned long)task_stack_page(p);
849 esp = p->thread.esp; 872 sp = p->thread.sp;
850 if (!stack_page || esp < stack_page || esp > top_esp+stack_page) 873 if (!stack_page || sp < stack_page || sp > top_esp+stack_page)
851 return 0; 874 return 0;
852 /* include/asm-i386/system.h:switch_to() pushes ebp last. */ 875 /* include/asm-i386/system.h:switch_to() pushes bp last. */
853 ebp = *(unsigned long *) esp; 876 bp = *(unsigned long *) sp;
854 do { 877 do {
855 if (ebp < stack_page || ebp > top_ebp+stack_page) 878 if (bp < stack_page || bp > top_ebp+stack_page)
856 return 0; 879 return 0;
857 eip = *(unsigned long *) (ebp+4); 880 ip = *(unsigned long *) (bp+4);
858 if (!in_sched_functions(eip)) 881 if (!in_sched_functions(ip))
859 return eip; 882 return ip;
860 ebp = *(unsigned long *) ebp; 883 bp = *(unsigned long *) bp;
861 } while (count++ < 16); 884 } while (count++ < 16);
862 return 0; 885 return 0;
863} 886}
864 887
865/*
866 * sys_alloc_thread_area: get a yet unused TLS descriptor index.
867 */
868static int get_free_idx(void)
869{
870 struct thread_struct *t = &current->thread;
871 int idx;
872
873 for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
874 if (desc_empty(t->tls_array + idx))
875 return idx + GDT_ENTRY_TLS_MIN;
876 return -ESRCH;
877}
878
879/*
880 * Set a given TLS descriptor:
881 */
882asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
883{
884 struct thread_struct *t = &current->thread;
885 struct user_desc info;
886 struct desc_struct *desc;
887 int cpu, idx;
888
889 if (copy_from_user(&info, u_info, sizeof(info)))
890 return -EFAULT;
891 idx = info.entry_number;
892
893 /*
894 * index -1 means the kernel should try to find and
895 * allocate an empty descriptor:
896 */
897 if (idx == -1) {
898 idx = get_free_idx();
899 if (idx < 0)
900 return idx;
901 if (put_user(idx, &u_info->entry_number))
902 return -EFAULT;
903 }
904
905 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
906 return -EINVAL;
907
908 desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;
909
910 /*
911 * We must not get preempted while modifying the TLS.
912 */
913 cpu = get_cpu();
914
915 if (LDT_empty(&info)) {
916 desc->a = 0;
917 desc->b = 0;
918 } else {
919 desc->a = LDT_entry_a(&info);
920 desc->b = LDT_entry_b(&info);
921 }
922 load_TLS(t, cpu);
923
924 put_cpu();
925
926 return 0;
927}
928
929/*
930 * Get the current Thread-Local Storage area:
931 */
932
933#define GET_BASE(desc) ( \
934 (((desc)->a >> 16) & 0x0000ffff) | \
935 (((desc)->b << 16) & 0x00ff0000) | \
936 ( (desc)->b & 0xff000000) )
937
938#define GET_LIMIT(desc) ( \
939 ((desc)->a & 0x0ffff) | \
940 ((desc)->b & 0xf0000) )
941
942#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
943#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
944#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
945#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
946#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
947#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
948
949asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
950{
951 struct user_desc info;
952 struct desc_struct *desc;
953 int idx;
954
955 if (get_user(idx, &u_info->entry_number))
956 return -EFAULT;
957 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
958 return -EINVAL;
959
960 memset(&info, 0, sizeof(info));
961
962 desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
963
964 info.entry_number = idx;
965 info.base_addr = GET_BASE(desc);
966 info.limit = GET_LIMIT(desc);
967 info.seg_32bit = GET_32BIT(desc);
968 info.contents = GET_CONTENTS(desc);
969 info.read_exec_only = !GET_WRITABLE(desc);
970 info.limit_in_pages = GET_LIMIT_PAGES(desc);
971 info.seg_not_present = !GET_PRESENT(desc);
972 info.useable = GET_USEABLE(desc);
973
974 if (copy_to_user(u_info, &info, sizeof(info)))
975 return -EFAULT;
976 return 0;
977}
978
979unsigned long arch_align_stack(unsigned long sp) 888unsigned long arch_align_stack(unsigned long sp)
980{ 889{
981 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) 890 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
982 sp -= get_random_int() % 8192; 891 sp -= get_random_int() % 8192;
983 return sp & ~0xf; 892 return sp & ~0xf;
984} 893}
894
895unsigned long arch_randomize_brk(struct mm_struct *mm)
896{
897 unsigned long range_end = mm->brk + 0x02000000;
898 return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
899}
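
get_wchan() above is a bounded frame-pointer walk: on 32-bit x86 each frame keeps the caller's %ebp at *bp and the return address at bp+4 (bp+8 in the 64-bit version later in this diff), and the loop gives up after 16 frames or when bp leaves the stack page. arch_randomize_brk(), also added here, simply picks a random brk within 32 MB (0x02000000) above the current one. A user-space sketch of the same frame walk, assuming the binary is built with frame pointers (-fno-omit-frame-pointer) and GCC's __builtin_frame_address(); the bounds checks are simplified to "callers live at higher addresses".

    #include <stdio.h>

    static void __attribute__((noinline)) show_callers(void)
    {
            unsigned long *bp = __builtin_frame_address(0);
            int count = 0;

            while (bp && count++ < 16) {
                    unsigned long ret = bp[1];                     /* return address */
                    unsigned long *next = (unsigned long *)bp[0];  /* caller's bp    */

                    printf("frame %2d: return address %#lx\n", count, ret);
                    if (next <= bp)         /* stack grows down, callers sit higher */
                            break;
                    bp = next;
            }
    }

    static void __attribute__((noinline)) level2(void) { show_callers(); }
    static void __attribute__((noinline)) level1(void) { level2(); }

    int main(void)
    {
            level1();
            return 0;
    }
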
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ab79e1dfa023..137a86171c39 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Pentium III FXSR, SSE support 4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000 5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 * 6 *
7 * X86-64 port 7 * X86-64 port
8 * Andi Kleen. 8 * Andi Kleen.
9 * 9 *
@@ -19,19 +19,19 @@
19#include <linux/cpu.h> 19#include <linux/cpu.h>
20#include <linux/errno.h> 20#include <linux/errno.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/fs.h>
22#include <linux/kernel.h> 23#include <linux/kernel.h>
23#include <linux/mm.h> 24#include <linux/mm.h>
24#include <linux/fs.h>
25#include <linux/elfcore.h> 25#include <linux/elfcore.h>
26#include <linux/smp.h> 26#include <linux/smp.h>
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/user.h> 28#include <linux/user.h>
29#include <linux/module.h>
30#include <linux/a.out.h> 29#include <linux/a.out.h>
31#include <linux/interrupt.h> 30#include <linux/interrupt.h>
31#include <linux/utsname.h>
32#include <linux/delay.h> 32#include <linux/delay.h>
33#include <linux/module.h>
33#include <linux/ptrace.h> 34#include <linux/ptrace.h>
34#include <linux/utsname.h>
35#include <linux/random.h> 35#include <linux/random.h>
36#include <linux/notifier.h> 36#include <linux/notifier.h>
37#include <linux/kprobes.h> 37#include <linux/kprobes.h>
@@ -72,13 +72,6 @@ void idle_notifier_register(struct notifier_block *n)
72{ 72{
73 atomic_notifier_chain_register(&idle_notifier, n); 73 atomic_notifier_chain_register(&idle_notifier, n);
74} 74}
75EXPORT_SYMBOL_GPL(idle_notifier_register);
76
77void idle_notifier_unregister(struct notifier_block *n)
78{
79 atomic_notifier_chain_unregister(&idle_notifier, n);
80}
81EXPORT_SYMBOL(idle_notifier_unregister);
82 75
83void enter_idle(void) 76void enter_idle(void)
84{ 77{
@@ -106,7 +99,7 @@ void exit_idle(void)
106 * We use this if we don't have any better 99 * We use this if we don't have any better
107 * idle routine.. 100 * idle routine..
108 */ 101 */
109static void default_idle(void) 102void default_idle(void)
110{ 103{
111 current_thread_info()->status &= ~TS_POLLING; 104 current_thread_info()->status &= ~TS_POLLING;
112 /* 105 /*
@@ -116,11 +109,18 @@ static void default_idle(void)
116 smp_mb(); 109 smp_mb();
117 local_irq_disable(); 110 local_irq_disable();
118 if (!need_resched()) { 111 if (!need_resched()) {
119 /* Enables interrupts one instruction before HLT. 112 ktime_t t0, t1;
120 x86 special cases this so there is no race. */ 113 u64 t0n, t1n;
121 safe_halt(); 114
122 } else 115 t0 = ktime_get();
123 local_irq_enable(); 116 t0n = ktime_to_ns(t0);
117 safe_halt(); /* enables interrupts racelessly */
118 local_irq_disable();
119 t1 = ktime_get();
120 t1n = ktime_to_ns(t1);
121 sched_clock_idle_wakeup_event(t1n - t0n);
122 }
123 local_irq_enable();
124 current_thread_info()->status |= TS_POLLING; 124 current_thread_info()->status |= TS_POLLING;
125} 125}
126 126
@@ -129,54 +129,12 @@ static void default_idle(void)
129 * to poll the ->need_resched flag instead of waiting for the 129 * to poll the ->need_resched flag instead of waiting for the
130 * cross-CPU IPI to arrive. Use this option with caution. 130 * cross-CPU IPI to arrive. Use this option with caution.
131 */ 131 */
132static void poll_idle (void) 132static void poll_idle(void)
133{ 133{
134 local_irq_enable(); 134 local_irq_enable();
135 cpu_relax(); 135 cpu_relax();
136} 136}
137 137
138static void do_nothing(void *unused)
139{
140}
141
142void cpu_idle_wait(void)
143{
144 unsigned int cpu, this_cpu = get_cpu();
145 cpumask_t map, tmp = current->cpus_allowed;
146
147 set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
148 put_cpu();
149
150 cpus_clear(map);
151 for_each_online_cpu(cpu) {
152 per_cpu(cpu_idle_state, cpu) = 1;
153 cpu_set(cpu, map);
154 }
155
156 __get_cpu_var(cpu_idle_state) = 0;
157
158 wmb();
159 do {
160 ssleep(1);
161 for_each_online_cpu(cpu) {
162 if (cpu_isset(cpu, map) &&
163 !per_cpu(cpu_idle_state, cpu))
164 cpu_clear(cpu, map);
165 }
166 cpus_and(map, map, cpu_online_map);
167 /*
168 * We waited 1 sec, if a CPU still did not call idle
169 * it may be because it is in idle and not waking up
170 * because it has nothing to do.
171 * Give all the remaining CPUS a kick.
172 */
173 smp_call_function_mask(map, do_nothing, 0, 0);
174 } while (!cpus_empty(map));
175
176 set_cpus_allowed(current, tmp);
177}
178EXPORT_SYMBOL_GPL(cpu_idle_wait);
179
180#ifdef CONFIG_HOTPLUG_CPU 138#ifdef CONFIG_HOTPLUG_CPU
181DECLARE_PER_CPU(int, cpu_state); 139DECLARE_PER_CPU(int, cpu_state);
182 140
@@ -207,19 +165,18 @@ static inline void play_dead(void)
207 * low exit latency (ie sit in a loop waiting for 165 * low exit latency (ie sit in a loop waiting for
208 * somebody to say that they'd like to reschedule) 166 * somebody to say that they'd like to reschedule)
209 */ 167 */
210void cpu_idle (void) 168void cpu_idle(void)
211{ 169{
212 current_thread_info()->status |= TS_POLLING; 170 current_thread_info()->status |= TS_POLLING;
213 /* endless idle loop with no priority at all */ 171 /* endless idle loop with no priority at all */
214 while (1) { 172 while (1) {
173 tick_nohz_stop_sched_tick();
215 while (!need_resched()) { 174 while (!need_resched()) {
216 void (*idle)(void); 175 void (*idle)(void);
217 176
218 if (__get_cpu_var(cpu_idle_state)) 177 if (__get_cpu_var(cpu_idle_state))
219 __get_cpu_var(cpu_idle_state) = 0; 178 __get_cpu_var(cpu_idle_state) = 0;
220 179
221 tick_nohz_stop_sched_tick();
222
223 rmb(); 180 rmb();
224 idle = pm_idle; 181 idle = pm_idle;
225 if (!idle) 182 if (!idle)
@@ -247,6 +204,47 @@ void cpu_idle (void)
247 } 204 }
248} 205}
249 206
207static void do_nothing(void *unused)
208{
209}
210
211void cpu_idle_wait(void)
212{
213 unsigned int cpu, this_cpu = get_cpu();
214 cpumask_t map, tmp = current->cpus_allowed;
215
216 set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
217 put_cpu();
218
219 cpus_clear(map);
220 for_each_online_cpu(cpu) {
221 per_cpu(cpu_idle_state, cpu) = 1;
222 cpu_set(cpu, map);
223 }
224
225 __get_cpu_var(cpu_idle_state) = 0;
226
227 wmb();
228 do {
229 ssleep(1);
230 for_each_online_cpu(cpu) {
231 if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
232 cpu_clear(cpu, map);
233 }
234 cpus_and(map, map, cpu_online_map);
235 /*
236 * We waited 1 sec, if a CPU still did not call idle
237 * it may be because it is in idle and not waking up
238 * because it has nothing to do.
239 * Give all the remaining CPUS a kick.
240 */
241 smp_call_function_mask(map, do_nothing, 0, 0);
242 } while (!cpus_empty(map));
243
244 set_cpus_allowed(current, tmp);
245}
246EXPORT_SYMBOL_GPL(cpu_idle_wait);
247
250/* 248/*
251 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, 249 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
252 * which can obviate IPI to trigger checking of need_resched. 250 * which can obviate IPI to trigger checking of need_resched.
@@ -257,13 +255,13 @@ void cpu_idle (void)
257 * New with Core Duo processors, MWAIT can take some hints based on CPU 255 * New with Core Duo processors, MWAIT can take some hints based on CPU
258 * capability. 256 * capability.
259 */ 257 */
260void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) 258void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
261{ 259{
262 if (!need_resched()) { 260 if (!need_resched()) {
263 __monitor((void *)&current_thread_info()->flags, 0, 0); 261 __monitor((void *)&current_thread_info()->flags, 0, 0);
264 smp_mb(); 262 smp_mb();
265 if (!need_resched()) 263 if (!need_resched())
266 __mwait(eax, ecx); 264 __mwait(ax, cx);
267 } 265 }
268} 266}
269 267
@@ -282,25 +280,41 @@ static void mwait_idle(void)
282 } 280 }
283} 281}
284 282
283
284static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
285{
286 if (force_mwait)
287 return 1;
288 /* Any C1 states supported? */
289 return c->cpuid_level >= 5 && ((cpuid_edx(5) >> 4) & 0xf) > 0;
290}
291
285void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) 292void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
286{ 293{
287 static int printed; 294 static int selected;
288 if (cpu_has(c, X86_FEATURE_MWAIT)) { 295
296 if (selected)
297 return;
298#ifdef CONFIG_X86_SMP
299 if (pm_idle == poll_idle && smp_num_siblings > 1) {
300 printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
301 " performance may degrade.\n");
302 }
303#endif
304 if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
289 /* 305 /*
290 * Skip, if setup has overridden idle. 306 * Skip, if setup has overridden idle.
291 * One CPU supports mwait => All CPUs supports mwait 307 * One CPU supports mwait => All CPUs supports mwait
292 */ 308 */
293 if (!pm_idle) { 309 if (!pm_idle) {
294 if (!printed) { 310 printk(KERN_INFO "using mwait in idle threads.\n");
295 printk(KERN_INFO "using mwait in idle threads.\n");
296 printed = 1;
297 }
298 pm_idle = mwait_idle; 311 pm_idle = mwait_idle;
299 } 312 }
300 } 313 }
314 selected = 1;
301} 315}
302 316
303static int __init idle_setup (char *str) 317static int __init idle_setup(char *str)
304{ 318{
305 if (!strcmp(str, "poll")) { 319 if (!strcmp(str, "poll")) {
306 printk("using polling idle threads.\n"); 320 printk("using polling idle threads.\n");
@@ -315,13 +329,13 @@ static int __init idle_setup (char *str)
315} 329}
316early_param("idle", idle_setup); 330early_param("idle", idle_setup);
317 331
318/* Prints also some state that isn't saved in the pt_regs */ 332/* Prints also some state that isn't saved in the pt_regs */
319void __show_regs(struct pt_regs * regs) 333void __show_regs(struct pt_regs * regs)
320{ 334{
321 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; 335 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
322 unsigned long d0, d1, d2, d3, d6, d7; 336 unsigned long d0, d1, d2, d3, d6, d7;
323 unsigned int fsindex,gsindex; 337 unsigned int fsindex, gsindex;
324 unsigned int ds,cs,es; 338 unsigned int ds, cs, es;
325 339
326 printk("\n"); 340 printk("\n");
327 print_modules(); 341 print_modules();
@@ -330,16 +344,16 @@ void __show_regs(struct pt_regs * regs)
330 init_utsname()->release, 344 init_utsname()->release,
331 (int)strcspn(init_utsname()->version, " "), 345 (int)strcspn(init_utsname()->version, " "),
332 init_utsname()->version); 346 init_utsname()->version);
333 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); 347 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
334 printk_address(regs->rip); 348 printk_address(regs->ip, 1);
335 printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, 349 printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp,
336 regs->eflags); 350 regs->flags);
337 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", 351 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
338 regs->rax, regs->rbx, regs->rcx); 352 regs->ax, regs->bx, regs->cx);
339 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", 353 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
340 regs->rdx, regs->rsi, regs->rdi); 354 regs->dx, regs->si, regs->di);
341 printk("RBP: %016lx R08: %016lx R09: %016lx\n", 355 printk("RBP: %016lx R08: %016lx R09: %016lx\n",
342 regs->rbp, regs->r8, regs->r9); 356 regs->bp, regs->r8, regs->r9);
343 printk("R10: %016lx R11: %016lx R12: %016lx\n", 357 printk("R10: %016lx R11: %016lx R12: %016lx\n",
344 regs->r10, regs->r11, regs->r12); 358 regs->r10, regs->r11, regs->r12);
345 printk("R13: %016lx R14: %016lx R15: %016lx\n", 359 printk("R13: %016lx R14: %016lx R15: %016lx\n",
@@ -379,7 +393,7 @@ void show_regs(struct pt_regs *regs)
379{ 393{
380 printk("CPU %d:", smp_processor_id()); 394 printk("CPU %d:", smp_processor_id());
381 __show_regs(regs); 395 __show_regs(regs);
382 show_trace(NULL, regs, (void *)(regs + 1)); 396 show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
383} 397}
384 398
385/* 399/*
@@ -390,7 +404,7 @@ void exit_thread(void)
390 struct task_struct *me = current; 404 struct task_struct *me = current;
391 struct thread_struct *t = &me->thread; 405 struct thread_struct *t = &me->thread;
392 406
393 if (me->thread.io_bitmap_ptr) { 407 if (me->thread.io_bitmap_ptr) {
394 struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); 408 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
395 409
396 kfree(t->io_bitmap_ptr); 410 kfree(t->io_bitmap_ptr);
@@ -426,7 +440,7 @@ void flush_thread(void)
426 tsk->thread.debugreg3 = 0; 440 tsk->thread.debugreg3 = 0;
427 tsk->thread.debugreg6 = 0; 441 tsk->thread.debugreg6 = 0;
428 tsk->thread.debugreg7 = 0; 442 tsk->thread.debugreg7 = 0;
429 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); 443 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
430 /* 444 /*
431 * Forget coprocessor state.. 445 * Forget coprocessor state..
432 */ 446 */
@@ -449,26 +463,21 @@ void release_thread(struct task_struct *dead_task)
449 463
450static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) 464static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
451{ 465{
452 struct user_desc ud = { 466 struct user_desc ud = {
453 .base_addr = addr, 467 .base_addr = addr,
454 .limit = 0xfffff, 468 .limit = 0xfffff,
455 .seg_32bit = 1, 469 .seg_32bit = 1,
456 .limit_in_pages = 1, 470 .limit_in_pages = 1,
457 .useable = 1, 471 .useable = 1,
458 }; 472 };
459 struct n_desc_struct *desc = (void *)t->thread.tls_array; 473 struct desc_struct *desc = t->thread.tls_array;
460 desc += tls; 474 desc += tls;
461 desc->a = LDT_entry_a(&ud); 475 fill_ldt(desc, &ud);
462 desc->b = LDT_entry_b(&ud);
463} 476}
464 477
465static inline u32 read_32bit_tls(struct task_struct *t, int tls) 478static inline u32 read_32bit_tls(struct task_struct *t, int tls)
466{ 479{
467 struct desc_struct *desc = (void *)t->thread.tls_array; 480 return get_desc_base(&t->thread.tls_array[tls]);
468 desc += tls;
469 return desc->base0 |
470 (((u32)desc->base1) << 16) |
471 (((u32)desc->base2) << 24);
472} 481}
473 482
474/* 483/*
@@ -480,7 +489,7 @@ void prepare_to_copy(struct task_struct *tsk)
480 unlazy_fpu(tsk); 489 unlazy_fpu(tsk);
481} 490}
482 491
483int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, 492int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
484 unsigned long unused, 493 unsigned long unused,
485 struct task_struct * p, struct pt_regs * regs) 494 struct task_struct * p, struct pt_regs * regs)
486{ 495{
@@ -492,14 +501,14 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
492 (THREAD_SIZE + task_stack_page(p))) - 1; 501 (THREAD_SIZE + task_stack_page(p))) - 1;
493 *childregs = *regs; 502 *childregs = *regs;
494 503
495 childregs->rax = 0; 504 childregs->ax = 0;
496 childregs->rsp = rsp; 505 childregs->sp = sp;
497 if (rsp == ~0UL) 506 if (sp == ~0UL)
498 childregs->rsp = (unsigned long)childregs; 507 childregs->sp = (unsigned long)childregs;
499 508
500 p->thread.rsp = (unsigned long) childregs; 509 p->thread.sp = (unsigned long) childregs;
501 p->thread.rsp0 = (unsigned long) (childregs+1); 510 p->thread.sp0 = (unsigned long) (childregs+1);
502 p->thread.userrsp = me->thread.userrsp; 511 p->thread.usersp = me->thread.usersp;
503 512
504 set_tsk_thread_flag(p, TIF_FORK); 513 set_tsk_thread_flag(p, TIF_FORK);
505 514
@@ -520,7 +529,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
520 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, 529 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
521 IO_BITMAP_BYTES); 530 IO_BITMAP_BYTES);
522 set_tsk_thread_flag(p, TIF_IO_BITMAP); 531 set_tsk_thread_flag(p, TIF_IO_BITMAP);
523 } 532 }
524 533
525 /* 534 /*
526 * Set a new TLS for the child thread? 535 * Set a new TLS for the child thread?
@@ -528,7 +537,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
528 if (clone_flags & CLONE_SETTLS) { 537 if (clone_flags & CLONE_SETTLS) {
529#ifdef CONFIG_IA32_EMULATION 538#ifdef CONFIG_IA32_EMULATION
530 if (test_thread_flag(TIF_IA32)) 539 if (test_thread_flag(TIF_IA32))
531 err = ia32_child_tls(p, childregs); 540 err = do_set_thread_area(p, -1,
541 (struct user_desc __user *)childregs->si, 0);
532 else 542 else
533#endif 543#endif
534 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); 544 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
@@ -547,17 +557,30 @@ out:
547/* 557/*
548 * This special macro can be used to load a debugging register 558 * This special macro can be used to load a debugging register
549 */ 559 */
550#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r) 560#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
551 561
552static inline void __switch_to_xtra(struct task_struct *prev_p, 562static inline void __switch_to_xtra(struct task_struct *prev_p,
553 struct task_struct *next_p, 563 struct task_struct *next_p,
554 struct tss_struct *tss) 564 struct tss_struct *tss)
555{ 565{
556 struct thread_struct *prev, *next; 566 struct thread_struct *prev, *next;
567 unsigned long debugctl;
557 568
558 prev = &prev_p->thread, 569 prev = &prev_p->thread,
559 next = &next_p->thread; 570 next = &next_p->thread;
560 571
572 debugctl = prev->debugctlmsr;
573 if (next->ds_area_msr != prev->ds_area_msr) {
574 /* we clear debugctl to make sure DS
575 * is not in use when we change it */
576 debugctl = 0;
577 wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
578 wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
579 }
580
581 if (next->debugctlmsr != debugctl)
582 wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr);
583
561 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { 584 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
562 loaddebug(next, 0); 585 loaddebug(next, 0);
563 loaddebug(next, 1); 586 loaddebug(next, 1);
@@ -581,12 +604,18 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
581 */ 604 */
582 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); 605 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
583 } 606 }
607
608 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
609 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
610
611 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
612 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
584} 613}
585 614
586/* 615/*
587 * switch_to(x,y) should switch tasks from x to y. 616 * switch_to(x,y) should switch tasks from x to y.
588 * 617 *
589 * This could still be optimized: 618 * This could still be optimized:
590 * - fold all the options into a flag word and test it with a single test. 619 * - fold all the options into a flag word and test it with a single test.
591 * - could test fs/gs bitsliced 620 * - could test fs/gs bitsliced
592 * 621 *
@@ -597,7 +626,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
597{ 626{
598 struct thread_struct *prev = &prev_p->thread, 627 struct thread_struct *prev = &prev_p->thread,
599 *next = &next_p->thread; 628 *next = &next_p->thread;
600 int cpu = smp_processor_id(); 629 int cpu = smp_processor_id();
601 struct tss_struct *tss = &per_cpu(init_tss, cpu); 630 struct tss_struct *tss = &per_cpu(init_tss, cpu);
602 631
603 /* we're going to use this soon, after a few expensive things */ 632 /* we're going to use this soon, after a few expensive things */
@@ -607,7 +636,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
607 /* 636 /*
608 * Reload esp0, LDT and the page table pointer: 637 * Reload esp0, LDT and the page table pointer:
609 */ 638 */
610 tss->rsp0 = next->rsp0; 639 load_sp0(tss, next);
611 640
612 /* 641 /*
613 * Switch DS and ES. 642 * Switch DS and ES.
@@ -666,8 +695,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
666 /* 695 /*
667 * Switch the PDA and FPU contexts. 696 * Switch the PDA and FPU contexts.
668 */ 697 */
669 prev->userrsp = read_pda(oldrsp); 698 prev->usersp = read_pda(oldrsp);
670 write_pda(oldrsp, next->userrsp); 699 write_pda(oldrsp, next->usersp);
671 write_pda(pcurrent, next_p); 700 write_pda(pcurrent, next_p);
672 701
673 write_pda(kernelstack, 702 write_pda(kernelstack,
@@ -684,8 +713,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
684 /* 713 /*
685 * Now maybe reload the debug registers and handle I/O bitmaps 714 * Now maybe reload the debug registers and handle I/O bitmaps
686 */ 715 */
687 if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)) 716 if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
688 || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) 717 task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
689 __switch_to_xtra(prev_p, next_p, tss); 718 __switch_to_xtra(prev_p, next_p, tss);
690 719
691 /* If the task has used fpu the last 5 timeslices, just do a full 720 /* If the task has used fpu the last 5 timeslices, just do a full
@@ -700,7 +729,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
700/* 729/*
701 * sys_execve() executes a new program. 730 * sys_execve() executes a new program.
702 */ 731 */
703asmlinkage 732asmlinkage
704long sys_execve(char __user *name, char __user * __user *argv, 733long sys_execve(char __user *name, char __user * __user *argv,
705 char __user * __user *envp, struct pt_regs regs) 734 char __user * __user *envp, struct pt_regs regs)
706{ 735{
@@ -712,11 +741,6 @@ long sys_execve(char __user *name, char __user * __user *argv,
712 if (IS_ERR(filename)) 741 if (IS_ERR(filename))
713 return error; 742 return error;
714 error = do_execve(filename, argv, envp, &regs); 743 error = do_execve(filename, argv, envp, &regs);
715 if (error == 0) {
716 task_lock(current);
717 current->ptrace &= ~PT_DTRACE;
718 task_unlock(current);
719 }
720 putname(filename); 744 putname(filename);
721 return error; 745 return error;
722} 746}
@@ -726,18 +750,18 @@ void set_personality_64bit(void)
726 /* inherit personality from parent */ 750 /* inherit personality from parent */
727 751
728 /* Make sure to be in 64bit mode */ 752 /* Make sure to be in 64bit mode */
729 clear_thread_flag(TIF_IA32); 753 clear_thread_flag(TIF_IA32);
730 754
731 /* TBD: overwrites user setup. Should have two bits. 755 /* TBD: overwrites user setup. Should have two bits.
732 But 64bit processes have always behaved this way, 756 But 64bit processes have always behaved this way,
733 so it's not too bad. The main problem is just that 757 so it's not too bad. The main problem is just that
734 32bit childs are affected again. */ 758 32bit childs are affected again. */
735 current->personality &= ~READ_IMPLIES_EXEC; 759 current->personality &= ~READ_IMPLIES_EXEC;
736} 760}
737 761
738asmlinkage long sys_fork(struct pt_regs *regs) 762asmlinkage long sys_fork(struct pt_regs *regs)
739{ 763{
740 return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL); 764 return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
741} 765}
742 766
743asmlinkage long 767asmlinkage long
@@ -745,7 +769,7 @@ sys_clone(unsigned long clone_flags, unsigned long newsp,
745 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) 769 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
746{ 770{
747 if (!newsp) 771 if (!newsp)
748 newsp = regs->rsp; 772 newsp = regs->sp;
749 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); 773 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
750} 774}
751 775
@@ -761,29 +785,29 @@ sys_clone(unsigned long clone_flags, unsigned long newsp,
761 */ 785 */
762asmlinkage long sys_vfork(struct pt_regs *regs) 786asmlinkage long sys_vfork(struct pt_regs *regs)
763{ 787{
764 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0, 788 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
765 NULL, NULL); 789 NULL, NULL);
766} 790}
767 791
768unsigned long get_wchan(struct task_struct *p) 792unsigned long get_wchan(struct task_struct *p)
769{ 793{
770 unsigned long stack; 794 unsigned long stack;
771 u64 fp,rip; 795 u64 fp,ip;
772 int count = 0; 796 int count = 0;
773 797
774 if (!p || p == current || p->state==TASK_RUNNING) 798 if (!p || p == current || p->state==TASK_RUNNING)
775 return 0; 799 return 0;
776 stack = (unsigned long)task_stack_page(p); 800 stack = (unsigned long)task_stack_page(p);
777 if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE) 801 if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
778 return 0; 802 return 0;
779 fp = *(u64 *)(p->thread.rsp); 803 fp = *(u64 *)(p->thread.sp);
780 do { 804 do {
781 if (fp < (unsigned long)stack || 805 if (fp < (unsigned long)stack ||
782 fp > (unsigned long)stack+THREAD_SIZE) 806 fp > (unsigned long)stack+THREAD_SIZE)
783 return 0; 807 return 0;
784 rip = *(u64 *)(fp+8); 808 ip = *(u64 *)(fp+8);
785 if (!in_sched_functions(rip)) 809 if (!in_sched_functions(ip))
786 return rip; 810 return ip;
787 fp = *(u64 *)fp; 811 fp = *(u64 *)fp;
788 } while (count++ < 16); 812 } while (count++ < 16);
789 return 0; 813 return 0;
@@ -824,19 +848,19 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
824 /* Not strictly needed for fs, but do it for symmetry 848 /* Not strictly needed for fs, but do it for symmetry
825 with gs */ 849 with gs */
826 if (addr >= TASK_SIZE_OF(task)) 850 if (addr >= TASK_SIZE_OF(task))
827 return -EPERM; 851 return -EPERM;
828 cpu = get_cpu(); 852 cpu = get_cpu();
829 /* handle small bases via the GDT because that's faster to 853 /* handle small bases via the GDT because that's faster to
830 switch. */ 854 switch. */
831 if (addr <= 0xffffffff) { 855 if (addr <= 0xffffffff) {
832 set_32bit_tls(task, FS_TLS, addr); 856 set_32bit_tls(task, FS_TLS, addr);
833 if (doit) { 857 if (doit) {
834 load_TLS(&task->thread, cpu); 858 load_TLS(&task->thread, cpu);
835 asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL)); 859 asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
836 } 860 }
837 task->thread.fsindex = FS_TLS_SEL; 861 task->thread.fsindex = FS_TLS_SEL;
838 task->thread.fs = 0; 862 task->thread.fs = 0;
839 } else { 863 } else {
840 task->thread.fsindex = 0; 864 task->thread.fsindex = 0;
841 task->thread.fs = addr; 865 task->thread.fs = addr;
842 if (doit) { 866 if (doit) {
@@ -848,24 +872,24 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
848 } 872 }
849 put_cpu(); 873 put_cpu();
850 break; 874 break;
851 case ARCH_GET_FS: { 875 case ARCH_GET_FS: {
852 unsigned long base; 876 unsigned long base;
853 if (task->thread.fsindex == FS_TLS_SEL) 877 if (task->thread.fsindex == FS_TLS_SEL)
854 base = read_32bit_tls(task, FS_TLS); 878 base = read_32bit_tls(task, FS_TLS);
855 else if (doit) 879 else if (doit)
856 rdmsrl(MSR_FS_BASE, base); 880 rdmsrl(MSR_FS_BASE, base);
857 else 881 else
858 base = task->thread.fs; 882 base = task->thread.fs;
859 ret = put_user(base, (unsigned long __user *)addr); 883 ret = put_user(base, (unsigned long __user *)addr);
860 break; 884 break;
861 } 885 }
862 case ARCH_GET_GS: { 886 case ARCH_GET_GS: {
863 unsigned long base; 887 unsigned long base;
864 unsigned gsindex; 888 unsigned gsindex;
865 if (task->thread.gsindex == GS_TLS_SEL) 889 if (task->thread.gsindex == GS_TLS_SEL)
866 base = read_32bit_tls(task, GS_TLS); 890 base = read_32bit_tls(task, GS_TLS);
867 else if (doit) { 891 else if (doit) {
868 asm("movl %%gs,%0" : "=r" (gsindex)); 892 asm("movl %%gs,%0" : "=r" (gsindex));
869 if (gsindex) 893 if (gsindex)
870 rdmsrl(MSR_KERNEL_GS_BASE, base); 894 rdmsrl(MSR_KERNEL_GS_BASE, base);
871 else 895 else
@@ -873,39 +897,21 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
873 } 897 }
874 else 898 else
875 base = task->thread.gs; 899 base = task->thread.gs;
876 ret = put_user(base, (unsigned long __user *)addr); 900 ret = put_user(base, (unsigned long __user *)addr);
877 break; 901 break;
878 } 902 }
879 903
880 default: 904 default:
881 ret = -EINVAL; 905 ret = -EINVAL;
882 break; 906 break;
883 } 907 }
884 908
885 return ret; 909 return ret;
886} 910}
887 911
888long sys_arch_prctl(int code, unsigned long addr) 912long sys_arch_prctl(int code, unsigned long addr)
889{ 913{
890 return do_arch_prctl(current, code, addr); 914 return do_arch_prctl(current, code, addr);
891}
892
893/*
894 * Capture the user space registers if the task is not running (in user space)
895 */
896int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
897{
898 struct pt_regs *pp, ptregs;
899
900 pp = task_pt_regs(tsk);
901
902 ptregs = *pp;
903 ptregs.cs &= 0xffff;
904 ptregs.ss &= 0xffff;
905
906 elf_core_copy_regs(regs, &ptregs);
907
908 return 1;
909} 915}
910 916
911unsigned long arch_align_stack(unsigned long sp) 917unsigned long arch_align_stack(unsigned long sp)
@@ -914,3 +920,9 @@ unsigned long arch_align_stack(unsigned long sp)
914 sp -= get_random_int() % 8192; 920 sp -= get_random_int() % 8192;
915 return sp & ~0xf; 921 return sp & ~0xf;
916} 922}
923
924unsigned long arch_randomize_brk(struct mm_struct *mm)
925{
926 unsigned long range_end = mm->brk + 0x02000000;
927 return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
928}
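
do_arch_prctl() above serves ARCH_SET_FS/ARCH_GET_FS and their GS counterparts, using a GDT TLS entry for bases that fit in 32 bits and MSR_FS_BASE/MSR_KERNEL_GS_BASE otherwise. User space reaches the same code through the arch_prctl() system call; a sketch assuming x86-64 and the raw syscall, since glibc traditionally ships no wrapper.

    #include <asm/prctl.h>          /* ARCH_GET_FS, ARCH_GET_GS */
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
            unsigned long fsbase = 0, gsbase = 0;

            if (syscall(SYS_arch_prctl, ARCH_GET_FS, &fsbase) != 0 ||
                syscall(SYS_arch_prctl, ARCH_GET_GS, &gsbase) != 0) {
                    perror("arch_prctl");
                    return 1;
            }

            /* On x86-64 glibc keeps the thread control block at the FS base. */
            printf("FS base: %#lx\nGS base: %#lx\n", fsbase, gsbase);
            return 0;
    }
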
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
new file mode 100644
index 000000000000..96286df1bb81
--- /dev/null
+++ b/arch/x86/kernel/ptrace.c
@@ -0,0 +1,1545 @@
1/* By Ross Biro 1/23/92 */
2/*
3 * Pentium III FXSR, SSE support
4 * Gareth Hughes <gareth@valinux.com>, May 2000
5 *
6 * BTS tracing
7 * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
8 */
9
10#include <linux/kernel.h>
11#include <linux/sched.h>
12#include <linux/mm.h>
13#include <linux/smp.h>
14#include <linux/errno.h>
15#include <linux/ptrace.h>
16#include <linux/regset.h>
17#include <linux/user.h>
18#include <linux/elf.h>
19#include <linux/security.h>
20#include <linux/audit.h>
21#include <linux/seccomp.h>
22#include <linux/signal.h>
23
24#include <asm/uaccess.h>
25#include <asm/pgtable.h>
26#include <asm/system.h>
27#include <asm/processor.h>
28#include <asm/i387.h>
29#include <asm/debugreg.h>
30#include <asm/ldt.h>
31#include <asm/desc.h>
32#include <asm/prctl.h>
33#include <asm/proto.h>
34#include <asm/ds.h>
35
36#include "tls.h"
37
38enum x86_regset {
39 REGSET_GENERAL,
40 REGSET_FP,
41 REGSET_XFP,
42 REGSET_TLS,
43};
44
45/*
46 * does not yet catch signals sent when the child dies.
47 * in exit.c or in signal.c.
48 */
49
50/*
51 * Determines which flags the user has access to [1 = access, 0 = no access].
52 */
53#define FLAG_MASK_32 ((unsigned long) \
54 (X86_EFLAGS_CF | X86_EFLAGS_PF | \
55 X86_EFLAGS_AF | X86_EFLAGS_ZF | \
56 X86_EFLAGS_SF | X86_EFLAGS_TF | \
57 X86_EFLAGS_DF | X86_EFLAGS_OF | \
58 X86_EFLAGS_RF | X86_EFLAGS_AC))
59
60/*
61 * Determines whether a value may be installed in a segment register.
62 */
63static inline bool invalid_selector(u16 value)
64{
65 return unlikely(value != 0 && (value & SEGMENT_RPL_MASK) != USER_RPL);
66}
67
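
invalid_selector() accepts either the null selector or one whose RPL bits request user privilege; anything else would let a ptrace caller plant a selector with a non-user privilege level in the traced task. The test reduces to two comparisons, sketched here with the numeric values written out since SEGMENT_RPL_MASK and USER_RPL come from kernel headers (USER_RPL is 3 on x86).

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* RPL is the low two bits of a selector; user code runs at privilege level 3. */
    static bool selector_ok(uint16_t value)
    {
            return value == 0 || (value & 0x3) == 0x3;
    }

    int main(void)
    {
            uint16_t samples[] = { 0x0000, 0x0023, 0x002b, 0x0010, 0x0018 };

            for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                    printf("selector %#06x: %s\n", samples[i],
                           selector_ok(samples[i]) ? "allowed" : "rejected");
            return 0;
    }
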
68#ifdef CONFIG_X86_32
69
70#define FLAG_MASK FLAG_MASK_32
71
72static long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
73{
74 BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0);
75 regno >>= 2;
76 if (regno > FS)
77 --regno;
78 return &regs->bx + regno;
79}
80
81static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
82{
83 /*
84 * Returning the value truncates it to 16 bits.
85 */
86 unsigned int retval;
87 if (offset != offsetof(struct user_regs_struct, gs))
88 retval = *pt_regs_access(task_pt_regs(task), offset);
89 else {
90 retval = task->thread.gs;
91 if (task == current)
92 savesegment(gs, retval);
93 }
94 return retval;
95}
96
97static int set_segment_reg(struct task_struct *task,
98 unsigned long offset, u16 value)
99{
100 /*
101 * The value argument was already truncated to 16 bits.
102 */
103 if (invalid_selector(value))
104 return -EIO;
105
106 if (offset != offsetof(struct user_regs_struct, gs))
107 *pt_regs_access(task_pt_regs(task), offset) = value;
108 else {
109 task->thread.gs = value;
110 if (task == current)
111 /*
112 * The user-mode %gs is not affected by
113 * kernel entry, so we must update the CPU.
114 */
115 loadsegment(gs, value);
116 }
117
118 return 0;
119}
120
121static unsigned long debugreg_addr_limit(struct task_struct *task)
122{
123 return TASK_SIZE - 3;
124}
125
126#else /* CONFIG_X86_64 */
127
128#define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT)
129
130static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long offset)
131{
132 BUILD_BUG_ON(offsetof(struct pt_regs, r15) != 0);
133 return &regs->r15 + (offset / sizeof(regs->r15));
134}
135
136static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
137{
138 /*
139 * Returning the value truncates it to 16 bits.
140 */
141 unsigned int seg;
142
143 switch (offset) {
144 case offsetof(struct user_regs_struct, fs):
145 if (task == current) {
146 /* Older gas can't assemble movq %?s,%r?? */
147 asm("movl %%fs,%0" : "=r" (seg));
148 return seg;
149 }
150 return task->thread.fsindex;
151 case offsetof(struct user_regs_struct, gs):
152 if (task == current) {
153 asm("movl %%gs,%0" : "=r" (seg));
154 return seg;
155 }
156 return task->thread.gsindex;
157 case offsetof(struct user_regs_struct, ds):
158 if (task == current) {
159 asm("movl %%ds,%0" : "=r" (seg));
160 return seg;
161 }
162 return task->thread.ds;
163 case offsetof(struct user_regs_struct, es):
164 if (task == current) {
165 asm("movl %%es,%0" : "=r" (seg));
166 return seg;
167 }
168 return task->thread.es;
169
170 case offsetof(struct user_regs_struct, cs):
171 case offsetof(struct user_regs_struct, ss):
172 break;
173 }
174 return *pt_regs_access(task_pt_regs(task), offset);
175}
176
177static int set_segment_reg(struct task_struct *task,
178 unsigned long offset, u16 value)
179{
180 /*
181 * The value argument was already truncated to 16 bits.
182 */
183 if (invalid_selector(value))
184 return -EIO;
185
186 switch (offset) {
187 case offsetof(struct user_regs_struct,fs):
188 /*
189 * If this is setting fs as for normal 64-bit use but
190 * setting fs_base has implicitly changed it, leave it.
191 */
192 if ((value == FS_TLS_SEL && task->thread.fsindex == 0 &&
193 task->thread.fs != 0) ||
194 (value == 0 && task->thread.fsindex == FS_TLS_SEL &&
195 task->thread.fs == 0))
196 break;
197 task->thread.fsindex = value;
198 if (task == current)
199 loadsegment(fs, task->thread.fsindex);
200 break;
201 case offsetof(struct user_regs_struct,gs):
202 /*
203 * If this is setting gs as for normal 64-bit use but
204 * setting gs_base has implicitly changed it, leave it.
205 */
206 if ((value == GS_TLS_SEL && task->thread.gsindex == 0 &&
207 task->thread.gs != 0) ||
208 (value == 0 && task->thread.gsindex == GS_TLS_SEL &&
209 task->thread.gs == 0))
210 break;
211 task->thread.gsindex = value;
212 if (task == current)
213 load_gs_index(task->thread.gsindex);
214 break;
215 case offsetof(struct user_regs_struct,ds):
216 task->thread.ds = value;
217 if (task == current)
218 loadsegment(ds, task->thread.ds);
219 break;
220 case offsetof(struct user_regs_struct,es):
221 task->thread.es = value;
222 if (task == current)
223 loadsegment(es, task->thread.es);
224 break;
225
226 /*
227 * Can't actually change these in 64-bit mode.
228 */
229 case offsetof(struct user_regs_struct,cs):
230#ifdef CONFIG_IA32_EMULATION
231 if (test_tsk_thread_flag(task, TIF_IA32))
232 task_pt_regs(task)->cs = value;
233#endif
234 break;
235 case offsetof(struct user_regs_struct,ss):
236#ifdef CONFIG_IA32_EMULATION
237 if (test_tsk_thread_flag(task, TIF_IA32))
238 task_pt_regs(task)->ss = value;
239#endif
240 break;
241 }
242
243 return 0;
244}
245
246static unsigned long debugreg_addr_limit(struct task_struct *task)
247{
248#ifdef CONFIG_IA32_EMULATION
249 if (test_tsk_thread_flag(task, TIF_IA32))
250 return IA32_PAGE_OFFSET - 3;
251#endif
252 return TASK_SIZE64 - 7;
253}
254
255#endif /* CONFIG_X86_32 */
256
257static unsigned long get_flags(struct task_struct *task)
258{
259 unsigned long retval = task_pt_regs(task)->flags;
260
261 /*
262 * If the debugger set TF, hide it from the readout.
263 */
264 if (test_tsk_thread_flag(task, TIF_FORCED_TF))
265 retval &= ~X86_EFLAGS_TF;
266
267 return retval;
268}
269
270static int set_flags(struct task_struct *task, unsigned long value)
271{
272 struct pt_regs *regs = task_pt_regs(task);
273
274 /*
275 * If the user value contains TF, mark that
276 * it was not "us" (the debugger) that set it.
277 * If not, make sure it stays set if we had.
278 */
279 if (value & X86_EFLAGS_TF)
280 clear_tsk_thread_flag(task, TIF_FORCED_TF);
281 else if (test_tsk_thread_flag(task, TIF_FORCED_TF))
282 value |= X86_EFLAGS_TF;
283
284 regs->flags = (regs->flags & ~FLAG_MASK) | (value & FLAG_MASK);
285
286 return 0;
287}
288
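set_flags() lets the debugger modify only the bits in FLAG_MASK; privileged bits such as IF and IOPL always come from the saved register image. A stand-alone sketch of that merge, using the architectural EFLAGS bit positions (they are not defined in this file, so they are spelled out here):

/* Stand-alone sketch of the merge done by set_flags() above: only bits in
 * FLAG_MASK_32 are taken from the user-supplied value, everything else is
 * kept from the saved register image. */
#include <stdio.h>

#define X86_EFLAGS_CF 0x00000001
#define X86_EFLAGS_PF 0x00000004
#define X86_EFLAGS_AF 0x00000010
#define X86_EFLAGS_ZF 0x00000040
#define X86_EFLAGS_SF 0x00000080
#define X86_EFLAGS_TF 0x00000100
#define X86_EFLAGS_DF 0x00000400
#define X86_EFLAGS_OF 0x00000800
#define X86_EFLAGS_RF 0x00010000
#define X86_EFLAGS_AC 0x00040000

#define FLAG_MASK_32 (X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | \
		      X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_TF | \
		      X86_EFLAGS_DF | X86_EFLAGS_OF | X86_EFLAGS_RF | \
		      X86_EFLAGS_AC)

int main(void)
{
	unsigned long regs_flags = 0x00000202;	/* IF set, as usual in user mode */
	unsigned long user_value = 0x00003240;	/* tries to set IOPL=3, IF and ZF */
	unsigned long merged = (regs_flags & ~FLAG_MASK_32) |
			       (user_value & FLAG_MASK_32);

	printf("FLAG_MASK_32 = %#lx\n", (unsigned long)FLAG_MASK_32);
	printf("merged flags = %#lx\n", merged);	/* 0x242: only ZF changed */
	return 0;
}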
289static int putreg(struct task_struct *child,
290 unsigned long offset, unsigned long value)
291{
292 switch (offset) {
293 case offsetof(struct user_regs_struct, cs):
294 case offsetof(struct user_regs_struct, ds):
295 case offsetof(struct user_regs_struct, es):
296 case offsetof(struct user_regs_struct, fs):
297 case offsetof(struct user_regs_struct, gs):
298 case offsetof(struct user_regs_struct, ss):
299 return set_segment_reg(child, offset, value);
300
301 case offsetof(struct user_regs_struct, flags):
302 return set_flags(child, value);
303
304#ifdef CONFIG_X86_64
305 case offsetof(struct user_regs_struct,fs_base):
306 if (value >= TASK_SIZE_OF(child))
307 return -EIO;
308 /*
309 * When changing the segment base, use do_arch_prctl
310 * to set either thread.fs or thread.fsindex and the
311 * corresponding GDT slot.
312 */
313 if (child->thread.fs != value)
314 return do_arch_prctl(child, ARCH_SET_FS, value);
315 return 0;
316 case offsetof(struct user_regs_struct,gs_base):
317 /*
318 * Exactly the same here as the %fs handling above.
319 */
320 if (value >= TASK_SIZE_OF(child))
321 return -EIO;
322 if (child->thread.gs != value)
323 return do_arch_prctl(child, ARCH_SET_GS, value);
324 return 0;
325#endif
326 }
327
328 *pt_regs_access(task_pt_regs(child), offset) = value;
329 return 0;
330}
331
332static unsigned long getreg(struct task_struct *task, unsigned long offset)
333{
334 switch (offset) {
335 case offsetof(struct user_regs_struct, cs):
336 case offsetof(struct user_regs_struct, ds):
337 case offsetof(struct user_regs_struct, es):
338 case offsetof(struct user_regs_struct, fs):
339 case offsetof(struct user_regs_struct, gs):
340 case offsetof(struct user_regs_struct, ss):
341 return get_segment_reg(task, offset);
342
343 case offsetof(struct user_regs_struct, flags):
344 return get_flags(task);
345
346#ifdef CONFIG_X86_64
347 case offsetof(struct user_regs_struct, fs_base): {
348 /*
349 * do_arch_prctl may have used a GDT slot instead of
350 * the MSR. To userland, it appears the same either
351 * way, except the %fs segment selector might not be 0.
352 */
353 unsigned int seg = task->thread.fsindex;
354 if (task->thread.fs != 0)
355 return task->thread.fs;
356 if (task == current)
357 asm("movl %%fs,%0" : "=r" (seg));
358 if (seg != FS_TLS_SEL)
359 return 0;
360 return get_desc_base(&task->thread.tls_array[FS_TLS]);
361 }
362 case offsetof(struct user_regs_struct, gs_base): {
363 /*
364 * Exactly the same here as the %fs handling above.
365 */
366 unsigned int seg = task->thread.gsindex;
367 if (task->thread.gs != 0)
368 return task->thread.gs;
369 if (task == current)
370 asm("movl %%gs,%0" : "=r" (seg));
371 if (seg != GS_TLS_SEL)
372 return 0;
373 return get_desc_base(&task->thread.tls_array[GS_TLS]);
374 }
375#endif
376 }
377
378 return *pt_regs_access(task_pt_regs(task), offset);
379}
380
381static int genregs_get(struct task_struct *target,
382 const struct user_regset *regset,
383 unsigned int pos, unsigned int count,
384 void *kbuf, void __user *ubuf)
385{
386 if (kbuf) {
387 unsigned long *k = kbuf;
388 while (count > 0) {
389 *k++ = getreg(target, pos);
390 count -= sizeof(*k);
391 pos += sizeof(*k);
392 }
393 } else {
394 unsigned long __user *u = ubuf;
395 while (count > 0) {
396 if (__put_user(getreg(target, pos), u++))
397 return -EFAULT;
398 count -= sizeof(*u);
399 pos += sizeof(*u);
400 }
401 }
402
403 return 0;
404}
405
406static int genregs_set(struct task_struct *target,
407 const struct user_regset *regset,
408 unsigned int pos, unsigned int count,
409 const void *kbuf, const void __user *ubuf)
410{
411 int ret = 0;
412 if (kbuf) {
413 const unsigned long *k = kbuf;
414 while (count > 0 && !ret) {
415 ret = putreg(target, pos, *k++);
416 count -= sizeof(*k);
417 pos += sizeof(*k);
418 }
419 } else {
420 const unsigned long __user *u = ubuf;
421 while (count > 0 && !ret) {
422 unsigned long word;
423 ret = __get_user(word, u++);
424 if (ret)
425 break;
426 ret = putreg(target, pos, word);
427 count -= sizeof(*u);
428 pos += sizeof(*u);
429 }
430 }
431 return ret;
432}
433
434/*
435 * This function is trivial and will be inlined by the compiler.
436 * Having it separates the implementation details of debug
437 * registers from the interface details of ptrace.
438 */
439static unsigned long ptrace_get_debugreg(struct task_struct *child, int n)
440{
441 switch (n) {
442 case 0: return child->thread.debugreg0;
443 case 1: return child->thread.debugreg1;
444 case 2: return child->thread.debugreg2;
445 case 3: return child->thread.debugreg3;
446 case 6: return child->thread.debugreg6;
447 case 7: return child->thread.debugreg7;
448 }
449 return 0;
450}
451
452static int ptrace_set_debugreg(struct task_struct *child,
453 int n, unsigned long data)
454{
455 int i;
456
457 if (unlikely(n == 4 || n == 5))
458 return -EIO;
459
460 if (n < 4 && unlikely(data >= debugreg_addr_limit(child)))
461 return -EIO;
462
463 switch (n) {
464 case 0: child->thread.debugreg0 = data; break;
465 case 1: child->thread.debugreg1 = data; break;
466 case 2: child->thread.debugreg2 = data; break;
467 case 3: child->thread.debugreg3 = data; break;
468
469 case 6:
470 if ((data & ~0xffffffffUL) != 0)
471 return -EIO;
472 child->thread.debugreg6 = data;
473 break;
474
475 case 7:
476 /*
477 * Sanity-check data. Take one half-byte at a time with
478 * check = (val >> (16 + 4*i)) & 0xf. It contains the
479 * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
480 * 2 and 3 are LENi. Given a list of invalid values,
481 * we do mask |= 1 << invalid_value, so that
482 * (mask >> check) & 1 is a correct test for invalid
483 * values.
484 *
485 * R/Wi contains the type of the breakpoint /
486 * watchpoint, LENi contains the length of the watched
487 * data in the watchpoint case.
488 *
489 * The invalid values are:
490 * - LENi == 0x10 (undefined), so mask |= 0x0f00. [32-bit]
491 * - R/Wi == 0x10 (break on I/O reads or writes), so
492 * mask |= 0x4444.
493 * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
494 * 0x1110.
495 *
496 * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
497 *
498 * See the Intel Manual "System Programming Guide",
499 * 15.2.4
500 *
501 * Note that LENi == 0x10 is defined on x86_64 in long
502 * mode (i.e. even for 32-bit userspace software, but
503 * 64-bit kernel), so the x86_64 mask value is 0x5554.
504 * See the AMD manual no. 24593 (AMD64 System Programming)
505 */
506#ifdef CONFIG_X86_32
507#define DR7_MASK 0x5f54
508#else
509#define DR7_MASK 0x5554
510#endif
511 data &= ~DR_CONTROL_RESERVED;
512 for (i = 0; i < 4; i++)
513 if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1)
514 return -EIO;
515 child->thread.debugreg7 = data;
516 if (data)
517 set_tsk_thread_flag(child, TIF_DEBUG);
518 else
519 clear_tsk_thread_flag(child, TIF_DEBUG);
520 break;
521 }
522
523 return 0;
524}
525
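To make the mask arithmetic concrete: each breakpoint contributes one DR7 nibble of the form LENi << 2 | R/Wi, and a set bit in DR7_MASK marks that nibble value as invalid. A small user-space table generator (illustrative only) that replays the check:

/* Enumerate all 16 possible (LENi, R/Wi) nibbles and show which ones the
 * masks derived in the comment above reject. */
#include <stdio.h>

int main(void)
{
	const unsigned int mask32 = 0x5f54;	/* DR7_MASK on 32-bit kernels */
	const unsigned int mask64 = 0x5554;	/* long mode: LEN == 2 (8 bytes) is legal */

	for (unsigned int nibble = 0; nibble < 16; nibble++) {
		unsigned int rw  = nibble & 0x3;
		unsigned int len = nibble >> 2;

		printf("R/W=%u LEN=%u: %-7s (32-bit)  %-7s (64-bit)\n",
		       rw, len,
		       (mask32 >> nibble) & 1 ? "invalid" : "valid",
		       (mask64 >> nibble) & 1 ? "invalid" : "valid");
	}
	return 0;
}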
526static int ptrace_bts_get_size(struct task_struct *child)
527{
528 if (!child->thread.ds_area_msr)
529 return -ENXIO;
530
531 return ds_get_bts_index((void *)child->thread.ds_area_msr);
532}
533
534static int ptrace_bts_read_record(struct task_struct *child,
535 long index,
536 struct bts_struct __user *out)
537{
538 struct bts_struct ret;
539 int retval;
540 int bts_end;
541 int bts_index;
542
543 if (!child->thread.ds_area_msr)
544 return -ENXIO;
545
546 if (index < 0)
547 return -EINVAL;
548
549 bts_end = ds_get_bts_end((void *)child->thread.ds_area_msr);
550 if (bts_end <= index)
551 return -EINVAL;
552
553 /* translate the ptrace bts index into the ds bts index */
554 bts_index = ds_get_bts_index((void *)child->thread.ds_area_msr);
555 bts_index -= (index + 1);
556 if (bts_index < 0)
557 bts_index += bts_end;
558
559 retval = ds_read_bts((void *)child->thread.ds_area_msr,
560 bts_index, &ret);
561 if (retval < 0)
562 return retval;
563
564 if (copy_to_user(out, &ret, sizeof(ret)))
565 return -EFAULT;
566
567 return sizeof(ret);
568}
569
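ptrace presents the newest BTS record as index 0, while the DS area is a plain ring buffer indexed from its base; the subtraction and wrap above convert between the two. A stand-alone sketch of that translation (ptrace_to_ds_index() and the buffer geometry are made up for illustration):

#include <stdio.h>

/* Mirror of the index translation in ptrace_bts_read_record() above:
 * ptrace index 0 is the most recently written record. */
static int ptrace_to_ds_index(int ptrace_index, int ds_write_index, int ds_end)
{
	int ds_index = ds_write_index - (ptrace_index + 1);

	if (ds_index < 0)
		ds_index += ds_end;	/* wrap around the ring */
	return ds_index;
}

int main(void)
{
	/* Hypothetical ring of 8 records whose next write slot is 3. */
	for (int i = 0; i < 8; i++)
		printf("ptrace index %d -> ds index %d\n",
		       i, ptrace_to_ds_index(i, 3, 8));
	return 0;
}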
570static int ptrace_bts_write_record(struct task_struct *child,
571 const struct bts_struct *in)
572{
573 int retval;
574
575 if (!child->thread.ds_area_msr)
576 return -ENXIO;
577
578 retval = ds_write_bts((void *)child->thread.ds_area_msr, in);
579 if (retval)
580 return retval;
581
582 return sizeof(*in);
583}
584
585static int ptrace_bts_clear(struct task_struct *child)
586{
587 if (!child->thread.ds_area_msr)
588 return -ENXIO;
589
590 return ds_clear((void *)child->thread.ds_area_msr);
591}
592
593static int ptrace_bts_drain(struct task_struct *child,
594 long size,
595 struct bts_struct __user *out)
596{
597 int end, i;
598 void *ds = (void *)child->thread.ds_area_msr;
599
600 if (!ds)
601 return -ENXIO;
602
603 end = ds_get_bts_index(ds);
604 if (end <= 0)
605 return end;
606
607 if (size < (end * sizeof(struct bts_struct)))
608 return -EIO;
609
610 for (i = 0; i < end; i++, out++) {
611 struct bts_struct ret;
612 int retval;
613
614 retval = ds_read_bts(ds, i, &ret);
615 if (retval < 0)
616 return retval;
617
618 if (copy_to_user(out, &ret, sizeof(ret)))
619 return -EFAULT;
620 }
621
622 ds_clear(ds);
623
624 return end;
625}
626
627static int ptrace_bts_realloc(struct task_struct *child,
628 int size, int reduce_size)
629{
630 unsigned long rlim, vm;
631 int ret, old_size;
632
633 if (size < 0)
634 return -EINVAL;
635
636 old_size = ds_get_bts_size((void *)child->thread.ds_area_msr);
637 if (old_size < 0)
638 return old_size;
639
640 ret = ds_free((void **)&child->thread.ds_area_msr);
641 if (ret < 0)
642 goto out;
643
644 size >>= PAGE_SHIFT;
645 old_size >>= PAGE_SHIFT;
646
647 current->mm->total_vm -= old_size;
648 current->mm->locked_vm -= old_size;
649
650 if (size == 0)
651 goto out;
652
653 rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
654 vm = current->mm->total_vm + size;
655 if (rlim < vm) {
656 ret = -ENOMEM;
657
658 if (!reduce_size)
659 goto out;
660
661 size = rlim - current->mm->total_vm;
662 if (size <= 0)
663 goto out;
664 }
665
666 rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
667 vm = current->mm->locked_vm + size;
668 if (rlim < vm) {
669 ret = -ENOMEM;
670
671 if (!reduce_size)
672 goto out;
673
674 size = rlim - current->mm->locked_vm;
675 if (size <= 0)
676 goto out;
677 }
678
679 ret = ds_allocate((void **)&child->thread.ds_area_msr,
680 size << PAGE_SHIFT);
681 if (ret < 0)
682 goto out;
683
684 current->mm->total_vm += size;
685 current->mm->locked_vm += size;
686
687out:
688 if (child->thread.ds_area_msr)
689 set_tsk_thread_flag(child, TIF_DS_AREA_MSR);
690 else
691 clear_tsk_thread_flag(child, TIF_DS_AREA_MSR);
692
693 return ret;
694}
695
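The (re)allocation is charged against RLIMIT_AS and RLIMIT_MEMLOCK in pages: if the request would exceed a limit it either fails or, when the caller passed reduce_size, is shrunk to whatever still fits. A stand-alone sketch of that clamping (clamp_to_rlimit() and the numbers are made up for illustration):

#include <stdio.h>

/* Clamp a page-count request the way ptrace_bts_realloc() above does for
 * each rlimit it checks. */
static long clamp_to_rlimit(long request, long in_use, long limit,
			    int reduce_size)
{
	if (in_use + request <= limit)
		return request;		/* fits as requested */
	if (!reduce_size)
		return -1;		/* would exceed the limit: refuse */
	return limit - in_use;		/* shrink to what still fits */
}

int main(void)
{
	printf("%ld\n", clamp_to_rlimit(16, 100, 120, 0));	/* 16 */
	printf("%ld\n", clamp_to_rlimit(32, 100, 120, 0));	/* -1 */
	printf("%ld\n", clamp_to_rlimit(32, 100, 120, 1));	/* 20 */
	return 0;
}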
696static int ptrace_bts_config(struct task_struct *child,
697 long cfg_size,
698 const struct ptrace_bts_config __user *ucfg)
699{
700 struct ptrace_bts_config cfg;
701 int bts_size, ret = 0;
702 void *ds;
703
704 if (cfg_size < sizeof(cfg))
705 return -EIO;
706
707 if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
708 return -EFAULT;
709
710 if ((int)cfg.size < 0)
711 return -EINVAL;
712
713 bts_size = 0;
714 ds = (void *)child->thread.ds_area_msr;
715 if (ds) {
716 bts_size = ds_get_bts_size(ds);
717 if (bts_size < 0)
718 return bts_size;
719 }
720 cfg.size = PAGE_ALIGN(cfg.size);
721
722 if (bts_size != cfg.size) {
723 ret = ptrace_bts_realloc(child, cfg.size,
724 cfg.flags & PTRACE_BTS_O_CUT_SIZE);
725 if (ret < 0)
726 goto errout;
727
728 ds = (void *)child->thread.ds_area_msr;
729 }
730
731 if (cfg.flags & PTRACE_BTS_O_SIGNAL)
732 ret = ds_set_overflow(ds, DS_O_SIGNAL);
733 else
734 ret = ds_set_overflow(ds, DS_O_WRAP);
735 if (ret < 0)
736 goto errout;
737
738 if (cfg.flags & PTRACE_BTS_O_TRACE)
739 child->thread.debugctlmsr |= ds_debugctl_mask();
740 else
741 child->thread.debugctlmsr &= ~ds_debugctl_mask();
742
743 if (cfg.flags & PTRACE_BTS_O_SCHED)
744 set_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
745 else
746 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
747
748 ret = sizeof(cfg);
749
750out:
751 if (child->thread.debugctlmsr)
752 set_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
753 else
754 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
755
756 return ret;
757
758errout:
759 child->thread.debugctlmsr &= ~ds_debugctl_mask();
760 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
761 goto out;
762}
763
764static int ptrace_bts_status(struct task_struct *child,
765 long cfg_size,
766 struct ptrace_bts_config __user *ucfg)
767{
768 void *ds = (void *)child->thread.ds_area_msr;
769 struct ptrace_bts_config cfg;
770
771 if (cfg_size < sizeof(cfg))
772 return -EIO;
773
774 memset(&cfg, 0, sizeof(cfg));
775
776 if (ds) {
777 cfg.size = ds_get_bts_size(ds);
778
779 if (ds_get_overflow(ds) == DS_O_SIGNAL)
780 cfg.flags |= PTRACE_BTS_O_SIGNAL;
781
782 if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) &&
783 child->thread.debugctlmsr & ds_debugctl_mask())
784 cfg.flags |= PTRACE_BTS_O_TRACE;
785
786 if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS))
787 cfg.flags |= PTRACE_BTS_O_SCHED;
788 }
789
790 cfg.bts_size = sizeof(struct bts_struct);
791
792 if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
793 return -EFAULT;
794
795 return sizeof(cfg);
796}
797
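From the tracer's side the new BTS requests go through the ordinary ptrace(2) entry point, with the config structure passed as the address argument and its size as the data argument (see the arch_ptrace() dispatch below). A hedged user-space sketch; it assumes a kernel carrying this patch and that <asm/ptrace-abi.h> exports struct ptrace_bts_config and the PTRACE_BTS_* constants used above:

#include <stdio.h>
#include <signal.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <asm/ptrace-abi.h>	/* assumed to carry the BTS ABI added here */

int main(void)
{
	pid_t child = fork();

	if (child == 0) {
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		raise(SIGSTOP);
		_exit(0);
	}

	waitpid(child, NULL, 0);

	/* Ask for one page of branch records, tracing enabled right away. */
	struct ptrace_bts_config cfg = {
		.size  = 4096,
		.flags = PTRACE_BTS_O_TRACE,
	};
	if (ptrace(PTRACE_BTS_CONFIG, child, &cfg, sizeof(cfg)) < 0)
		perror("PTRACE_BTS_CONFIG");

	/* Read the effective configuration back. */
	if (ptrace(PTRACE_BTS_STATUS, child, &cfg, sizeof(cfg)) >= 0)
		printf("BTS buffer: %u bytes, record size %u\n",
		       cfg.size, cfg.bts_size);

	kill(child, SIGKILL);
	waitpid(child, NULL, 0);
	return 0;
}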
798void ptrace_bts_take_timestamp(struct task_struct *tsk,
799 enum bts_qualifier qualifier)
800{
801 struct bts_struct rec = {
802 .qualifier = qualifier,
803 .variant.jiffies = jiffies_64
804 };
805
806 ptrace_bts_write_record(tsk, &rec);
807}
808
809/*
810 * Called by kernel/ptrace.c when detaching..
811 *
812 * Make sure the single step bit is not set.
813 */
814void ptrace_disable(struct task_struct *child)
815{
816 user_disable_single_step(child);
817#ifdef TIF_SYSCALL_EMU
818 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
819#endif
820 if (child->thread.ds_area_msr) {
821 ptrace_bts_realloc(child, 0, 0);
822 child->thread.debugctlmsr &= ~ds_debugctl_mask();
823 if (!child->thread.debugctlmsr)
824 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
825 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
826 }
827}
828
829#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
830static const struct user_regset_view user_x86_32_view; /* Initialized below. */
831#endif
832
833long arch_ptrace(struct task_struct *child, long request, long addr, long data)
834{
835 int ret;
836 unsigned long __user *datap = (unsigned long __user *)data;
837
838 switch (request) {
839 /* read the word at location addr in the USER area. */
840 case PTRACE_PEEKUSR: {
841 unsigned long tmp;
842
843 ret = -EIO;
844 if ((addr & (sizeof(data) - 1)) || addr < 0 ||
845 addr >= sizeof(struct user))
846 break;
847
848 tmp = 0; /* Default return condition */
849 if (addr < sizeof(struct user_regs_struct))
850 tmp = getreg(child, addr);
851 else if (addr >= offsetof(struct user, u_debugreg[0]) &&
852 addr <= offsetof(struct user, u_debugreg[7])) {
853 addr -= offsetof(struct user, u_debugreg[0]);
854 tmp = ptrace_get_debugreg(child, addr / sizeof(data));
855 }
856 ret = put_user(tmp, datap);
857 break;
858 }
859
860 case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
861 ret = -EIO;
862 if ((addr & (sizeof(data) - 1)) || addr < 0 ||
863 addr >= sizeof(struct user))
864 break;
865
866 if (addr < sizeof(struct user_regs_struct))
867 ret = putreg(child, addr, data);
868 else if (addr >= offsetof(struct user, u_debugreg[0]) &&
869 addr <= offsetof(struct user, u_debugreg[7])) {
870 addr -= offsetof(struct user, u_debugreg[0]);
871 ret = ptrace_set_debugreg(child,
872 addr / sizeof(data), data);
873 }
874 break;
875
876 case PTRACE_GETREGS: /* Get all gp regs from the child. */
877 return copy_regset_to_user(child,
878 task_user_regset_view(current),
879 REGSET_GENERAL,
880 0, sizeof(struct user_regs_struct),
881 datap);
882
883 case PTRACE_SETREGS: /* Set all gp regs in the child. */
884 return copy_regset_from_user(child,
885 task_user_regset_view(current),
886 REGSET_GENERAL,
887 0, sizeof(struct user_regs_struct),
888 datap);
889
890 case PTRACE_GETFPREGS: /* Get the child FPU state. */
891 return copy_regset_to_user(child,
892 task_user_regset_view(current),
893 REGSET_FP,
894 0, sizeof(struct user_i387_struct),
895 datap);
896
897 case PTRACE_SETFPREGS: /* Set the child FPU state. */
898 return copy_regset_from_user(child,
899 task_user_regset_view(current),
900 REGSET_FP,
901 0, sizeof(struct user_i387_struct),
902 datap);
903
904#ifdef CONFIG_X86_32
905 case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */
906 return copy_regset_to_user(child, &user_x86_32_view,
907 REGSET_XFP,
908 0, sizeof(struct user_fxsr_struct),
909 datap);
910
911 case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */
912 return copy_regset_from_user(child, &user_x86_32_view,
913 REGSET_XFP,
914 0, sizeof(struct user_fxsr_struct),
915 datap);
916#endif
917
918#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
919 case PTRACE_GET_THREAD_AREA:
920 if (addr < 0)
921 return -EIO;
922 ret = do_get_thread_area(child, addr,
923 (struct user_desc __user *) data);
924 break;
925
926 case PTRACE_SET_THREAD_AREA:
927 if (addr < 0)
928 return -EIO;
929 ret = do_set_thread_area(child, addr,
930 (struct user_desc __user *) data, 0);
931 break;
932#endif
933
934#ifdef CONFIG_X86_64
935 /* normal 64bit interface to access TLS data.
936 Works just like arch_prctl, except that the arguments
937 are reversed. */
938 case PTRACE_ARCH_PRCTL:
939 ret = do_arch_prctl(child, data, addr);
940 break;
941#endif
942
943 case PTRACE_BTS_CONFIG:
944 ret = ptrace_bts_config
945 (child, data, (struct ptrace_bts_config __user *)addr);
946 break;
947
948 case PTRACE_BTS_STATUS:
949 ret = ptrace_bts_status
950 (child, data, (struct ptrace_bts_config __user *)addr);
951 break;
952
953 case PTRACE_BTS_SIZE:
954 ret = ptrace_bts_get_size(child);
955 break;
956
957 case PTRACE_BTS_GET:
958 ret = ptrace_bts_read_record
959 (child, data, (struct bts_struct __user *) addr);
960 break;
961
962 case PTRACE_BTS_CLEAR:
963 ret = ptrace_bts_clear(child);
964 break;
965
966 case PTRACE_BTS_DRAIN:
967 ret = ptrace_bts_drain
968 (child, data, (struct bts_struct __user *) addr);
969 break;
970
971 default:
972 ret = ptrace_request(child, request, addr, data);
973 break;
974 }
975
976 return ret;
977}
978
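For comparison, the same paths seen from user space: PTRACE_PEEKUSER with an offset into struct user ends up in getreg() or ptrace_get_debugreg() above. A 64-bit sketch; the field names follow glibc's struct user (regs.rip, regs.fs_base), and the debug-register read assumes glibc's u_debugreg layout matches the kernel's:

#include <stdio.h>
#include <stddef.h>
#include <signal.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t child = fork();

	if (child == 0) {
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		raise(SIGSTOP);		/* let the parent inspect us */
		_exit(0);
	}

	waitpid(child, NULL, 0);

	long ip      = ptrace(PTRACE_PEEKUSER, child,
			      offsetof(struct user, regs.rip), NULL);
	long fs_base = ptrace(PTRACE_PEEKUSER, child,
			      offsetof(struct user, regs.fs_base), NULL);
	/* Assumes glibc's struct user mirrors the kernel's u_debugreg layout. */
	long dr6     = ptrace(PTRACE_PEEKUSER, child,
			      offsetof(struct user, u_debugreg[6]), NULL);

	printf("rip=%#lx fs_base=%#lx dr6=%#lx\n", ip, fs_base, dr6);

	kill(child, SIGKILL);
	waitpid(child, NULL, 0);
	return 0;
}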
979#ifdef CONFIG_IA32_EMULATION
980
981#include <linux/compat.h>
982#include <linux/syscalls.h>
983#include <asm/ia32.h>
984#include <asm/user32.h>
985
986#define R32(l,q) \
987 case offsetof(struct user32, regs.l): \
988 regs->q = value; break
989
990#define SEG32(rs) \
991 case offsetof(struct user32, regs.rs): \
992 return set_segment_reg(child, \
993 offsetof(struct user_regs_struct, rs), \
994 value); \
995 break
996
997static int putreg32(struct task_struct *child, unsigned regno, u32 value)
998{
999 struct pt_regs *regs = task_pt_regs(child);
1000
1001 switch (regno) {
1002
1003 SEG32(cs);
1004 SEG32(ds);
1005 SEG32(es);
1006 SEG32(fs);
1007 SEG32(gs);
1008 SEG32(ss);
1009
1010 R32(ebx, bx);
1011 R32(ecx, cx);
1012 R32(edx, dx);
1013 R32(edi, di);
1014 R32(esi, si);
1015 R32(ebp, bp);
1016 R32(eax, ax);
1017 R32(orig_eax, orig_ax);
1018 R32(eip, ip);
1019 R32(esp, sp);
1020
1021 case offsetof(struct user32, regs.eflags):
1022 return set_flags(child, value);
1023
1024 case offsetof(struct user32, u_debugreg[0]) ...
1025 offsetof(struct user32, u_debugreg[7]):
1026 regno -= offsetof(struct user32, u_debugreg[0]);
1027 return ptrace_set_debugreg(child, regno / 4, value);
1028
1029 default:
1030 if (regno > sizeof(struct user32) || (regno & 3))
1031 return -EIO;
1032
1033 /*
1034 * Other dummy fields in the virtual user structure
1035 * are ignored
1036 */
1037 break;
1038 }
1039 return 0;
1040}
1041
1042#undef R32
1043#undef SEG32
1044
1045#define R32(l,q) \
1046 case offsetof(struct user32, regs.l): \
1047 *val = regs->q; break
1048
1049#define SEG32(rs) \
1050 case offsetof(struct user32, regs.rs): \
1051 *val = get_segment_reg(child, \
1052 offsetof(struct user_regs_struct, rs)); \
1053 break
1054
1055static int getreg32(struct task_struct *child, unsigned regno, u32 *val)
1056{
1057 struct pt_regs *regs = task_pt_regs(child);
1058
1059 switch (regno) {
1060
1061 SEG32(ds);
1062 SEG32(es);
1063 SEG32(fs);
1064 SEG32(gs);
1065
1066 R32(cs, cs);
1067 R32(ss, ss);
1068 R32(ebx, bx);
1069 R32(ecx, cx);
1070 R32(edx, dx);
1071 R32(edi, di);
1072 R32(esi, si);
1073 R32(ebp, bp);
1074 R32(eax, ax);
1075 R32(orig_eax, orig_ax);
1076 R32(eip, ip);
1077 R32(esp, sp);
1078
1079 case offsetof(struct user32, regs.eflags):
1080 *val = get_flags(child);
1081 break;
1082
1083 case offsetof(struct user32, u_debugreg[0]) ...
1084 offsetof(struct user32, u_debugreg[7]):
1085 regno -= offsetof(struct user32, u_debugreg[0]);
1086 *val = ptrace_get_debugreg(child, regno / 4);
1087 break;
1088
1089 default:
1090 if (regno > sizeof(struct user32) || (regno & 3))
1091 return -EIO;
1092
1093 /*
1094 * Other dummy fields in the virtual user structure
1095 * are ignored
1096 */
1097 *val = 0;
1098 break;
1099 }
1100 return 0;
1101}
1102
1103#undef R32
1104#undef SEG32
1105
1106static int genregs32_get(struct task_struct *target,
1107 const struct user_regset *regset,
1108 unsigned int pos, unsigned int count,
1109 void *kbuf, void __user *ubuf)
1110{
1111 if (kbuf) {
1112 compat_ulong_t *k = kbuf;
1113 while (count > 0) {
1114 getreg32(target, pos, k++);
1115 count -= sizeof(*k);
1116 pos += sizeof(*k);
1117 }
1118 } else {
1119 compat_ulong_t __user *u = ubuf;
1120 while (count > 0) {
1121 compat_ulong_t word;
1122 getreg32(target, pos, &word);
1123 if (__put_user(word, u++))
1124 return -EFAULT;
1125 count -= sizeof(*u);
1126 pos += sizeof(*u);
1127 }
1128 }
1129
1130 return 0;
1131}
1132
1133static int genregs32_set(struct task_struct *target,
1134 const struct user_regset *regset,
1135 unsigned int pos, unsigned int count,
1136 const void *kbuf, const void __user *ubuf)
1137{
1138 int ret = 0;
1139 if (kbuf) {
1140 const compat_ulong_t *k = kbuf;
1141 while (count > 0 && !ret) {
1142 ret = putreg(target, pos, *k++);
1143 count -= sizeof(*k);
1144 pos += sizeof(*k);
1145 }
1146 } else {
1147 const compat_ulong_t __user *u = ubuf;
1148 while (count > 0 && !ret) {
1149 compat_ulong_t word;
1150 ret = __get_user(word, u++);
1151 if (ret)
1152 break;
1153 ret = putreg(target, pos, word);
1154 count -= sizeof(*u);
1155 pos += sizeof(*u);
1156 }
1157 }
1158 return ret;
1159}
1160
1161static long ptrace32_siginfo(unsigned request, u32 pid, u32 addr, u32 data)
1162{
1163 siginfo_t __user *si = compat_alloc_user_space(sizeof(siginfo_t));
1164 compat_siginfo_t __user *si32 = compat_ptr(data);
1165 siginfo_t ssi;
1166 int ret;
1167
1168 if (request == PTRACE_SETSIGINFO) {
1169 memset(&ssi, 0, sizeof(siginfo_t));
1170 ret = copy_siginfo_from_user32(&ssi, si32);
1171 if (ret)
1172 return ret;
1173 if (copy_to_user(si, &ssi, sizeof(siginfo_t)))
1174 return -EFAULT;
1175 }
1176 ret = sys_ptrace(request, pid, addr, (unsigned long)si);
1177 if (ret)
1178 return ret;
1179 if (request == PTRACE_GETSIGINFO) {
1180 if (copy_from_user(&ssi, si, sizeof(siginfo_t)))
1181 return -EFAULT;
1182 ret = copy_siginfo_to_user32(si32, &ssi);
1183 }
1184 return ret;
1185}
1186
1187asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
1188{
1189 struct task_struct *child;
1190 struct pt_regs *childregs;
1191 void __user *datap = compat_ptr(data);
1192 int ret;
1193 __u32 val;
1194
1195 switch (request) {
1196 case PTRACE_TRACEME:
1197 case PTRACE_ATTACH:
1198 case PTRACE_KILL:
1199 case PTRACE_CONT:
1200 case PTRACE_SINGLESTEP:
1201 case PTRACE_SINGLEBLOCK:
1202 case PTRACE_DETACH:
1203 case PTRACE_SYSCALL:
1204 case PTRACE_OLDSETOPTIONS:
1205 case PTRACE_SETOPTIONS:
1206 case PTRACE_SET_THREAD_AREA:
1207 case PTRACE_GET_THREAD_AREA:
1208 case PTRACE_BTS_CONFIG:
1209 case PTRACE_BTS_STATUS:
1210 case PTRACE_BTS_SIZE:
1211 case PTRACE_BTS_GET:
1212 case PTRACE_BTS_CLEAR:
1213 case PTRACE_BTS_DRAIN:
1214 return sys_ptrace(request, pid, addr, data);
1215
1216 default:
1217 return -EINVAL;
1218
1219 case PTRACE_PEEKTEXT:
1220 case PTRACE_PEEKDATA:
1221 case PTRACE_POKEDATA:
1222 case PTRACE_POKETEXT:
1223 case PTRACE_POKEUSR:
1224 case PTRACE_PEEKUSR:
1225 case PTRACE_GETREGS:
1226 case PTRACE_SETREGS:
1227 case PTRACE_SETFPREGS:
1228 case PTRACE_GETFPREGS:
1229 case PTRACE_SETFPXREGS:
1230 case PTRACE_GETFPXREGS:
1231 case PTRACE_GETEVENTMSG:
1232 break;
1233
1234 case PTRACE_SETSIGINFO:
1235 case PTRACE_GETSIGINFO:
1236 return ptrace32_siginfo(request, pid, addr, data);
1237 }
1238
1239 child = ptrace_get_task_struct(pid);
1240 if (IS_ERR(child))
1241 return PTR_ERR(child);
1242
1243 ret = ptrace_check_attach(child, request == PTRACE_KILL);
1244 if (ret < 0)
1245 goto out;
1246
1247 childregs = task_pt_regs(child);
1248
1249 switch (request) {
1250 case PTRACE_PEEKUSR:
1251 ret = getreg32(child, addr, &val);
1252 if (ret == 0)
1253 ret = put_user(val, (__u32 __user *)datap);
1254 break;
1255
1256 case PTRACE_POKEUSR:
1257 ret = putreg32(child, addr, data);
1258 break;
1259
1260 case PTRACE_GETREGS: /* Get all gp regs from the child. */
1261 return copy_regset_to_user(child, &user_x86_32_view,
1262 REGSET_GENERAL,
1263 0, sizeof(struct user_regs_struct32),
1264 datap);
1265
1266 case PTRACE_SETREGS: /* Set all gp regs in the child. */
1267 return copy_regset_from_user(child, &user_x86_32_view,
1268 REGSET_GENERAL, 0,
1269 sizeof(struct user_regs_struct32),
1270 datap);
1271
1272 case PTRACE_GETFPREGS: /* Get the child FPU state. */
1273 return copy_regset_to_user(child, &user_x86_32_view,
1274 REGSET_FP, 0,
1275 sizeof(struct user_i387_ia32_struct),
1276 datap);
1277
1278 case PTRACE_SETFPREGS: /* Set the child FPU state. */
1279 return copy_regset_from_user(
1280 child, &user_x86_32_view, REGSET_FP,
1281 0, sizeof(struct user_i387_ia32_struct), datap);
1282
1283 case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */
1284 return copy_regset_to_user(child, &user_x86_32_view,
1285 REGSET_XFP, 0,
1286 sizeof(struct user32_fxsr_struct),
1287 datap);
1288
1289 case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */
1290 return copy_regset_from_user(child, &user_x86_32_view,
1291 REGSET_XFP, 0,
1292 sizeof(struct user32_fxsr_struct),
1293 datap);
1294
1295 default:
1296 return compat_ptrace_request(child, request, addr, data);
1297 }
1298
1299 out:
1300 put_task_struct(child);
1301 return ret;
1302}
1303
1304#endif /* CONFIG_IA32_EMULATION */
1305
1306#ifdef CONFIG_X86_64
1307
1308static const struct user_regset x86_64_regsets[] = {
1309 [REGSET_GENERAL] = {
1310 .core_note_type = NT_PRSTATUS,
1311 .n = sizeof(struct user_regs_struct) / sizeof(long),
1312 .size = sizeof(long), .align = sizeof(long),
1313 .get = genregs_get, .set = genregs_set
1314 },
1315 [REGSET_FP] = {
1316 .core_note_type = NT_PRFPREG,
1317 .n = sizeof(struct user_i387_struct) / sizeof(long),
1318 .size = sizeof(long), .align = sizeof(long),
1319 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
1320 },
1321};
1322
1323static const struct user_regset_view user_x86_64_view = {
1324 .name = "x86_64", .e_machine = EM_X86_64,
1325 .regsets = x86_64_regsets, .n = ARRAY_SIZE(x86_64_regsets)
1326};
1327
1328#else /* CONFIG_X86_32 */
1329
1330#define user_regs_struct32 user_regs_struct
1331#define genregs32_get genregs_get
1332#define genregs32_set genregs_set
1333
1334#endif /* CONFIG_X86_64 */
1335
1336#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
1337static const struct user_regset x86_32_regsets[] = {
1338 [REGSET_GENERAL] = {
1339 .core_note_type = NT_PRSTATUS,
1340 .n = sizeof(struct user_regs_struct32) / sizeof(u32),
1341 .size = sizeof(u32), .align = sizeof(u32),
1342 .get = genregs32_get, .set = genregs32_set
1343 },
1344 [REGSET_FP] = {
1345 .core_note_type = NT_PRFPREG,
1346 .n = sizeof(struct user_i387_struct) / sizeof(u32),
1347 .size = sizeof(u32), .align = sizeof(u32),
1348 .active = fpregs_active, .get = fpregs_get, .set = fpregs_set
1349 },
1350 [REGSET_XFP] = {
1351 .core_note_type = NT_PRXFPREG,
1352 .n = sizeof(struct user_i387_struct) / sizeof(u32),
1353 .size = sizeof(u32), .align = sizeof(u32),
1354 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
1355 },
1356 [REGSET_TLS] = {
1357 .core_note_type = NT_386_TLS,
1358 .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN,
1359 .size = sizeof(struct user_desc),
1360 .align = sizeof(struct user_desc),
1361 .active = regset_tls_active,
1362 .get = regset_tls_get, .set = regset_tls_set
1363 },
1364};
1365
1366static const struct user_regset_view user_x86_32_view = {
1367 .name = "i386", .e_machine = EM_386,
1368 .regsets = x86_32_regsets, .n = ARRAY_SIZE(x86_32_regsets)
1369};
1370#endif
1371
1372const struct user_regset_view *task_user_regset_view(struct task_struct *task)
1373{
1374#ifdef CONFIG_IA32_EMULATION
1375 if (test_tsk_thread_flag(task, TIF_IA32))
1376#endif
1377#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
1378 return &user_x86_32_view;
1379#endif
1380#ifdef CONFIG_X86_64
1381 return &user_x86_64_view;
1382#endif
1383}
1384
1385#ifdef CONFIG_X86_32
1386
1387void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
1388{
1389 struct siginfo info;
1390
1391 tsk->thread.trap_no = 1;
1392 tsk->thread.error_code = error_code;
1393
1394 memset(&info, 0, sizeof(info));
1395 info.si_signo = SIGTRAP;
1396 info.si_code = TRAP_BRKPT;
1397
1398 /* User-mode ip? */
1399 info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL;
1400
1401 /* Send us the fake SIGTRAP */
1402 force_sig_info(SIGTRAP, &info, tsk);
1403}
1404
1405/* notification of system call entry/exit
1406 * - triggered by current->work.syscall_trace
1407 */
1408__attribute__((regparm(3)))
1409int do_syscall_trace(struct pt_regs *regs, int entryexit)
1410{
1411 int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
1412 /*
1413 * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall
1414 * interception
1415 */
1416 int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP);
1417 int ret = 0;
1418
1419 /* do the secure computing check first */
1420 if (!entryexit)
1421 secure_computing(regs->orig_ax);
1422
1423 if (unlikely(current->audit_context)) {
1424 if (entryexit)
1425 audit_syscall_exit(AUDITSC_RESULT(regs->ax),
1426 regs->ax);
1427 /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
1428 * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
1429 * not used, entry.S will call us only on syscall exit, not
1430 * entry; so when TIF_SYSCALL_AUDIT is used we must avoid
1431 * calling send_sigtrap() on syscall entry.
1432 *
1433 * Note that when PTRACE_SYSEMU_SINGLESTEP is used,
1434 * is_singlestep is false, despite its name, so we will still do
1435 * the correct thing.
1436 */
1437 else if (is_singlestep)
1438 goto out;
1439 }
1440
1441 if (!(current->ptrace & PT_PTRACED))
1442 goto out;
1443
1444 /* If a process stops on the 1st tracepoint with SYSCALL_TRACE
1445 * and then is resumed with SYSEMU_SINGLESTEP, it will come in
1446 * here. We have to check this and return */
1447 if (is_sysemu && entryexit)
1448 return 0;
1449
1450 /* Fake a debug trap */
1451 if (is_singlestep)
1452 send_sigtrap(current, regs, 0);
1453
1454 if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu)
1455 goto out;
1456
1457 /* the 0x80 provides a way for the tracing parent to distinguish
1458 between a syscall stop and SIGTRAP delivery */
1459 /* Note that the debugger could change the result of test_thread_flag!*/
1460 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0));
1461
1462 /*
1463 * this isn't the same as continuing with a signal, but it will do
1464 * for normal use. strace only continues with a signal if the
1465 * stopping signal is not SIGTRAP. -brl
1466 */
1467 if (current->exit_code) {
1468 send_sig(current->exit_code, current, 1);
1469 current->exit_code = 0;
1470 }
1471 ret = is_sysemu;
1472out:
1473 if (unlikely(current->audit_context) && !entryexit)
1474 audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax,
1475 regs->bx, regs->cx, regs->dx, regs->si);
1476 if (ret == 0)
1477 return 0;
1478
1479 regs->orig_ax = -1; /* force skip of syscall restarting */
1480 if (unlikely(current->audit_context))
1481 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1482 return 1;
1483}
1484
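The SIGTRAP | 0x80 convention in the ptrace_notify() call above is what a tracer opts into with PTRACE_O_TRACESYSGOOD; it lets the tracer tell syscall stops from genuine SIGTRAPs. A user-space sketch (illustrative only):

#include <stdio.h>
#include <signal.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t child = fork();

	if (child == 0) {
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		raise(SIGSTOP);
		write(1, "hi\n", 3);	/* one traced syscall */
		_exit(0);
	}

	int status;
	waitpid(child, &status, 0);
	ptrace(PTRACE_SETOPTIONS, child, NULL, (void *)PTRACE_O_TRACESYSGOOD);

	for (;;) {
		ptrace(PTRACE_SYSCALL, child, NULL, NULL);
		waitpid(child, &status, 0);
		if (WIFEXITED(status))
			break;
		if (WSTOPSIG(status) == (SIGTRAP | 0x80))
			printf("syscall stop\n");
		else
			printf("other stop, sig %d\n", WSTOPSIG(status));
	}
	return 0;
}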
1485#else /* CONFIG_X86_64 */
1486
1487static void syscall_trace(struct pt_regs *regs)
1488{
1489
1490#if 0
1491 printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n",
1492 current->comm,
1493 regs->ip, regs->sp, regs->ax, regs->orig_ax, __builtin_return_address(0),
1494 current_thread_info()->flags, current->ptrace);
1495#endif
1496
1497 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
1498 ? 0x80 : 0));
1499 /*
1500 * this isn't the same as continuing with a signal, but it will do
1501 * for normal use. strace only continues with a signal if the
1502 * stopping signal is not SIGTRAP. -brl
1503 */
1504 if (current->exit_code) {
1505 send_sig(current->exit_code, current, 1);
1506 current->exit_code = 0;
1507 }
1508}
1509
1510asmlinkage void syscall_trace_enter(struct pt_regs *regs)
1511{
1512 /* do the secure computing check first */
1513 secure_computing(regs->orig_ax);
1514
1515 if (test_thread_flag(TIF_SYSCALL_TRACE)
1516 && (current->ptrace & PT_PTRACED))
1517 syscall_trace(regs);
1518
1519 if (unlikely(current->audit_context)) {
1520 if (test_thread_flag(TIF_IA32)) {
1521 audit_syscall_entry(AUDIT_ARCH_I386,
1522 regs->orig_ax,
1523 regs->bx, regs->cx,
1524 regs->dx, regs->si);
1525 } else {
1526 audit_syscall_entry(AUDIT_ARCH_X86_64,
1527 regs->orig_ax,
1528 regs->di, regs->si,
1529 regs->dx, regs->r10);
1530 }
1531 }
1532}
1533
1534asmlinkage void syscall_trace_leave(struct pt_regs *regs)
1535{
1536 if (unlikely(current->audit_context))
1537 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1538
1539 if ((test_thread_flag(TIF_SYSCALL_TRACE)
1540 || test_thread_flag(TIF_SINGLESTEP))
1541 && (current->ptrace & PT_PTRACED))
1542 syscall_trace(regs);
1543}
1544
1545#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/kernel/ptrace_32.c b/arch/x86/kernel/ptrace_32.c
deleted file mode 100644
index ff5431cc03ee..000000000000
--- a/arch/x86/kernel/ptrace_32.c
+++ /dev/null
@@ -1,717 +0,0 @@
1/* By Ross Biro 1/23/92 */
2/*
3 * Pentium III FXSR, SSE support
4 * Gareth Hughes <gareth@valinux.com>, May 2000
5 */
6
7#include <linux/kernel.h>
8#include <linux/sched.h>
9#include <linux/mm.h>
10#include <linux/smp.h>
11#include <linux/errno.h>
12#include <linux/ptrace.h>
13#include <linux/user.h>
14#include <linux/security.h>
15#include <linux/audit.h>
16#include <linux/seccomp.h>
17#include <linux/signal.h>
18
19#include <asm/uaccess.h>
20#include <asm/pgtable.h>
21#include <asm/system.h>
22#include <asm/processor.h>
23#include <asm/i387.h>
24#include <asm/debugreg.h>
25#include <asm/ldt.h>
26#include <asm/desc.h>
27
28/*
29 * does not yet catch signals sent when the child dies.
30 * in exit.c or in signal.c.
31 */
32
33/*
34 * Determines which flags the user has access to [1 = access, 0 = no access].
35 * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), NT(14), IOPL(12-13), IF(9).
36 * Also masks reserved bits (31-22, 15, 5, 3, 1).
37 */
38#define FLAG_MASK 0x00050dd5
39
40/* sets the trap flag. */
41#define TRAP_FLAG 0x100
42
43/*
44 * Offset of eflags on child stack..
45 */
46#define EFL_OFFSET offsetof(struct pt_regs, eflags)
47
48static inline struct pt_regs *get_child_regs(struct task_struct *task)
49{
50 void *stack_top = (void *)task->thread.esp0;
51 return stack_top - sizeof(struct pt_regs);
52}
53
54/*
55 * This routine will get a word off of the process's privileged stack.
56 * the offset is bytes into the pt_regs structure on the stack.
57 * This routine assumes that all the privileged stacks are in our
58 * data space.
59 */
60static inline int get_stack_long(struct task_struct *task, int offset)
61{
62 unsigned char *stack;
63
64 stack = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs);
65 stack += offset;
66 return (*((int *)stack));
67}
68
69/*
70 * This routine will put a word on the process's privileged stack.
71 * the offset is bytes into the pt_regs structure on the stack.
72 * This routine assumes that all the privileged stacks are in our
73 * data space.
74 */
75static inline int put_stack_long(struct task_struct *task, int offset,
76 unsigned long data)
77{
78 unsigned char * stack;
79
80 stack = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs);
81 stack += offset;
82 *(unsigned long *) stack = data;
83 return 0;
84}
85
86static int putreg(struct task_struct *child,
87 unsigned long regno, unsigned long value)
88{
89 switch (regno >> 2) {
90 case GS:
91 if (value && (value & 3) != 3)
92 return -EIO;
93 child->thread.gs = value;
94 return 0;
95 case DS:
96 case ES:
97 case FS:
98 if (value && (value & 3) != 3)
99 return -EIO;
100 value &= 0xffff;
101 break;
102 case SS:
103 case CS:
104 if ((value & 3) != 3)
105 return -EIO;
106 value &= 0xffff;
107 break;
108 case EFL:
109 value &= FLAG_MASK;
110 value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK;
111 break;
112 }
113 if (regno > FS*4)
114 regno -= 1*4;
115 put_stack_long(child, regno, value);
116 return 0;
117}
118
119static unsigned long getreg(struct task_struct *child,
120 unsigned long regno)
121{
122 unsigned long retval = ~0UL;
123
124 switch (regno >> 2) {
125 case GS:
126 retval = child->thread.gs;
127 break;
128 case DS:
129 case ES:
130 case FS:
131 case SS:
132 case CS:
133 retval = 0xffff;
134 /* fall through */
135 default:
136 if (regno > FS*4)
137 regno -= 1*4;
138 retval &= get_stack_long(child, regno);
139 }
140 return retval;
141}
142
143#define LDT_SEGMENT 4
144
145static unsigned long convert_eip_to_linear(struct task_struct *child, struct pt_regs *regs)
146{
147 unsigned long addr, seg;
148
149 addr = regs->eip;
150 seg = regs->xcs & 0xffff;
151 if (regs->eflags & VM_MASK) {
152 addr = (addr & 0xffff) + (seg << 4);
153 return addr;
154 }
155
156 /*
157 * We'll assume that the code segments in the GDT
158 * are all zero-based. That is largely true: the
159 * TLS segments are used for data, and the PNPBIOS
160 * and APM bios ones we just ignore here.
161 */
162 if (seg & LDT_SEGMENT) {
163 u32 *desc;
164 unsigned long base;
165
166 seg &= ~7UL;
167
168 mutex_lock(&child->mm->context.lock);
169 if (unlikely((seg >> 3) >= child->mm->context.size))
170 addr = -1L; /* bogus selector, access would fault */
171 else {
172 desc = child->mm->context.ldt + seg;
173 base = ((desc[0] >> 16) |
174 ((desc[1] & 0xff) << 16) |
175 (desc[1] & 0xff000000));
176
177 /* 16-bit code segment? */
178 if (!((desc[1] >> 22) & 1))
179 addr &= 0xffff;
180 addr += base;
181 }
182 mutex_unlock(&child->mm->context.lock);
183 }
184 return addr;
185}
186
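The shifts above reassemble a segment base that is scattered over the two 32-bit words of an LDT descriptor (bits 16-31 of the low word, bits 0-7 and 24-31 of the high word); the GET_BASE() macro further down does the same for TLS descriptors. A stand-alone sketch with a made-up descriptor (desc_base() is illustrative, not kernel code):

#include <stdio.h>

/* Reassemble a segment base from the two words of a descriptor, exactly as
 * the code above does for LDT entries. */
static unsigned long desc_base(unsigned int lo, unsigned int hi)
{
	return (lo >> 16) | ((hi & 0xff) << 16) | (hi & 0xff000000);
}

int main(void)
{
	/* Made-up descriptor encoding base 0x12345678 (other fields arbitrary):
	 * lo[31:16] = base 15:0, hi[7:0] = base 23:16, hi[31:24] = base 31:24. */
	unsigned int lo = 0x5678ffff;
	unsigned int hi = 0x12cf9a34;

	printf("base = %#lx\n", desc_base(lo, hi));	/* 0x12345678 */
	return 0;
}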
187static inline int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
188{
189 int i, copied;
190 unsigned char opcode[15];
191 unsigned long addr = convert_eip_to_linear(child, regs);
192
193 copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
194 for (i = 0; i < copied; i++) {
195 switch (opcode[i]) {
196 /* popf and iret */
197 case 0x9d: case 0xcf:
198 return 1;
199 /* opcode and address size prefixes */
200 case 0x66: case 0x67:
201 continue;
202 /* irrelevant prefixes (segment overrides and repeats) */
203 case 0x26: case 0x2e:
204 case 0x36: case 0x3e:
205 case 0x64: case 0x65:
206 case 0xf0: case 0xf2: case 0xf3:
207 continue;
208
209 /*
210 * pushf: NOTE! We should probably not let
211 * the user see the TF bit being set. But
212 * it's more pain than it's worth to avoid
213 * it, and a debugger could emulate this
214 * all in user space if it _really_ cares.
215 */
216 case 0x9c:
217 default:
218 return 0;
219 }
220 }
221 return 0;
222}
223
224static void set_singlestep(struct task_struct *child)
225{
226 struct pt_regs *regs = get_child_regs(child);
227
228 /*
229 * Always set TIF_SINGLESTEP - this guarantees that
230 * we single-step system calls etc.. This will also
231 * cause us to set TF when returning to user mode.
232 */
233 set_tsk_thread_flag(child, TIF_SINGLESTEP);
234
235 /*
236 * If TF was already set, don't do anything else
237 */
238 if (regs->eflags & TRAP_FLAG)
239 return;
240
241 /* Set TF on the kernel stack.. */
242 regs->eflags |= TRAP_FLAG;
243
244 /*
245 * ..but if TF is changed by the instruction we will trace,
246 * don't mark it as being "us" that set it, so that we
247 * won't clear it by hand later.
248 */
249 if (is_setting_trap_flag(child, regs))
250 return;
251
252 child->ptrace |= PT_DTRACE;
253}
254
255static void clear_singlestep(struct task_struct *child)
256{
257 /* Always clear TIF_SINGLESTEP... */
258 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
259
260 /* But touch TF only if it was set by us.. */
261 if (child->ptrace & PT_DTRACE) {
262 struct pt_regs *regs = get_child_regs(child);
263 regs->eflags &= ~TRAP_FLAG;
264 child->ptrace &= ~PT_DTRACE;
265 }
266}
267
268/*
269 * Called by kernel/ptrace.c when detaching..
270 *
271 * Make sure the single step bit is not set.
272 */
273void ptrace_disable(struct task_struct *child)
274{
275 clear_singlestep(child);
276 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
277}
278
279/*
280 * Perform get_thread_area on behalf of the traced child.
281 */
282static int
283ptrace_get_thread_area(struct task_struct *child,
284 int idx, struct user_desc __user *user_desc)
285{
286 struct user_desc info;
287 struct desc_struct *desc;
288
289/*
290 * Get the current Thread-Local Storage area:
291 */
292
293#define GET_BASE(desc) ( \
294 (((desc)->a >> 16) & 0x0000ffff) | \
295 (((desc)->b << 16) & 0x00ff0000) | \
296 ( (desc)->b & 0xff000000) )
297
298#define GET_LIMIT(desc) ( \
299 ((desc)->a & 0x0ffff) | \
300 ((desc)->b & 0xf0000) )
301
302#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
303#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
304#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
305#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
306#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
307#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
308
309 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
310 return -EINVAL;
311
312 desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
313
314 info.entry_number = idx;
315 info.base_addr = GET_BASE(desc);
316 info.limit = GET_LIMIT(desc);
317 info.seg_32bit = GET_32BIT(desc);
318 info.contents = GET_CONTENTS(desc);
319 info.read_exec_only = !GET_WRITABLE(desc);
320 info.limit_in_pages = GET_LIMIT_PAGES(desc);
321 info.seg_not_present = !GET_PRESENT(desc);
322 info.useable = GET_USEABLE(desc);
323
324 if (copy_to_user(user_desc, &info, sizeof(info)))
325 return -EFAULT;
326
327 return 0;
328}
329
330/*
331 * Perform set_thread_area on behalf of the traced child.
332 */
333static int
334ptrace_set_thread_area(struct task_struct *child,
335 int idx, struct user_desc __user *user_desc)
336{
337 struct user_desc info;
338 struct desc_struct *desc;
339
340 if (copy_from_user(&info, user_desc, sizeof(info)))
341 return -EFAULT;
342
343 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
344 return -EINVAL;
345
346 desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
347 if (LDT_empty(&info)) {
348 desc->a = 0;
349 desc->b = 0;
350 } else {
351 desc->a = LDT_entry_a(&info);
352 desc->b = LDT_entry_b(&info);
353 }
354
355 return 0;
356}
357
358long arch_ptrace(struct task_struct *child, long request, long addr, long data)
359{
360 struct user * dummy = NULL;
361 int i, ret;
362 unsigned long __user *datap = (unsigned long __user *)data;
363
364 switch (request) {
365 /* when I and D space are separate, these will need to be fixed. */
366 case PTRACE_PEEKTEXT: /* read word at location addr. */
367 case PTRACE_PEEKDATA:
368 ret = generic_ptrace_peekdata(child, addr, data);
369 break;
370
371 /* read the word at location addr in the USER area. */
372 case PTRACE_PEEKUSR: {
373 unsigned long tmp;
374
375 ret = -EIO;
376 if ((addr & 3) || addr < 0 ||
377 addr > sizeof(struct user) - 3)
378 break;
379
380 tmp = 0; /* Default return condition */
381 if(addr < FRAME_SIZE*sizeof(long))
382 tmp = getreg(child, addr);
383 if(addr >= (long) &dummy->u_debugreg[0] &&
384 addr <= (long) &dummy->u_debugreg[7]){
385 addr -= (long) &dummy->u_debugreg[0];
386 addr = addr >> 2;
387 tmp = child->thread.debugreg[addr];
388 }
389 ret = put_user(tmp, datap);
390 break;
391 }
392
393 /* when I and D space are separate, this will have to be fixed. */
394 case PTRACE_POKETEXT: /* write the word at location addr. */
395 case PTRACE_POKEDATA:
396 ret = generic_ptrace_pokedata(child, addr, data);
397 break;
398
399 case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
400 ret = -EIO;
401 if ((addr & 3) || addr < 0 ||
402 addr > sizeof(struct user) - 3)
403 break;
404
405 if (addr < FRAME_SIZE*sizeof(long)) {
406 ret = putreg(child, addr, data);
407 break;
408 }
409 /* We need to be very careful here. We implicitly
410 want to modify a portion of the task_struct, and we
411 have to be selective about what portions we allow someone
412 to modify. */
413
414 ret = -EIO;
415 if(addr >= (long) &dummy->u_debugreg[0] &&
416 addr <= (long) &dummy->u_debugreg[7]){
417
418 if(addr == (long) &dummy->u_debugreg[4]) break;
419 if(addr == (long) &dummy->u_debugreg[5]) break;
420 if(addr < (long) &dummy->u_debugreg[4] &&
421 ((unsigned long) data) >= TASK_SIZE-3) break;
422
423 /* Sanity-check data. Take one half-byte at once with
424 * check = (val >> (16 + 4*i)) & 0xf. It contains the
425 * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
426 * 2 and 3 are LENi. Given a list of invalid values,
427 * we do mask |= 1 << invalid_value, so that
428 * (mask >> check) & 1 is a correct test for invalid
429 * values.
430 *
431 * R/Wi contains the type of the breakpoint /
432 * watchpoint, LENi contains the length of the watched
433 * data in the watchpoint case.
434 *
435 * The invalid values are:
436 * - LENi == 0x10 (undefined), so mask |= 0x0f00.
437 * - R/Wi == 0x10 (break on I/O reads or writes), so
438 * mask |= 0x4444.
439 * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
440 * 0x1110.
441 *
442 * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
443 *
444 * See the Intel Manual "System Programming Guide",
445 * 15.2.4
446 *
447 * Note that LENi == 0x10 is defined on x86_64 in long
448 * mode (i.e. even for 32-bit userspace software, but
449 * 64-bit kernel), so the x86_64 mask value is 0x5554.
450 * See the AMD manual no. 24593 (AMD64 System
451 * Programming)*/
452
453 if(addr == (long) &dummy->u_debugreg[7]) {
454 data &= ~DR_CONTROL_RESERVED;
455 for(i=0; i<4; i++)
456 if ((0x5f54 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
457 goto out_tsk;
458 if (data)
459 set_tsk_thread_flag(child, TIF_DEBUG);
460 else
461 clear_tsk_thread_flag(child, TIF_DEBUG);
462 }
463 addr -= (long) &dummy->u_debugreg;
464 addr = addr >> 2;
465 child->thread.debugreg[addr] = data;
466 ret = 0;
467 }
468 break;
469
470 case PTRACE_SYSEMU: /* continue and stop at next syscall, which will not be executed */
471 case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
472 case PTRACE_CONT: /* restart after signal. */
473 ret = -EIO;
474 if (!valid_signal(data))
475 break;
476 if (request == PTRACE_SYSEMU) {
477 set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
478 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
479 } else if (request == PTRACE_SYSCALL) {
480 set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
481 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
482 } else {
483 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
484 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
485 }
486 child->exit_code = data;
487 /* make sure the single step bit is not set. */
488 clear_singlestep(child);
489 wake_up_process(child);
490 ret = 0;
491 break;
492
493/*
494 * make the child exit. Best I can do is send it a sigkill.
495 * perhaps it should be put in the status that it wants to
496 * exit.
497 */
498 case PTRACE_KILL:
499 ret = 0;
500 if (child->exit_state == EXIT_ZOMBIE) /* already dead */
501 break;
502 child->exit_code = SIGKILL;
503 /* make sure the single step bit is not set. */
504 clear_singlestep(child);
505 wake_up_process(child);
506 break;
507
508 case PTRACE_SYSEMU_SINGLESTEP: /* Same as SYSEMU, but singlestep if not syscall */
509 case PTRACE_SINGLESTEP: /* set the trap flag. */
510 ret = -EIO;
511 if (!valid_signal(data))
512 break;
513
514 if (request == PTRACE_SYSEMU_SINGLESTEP)
515 set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
516 else
517 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
518
519 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
520 set_singlestep(child);
521 child->exit_code = data;
522 /* give it a chance to run. */
523 wake_up_process(child);
524 ret = 0;
525 break;
526
527 case PTRACE_GETREGS: { /* Get all gp regs from the child. */
528 if (!access_ok(VERIFY_WRITE, datap, FRAME_SIZE*sizeof(long))) {
529 ret = -EIO;
530 break;
531 }
532 for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) {
533 __put_user(getreg(child, i), datap);
534 datap++;
535 }
536 ret = 0;
537 break;
538 }
539
540 case PTRACE_SETREGS: { /* Set all gp regs in the child. */
541 unsigned long tmp;
542 if (!access_ok(VERIFY_READ, datap, FRAME_SIZE*sizeof(long))) {
543 ret = -EIO;
544 break;
545 }
546 for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) {
547 __get_user(tmp, datap);
548 putreg(child, i, tmp);
549 datap++;
550 }
551 ret = 0;
552 break;
553 }
554
555 case PTRACE_GETFPREGS: { /* Get the child FPU state. */
556 if (!access_ok(VERIFY_WRITE, datap,
557 sizeof(struct user_i387_struct))) {
558 ret = -EIO;
559 break;
560 }
561 ret = 0;
562 if (!tsk_used_math(child))
563 init_fpu(child);
564 get_fpregs((struct user_i387_struct __user *)data, child);
565 break;
566 }
567
568 case PTRACE_SETFPREGS: { /* Set the child FPU state. */
569 if (!access_ok(VERIFY_READ, datap,
570 sizeof(struct user_i387_struct))) {
571 ret = -EIO;
572 break;
573 }
574 set_stopped_child_used_math(child);
575 set_fpregs(child, (struct user_i387_struct __user *)data);
576 ret = 0;
577 break;
578 }
579
580 case PTRACE_GETFPXREGS: { /* Get the child extended FPU state. */
581 if (!access_ok(VERIFY_WRITE, datap,
582 sizeof(struct user_fxsr_struct))) {
583 ret = -EIO;
584 break;
585 }
586 if (!tsk_used_math(child))
587 init_fpu(child);
588 ret = get_fpxregs((struct user_fxsr_struct __user *)data, child);
589 break;
590 }
591
592 case PTRACE_SETFPXREGS: { /* Set the child extended FPU state. */
593 if (!access_ok(VERIFY_READ, datap,
594 sizeof(struct user_fxsr_struct))) {
595 ret = -EIO;
596 break;
597 }
598 set_stopped_child_used_math(child);
599 ret = set_fpxregs(child, (struct user_fxsr_struct __user *)data);
600 break;
601 }
602
603 case PTRACE_GET_THREAD_AREA:
604 ret = ptrace_get_thread_area(child, addr,
605 (struct user_desc __user *) data);
606 break;
607
608 case PTRACE_SET_THREAD_AREA:
609 ret = ptrace_set_thread_area(child, addr,
610 (struct user_desc __user *) data);
611 break;
612
613 default:
614 ret = ptrace_request(child, request, addr, data);
615 break;
616 }
617 out_tsk:
618 return ret;
619}
620
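The PTRACE_SYSEMU handling above stops the child at syscall entry and skips the syscall itself, which is how user-mode tracers such as UML emulate system calls. A user-space sketch (illustrative only; it needs a kernel/arch whose entry path implements SYSEMU, as the 32-bit code in this file does, and PTRACE_SYSEMU is defined locally with the x86 value in case the libc headers lack it):

#include <stdio.h>
#include <signal.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef PTRACE_SYSEMU
#define PTRACE_SYSEMU 31	/* x86 request value */
#endif

int main(void)
{
	pid_t child = fork();

	if (child == 0) {
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		raise(SIGSTOP);
		/* This write is intercepted at syscall entry and never executed. */
		write(1, "you should not see this\n", 24);
		_exit(0);
	}

	int status;
	waitpid(child, &status, 0);

	for (int i = 0; i < 2; i++) {
		ptrace(PTRACE_SYSEMU, child, NULL, NULL);
		waitpid(child, &status, 0);
		if (WIFEXITED(status))
			break;
		puts("syscall entry intercepted; the call itself is skipped");
	}

	kill(child, SIGKILL);
	waitpid(child, NULL, 0);
	return 0;
}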
621void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
622{
623 struct siginfo info;
624
625 tsk->thread.trap_no = 1;
626 tsk->thread.error_code = error_code;
627
628 memset(&info, 0, sizeof(info));
629 info.si_signo = SIGTRAP;
630 info.si_code = TRAP_BRKPT;
631
632 /* User-mode eip? */
633 info.si_addr = user_mode_vm(regs) ? (void __user *) regs->eip : NULL;
634
635 /* Send us the fake SIGTRAP */
636 force_sig_info(SIGTRAP, &info, tsk);
637}
638
639/* notification of system call entry/exit
640 * - triggered by current->work.syscall_trace
641 */
642__attribute__((regparm(3)))
643int do_syscall_trace(struct pt_regs *regs, int entryexit)
644{
645 int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
646 /*
647 * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall
648 * interception
649 */
650 int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP);
651 int ret = 0;
652
653 /* do the secure computing check first */
654 if (!entryexit)
655 secure_computing(regs->orig_eax);
656
657 if (unlikely(current->audit_context)) {
658 if (entryexit)
659 audit_syscall_exit(AUDITSC_RESULT(regs->eax),
660 regs->eax);
661 /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
662 * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
663 * not used, entry.S will call us only on syscall exit, not
664 * entry; so when TIF_SYSCALL_AUDIT is used we must avoid
665 * calling send_sigtrap() on syscall entry.
666 *
667 * Note that when PTRACE_SYSEMU_SINGLESTEP is used,
668 * is_singlestep is false, despite its name, so we will still do
669 * the correct thing.
670 */
671 else if (is_singlestep)
672 goto out;
673 }
674
675 if (!(current->ptrace & PT_PTRACED))
676 goto out;
677
678 /* If a process stops on the 1st tracepoint with SYSCALL_TRACE
679 * and then is resumed with SYSEMU_SINGLESTEP, it will come in
680 * here. We have to check this and return */
681 if (is_sysemu && entryexit)
682 return 0;
683
684 /* Fake a debug trap */
685 if (is_singlestep)
686 send_sigtrap(current, regs, 0);
687
688 if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu)
689 goto out;
690
691 /* the 0x80 provides a way for the tracing parent to distinguish
692 between a syscall stop and SIGTRAP delivery */
693 /* Note that the debugger could change the result of test_thread_flag!*/
694 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0));
695
696 /*
697 * this isn't the same as continuing with a signal, but it will do
698 * for normal use. strace only continues with a signal if the
699 * stopping signal is not SIGTRAP. -brl
700 */
701 if (current->exit_code) {
702 send_sig(current->exit_code, current, 1);
703 current->exit_code = 0;
704 }
705 ret = is_sysemu;
706out:
707 if (unlikely(current->audit_context) && !entryexit)
708 audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_eax,
709 regs->ebx, regs->ecx, regs->edx, regs->esi);
710 if (ret == 0)
711 return 0;
712
713 regs->orig_eax = -1; /* force skip of syscall restarting */
714 if (unlikely(current->audit_context))
715 audit_syscall_exit(AUDITSC_RESULT(regs->eax), regs->eax);
716 return 1;
717}
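The SIGTRAP | 0x80 notification above is what a tracer that sets PTRACE_O_TRACESYSGOOD observes at syscall stops. A rough user-space wait loop (standard ptrace(2) usage, shown only for illustration):

#include <signal.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>

/* Sketch: distinguish syscall stops from ordinary SIGTRAP delivery. */
static void trace_syscalls(pid_t child)
{
	int status;

	ptrace(PTRACE_SETOPTIONS, child, NULL, (void *)PTRACE_O_TRACESYSGOOD);
	for (;;) {
		ptrace(PTRACE_SYSCALL, child, NULL, NULL);
		if (waitpid(child, &status, 0) == -1 || WIFEXITED(status))
			break;
		if (WIFSTOPPED(status) && WSTOPSIG(status) == (SIGTRAP | 0x80)) {
			/* syscall entry or exit stop */
		} else {
			/* some other signal; a real tracer would forward it */
		}
	}
}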
diff --git a/arch/x86/kernel/ptrace_64.c b/arch/x86/kernel/ptrace_64.c
deleted file mode 100644
index 607085f3f08a..000000000000
--- a/arch/x86/kernel/ptrace_64.c
+++ /dev/null
@@ -1,621 +0,0 @@
1/* By Ross Biro 1/23/92 */
2/*
3 * Pentium III FXSR, SSE support
4 * Gareth Hughes <gareth@valinux.com>, May 2000
5 *
6 * x86-64 port 2000-2002 Andi Kleen
7 */
8
9#include <linux/kernel.h>
10#include <linux/sched.h>
11#include <linux/mm.h>
12#include <linux/smp.h>
13#include <linux/errno.h>
14#include <linux/ptrace.h>
15#include <linux/user.h>
16#include <linux/security.h>
17#include <linux/audit.h>
18#include <linux/seccomp.h>
19#include <linux/signal.h>
20
21#include <asm/uaccess.h>
22#include <asm/pgtable.h>
23#include <asm/system.h>
24#include <asm/processor.h>
25#include <asm/i387.h>
26#include <asm/debugreg.h>
27#include <asm/ldt.h>
28#include <asm/desc.h>
29#include <asm/proto.h>
30#include <asm/ia32.h>
31
32/*
33 * does not yet catch signals sent when the child dies
34 * in exit.c or in signal.c.
35 */
36
37/*
38 * Determines which flags the user has access to [1 = access, 0 = no access].
39 * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), IOPL(12-13), IF(9).
40 * Also masks reserved bits (63-22, 15, 5, 3, 1).
41 */
42#define FLAG_MASK 0x54dd5UL
43
44/* sets the trap flag. */
45#define TRAP_FLAG 0x100UL
46
47/*
48 * eflags and offset of eflags on child stack..
49 */
50#define EFLAGS offsetof(struct pt_regs, eflags)
51#define EFL_OFFSET ((int)(EFLAGS-sizeof(struct pt_regs)))
52
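As a sanity check, FLAG_MASK above is exactly the OR of the EFLAGS bits a tracer is allowed to change (CF, PF, AF, ZF, SF, TF, DF, OF, NT, RF, AC). A small compile-time sketch, using illustrative EFL_* names rather than the kernel's X86_EFLAGS_* constants:

/* Sketch: 0x54dd5 is the OR of the user-modifiable EFLAGS bits. */
#define EFL_CF 0x00001	/* carry */
#define EFL_PF 0x00004	/* parity */
#define EFL_AF 0x00010	/* auxiliary carry */
#define EFL_ZF 0x00040	/* zero */
#define EFL_SF 0x00080	/* sign */
#define EFL_TF 0x00100	/* trap */
#define EFL_DF 0x00400	/* direction */
#define EFL_OF 0x00800	/* overflow */
#define EFL_NT 0x04000	/* nested task */
#define EFL_RF 0x10000	/* resume */
#define EFL_AC 0x40000	/* alignment check */

typedef char flag_mask_is_0x54dd5[(EFL_CF | EFL_PF | EFL_AF | EFL_ZF |
				   EFL_SF | EFL_TF | EFL_DF | EFL_OF |
				   EFL_NT | EFL_RF | EFL_AC) == 0x54dd5UL ? 1 : -1];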
53/*
54 * this routine will get a word off of the process's privileged stack.
55 * the offset is how far from the base addr as stored in the TSS.
56 * this routine assumes that all the privileged stacks are in our
57 * data space.
58 */
59static inline unsigned long get_stack_long(struct task_struct *task, int offset)
60{
61 unsigned char *stack;
62
63 stack = (unsigned char *)task->thread.rsp0;
64 stack += offset;
65 return (*((unsigned long *)stack));
66}
67
68/*
69 * this routine will put a word on the process's privileged stack.
70 * the offset is how far from the base addr as stored in the TSS.
71 * this routine assumes that all the privileged stacks are in our
72 * data space.
73 */
74static inline long put_stack_long(struct task_struct *task, int offset,
75 unsigned long data)
76{
77 unsigned char * stack;
78
79 stack = (unsigned char *) task->thread.rsp0;
80 stack += offset;
81 *(unsigned long *) stack = data;
82 return 0;
83}
84
85#define LDT_SEGMENT 4
86
87unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *regs)
88{
89 unsigned long addr, seg;
90
91 addr = regs->rip;
92 seg = regs->cs & 0xffff;
93
94 /*
95 * We'll assume that the code segments in the GDT
96 * are all zero-based. That is largely true: the
97 * TLS segments are used for data, and the PNPBIOS
98 * and APM bios ones we just ignore here.
99 */
100 if (seg & LDT_SEGMENT) {
101 u32 *desc;
102 unsigned long base;
103
104 seg &= ~7UL;
105
106 mutex_lock(&child->mm->context.lock);
107 if (unlikely((seg >> 3) >= child->mm->context.size))
108 addr = -1L; /* bogus selector, access would fault */
109 else {
110 desc = child->mm->context.ldt + seg;
111 base = ((desc[0] >> 16) |
112 ((desc[1] & 0xff) << 16) |
113 (desc[1] & 0xff000000));
114
115 /* 16-bit code segment? */
116 if (!((desc[1] >> 22) & 1))
117 addr &= 0xffff;
118 addr += base;
119 }
120 mutex_unlock(&child->mm->context.lock);
121 }
122
123 return addr;
124}
125
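The base reassembly in convert_rip_to_linear() follows the standard x86 descriptor layout: base 15:0 in the upper half of the first dword, base 23:16 in the low byte of the second dword, and base 31:24 in its top byte. The same decode as a freestanding sketch (the helper name is mine):

/* Sketch: recover the 32-bit segment base from a raw 8-byte descriptor. */
static unsigned long desc_base(const unsigned int desc[2])
{
	return (desc[0] >> 16) |		/* base 15:0  */
	       ((desc[1] & 0xffU) << 16) |	/* base 23:16 */
	       (desc[1] & 0xff000000U);		/* base 31:24 */
}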
126static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
127{
128 int i, copied;
129 unsigned char opcode[15];
130 unsigned long addr = convert_rip_to_linear(child, regs);
131
132 copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
133 for (i = 0; i < copied; i++) {
134 switch (opcode[i]) {
135 /* popf and iret */
136 case 0x9d: case 0xcf:
137 return 1;
138
139 /* CHECKME: 64 65 */
140
141 /* opcode and address size prefixes */
142 case 0x66: case 0x67:
143 continue;
144 /* irrelevant prefixes (segment overrides and repeats) */
145 case 0x26: case 0x2e:
146 case 0x36: case 0x3e:
147 case 0x64: case 0x65:
148 case 0xf2: case 0xf3:
149 continue;
150
151 case 0x40 ... 0x4f:
152 if (regs->cs != __USER_CS)
153 /* 32-bit mode: register increment */
154 return 0;
155 /* 64-bit mode: REX prefix */
156 continue;
157
158 /* CHECKME: f2, f3 */
159
160 /*
161 * pushf: NOTE! We should probably not let
162 * the user see the TF bit being set. But
163 * it's more pain than it's worth to avoid
164 * it, and a debugger could emulate this
165 * all in user space if it _really_ cares.
166 */
167 case 0x9c:
168 default:
169 return 0;
170 }
171 }
172 return 0;
173}
174
175static void set_singlestep(struct task_struct *child)
176{
177 struct pt_regs *regs = task_pt_regs(child);
178
179 /*
180 * Always set TIF_SINGLESTEP - this guarantees that
181 * we single-step system calls etc.. This will also
182 * cause us to set TF when returning to user mode.
183 */
184 set_tsk_thread_flag(child, TIF_SINGLESTEP);
185
186 /*
187 * If TF was already set, don't do anything else
188 */
189 if (regs->eflags & TRAP_FLAG)
190 return;
191
192 /* Set TF on the kernel stack.. */
193 regs->eflags |= TRAP_FLAG;
194
195 /*
196 * ..but if TF is changed by the instruction we will trace,
197 * don't mark it as being "us" that set it, so that we
198 * won't clear it by hand later.
199 */
200 if (is_setting_trap_flag(child, regs))
201 return;
202
203 child->ptrace |= PT_DTRACE;
204}
205
206static void clear_singlestep(struct task_struct *child)
207{
208 /* Always clear TIF_SINGLESTEP... */
209 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
210
211 /* But touch TF only if it was set by us.. */
212 if (child->ptrace & PT_DTRACE) {
213 struct pt_regs *regs = task_pt_regs(child);
214 regs->eflags &= ~TRAP_FLAG;
215 child->ptrace &= ~PT_DTRACE;
216 }
217}
218
219/*
220 * Called by kernel/ptrace.c when detaching..
221 *
222 * Make sure the single step bit is not set.
223 */
224void ptrace_disable(struct task_struct *child)
225{
226 clear_singlestep(child);
227}
228
229static int putreg(struct task_struct *child,
230 unsigned long regno, unsigned long value)
231{
232 unsigned long tmp;
233
234 switch (regno) {
235 case offsetof(struct user_regs_struct,fs):
236 if (value && (value & 3) != 3)
237 return -EIO;
238 child->thread.fsindex = value & 0xffff;
239 return 0;
240 case offsetof(struct user_regs_struct,gs):
241 if (value && (value & 3) != 3)
242 return -EIO;
243 child->thread.gsindex = value & 0xffff;
244 return 0;
245 case offsetof(struct user_regs_struct,ds):
246 if (value && (value & 3) != 3)
247 return -EIO;
248 child->thread.ds = value & 0xffff;
249 return 0;
250 case offsetof(struct user_regs_struct,es):
251 if (value && (value & 3) != 3)
252 return -EIO;
253 child->thread.es = value & 0xffff;
254 return 0;
255 case offsetof(struct user_regs_struct,ss):
256 if ((value & 3) != 3)
257 return -EIO;
258 value &= 0xffff;
259 return 0;
260 case offsetof(struct user_regs_struct,fs_base):
261 if (value >= TASK_SIZE_OF(child))
262 return -EIO;
263 child->thread.fs = value;
264 return 0;
265 case offsetof(struct user_regs_struct,gs_base):
266 if (value >= TASK_SIZE_OF(child))
267 return -EIO;
268 child->thread.gs = value;
269 return 0;
270 case offsetof(struct user_regs_struct, eflags):
271 value &= FLAG_MASK;
272 tmp = get_stack_long(child, EFL_OFFSET);
273 tmp &= ~FLAG_MASK;
274 value |= tmp;
275 break;
276 case offsetof(struct user_regs_struct,cs):
277 if ((value & 3) != 3)
278 return -EIO;
279 value &= 0xffff;
280 break;
281 }
282 put_stack_long(child, regno - sizeof(struct pt_regs), value);
283 return 0;
284}
285
286static unsigned long getreg(struct task_struct *child, unsigned long regno)
287{
288 unsigned long val;
289 switch (regno) {
290 case offsetof(struct user_regs_struct, fs):
291 return child->thread.fsindex;
292 case offsetof(struct user_regs_struct, gs):
293 return child->thread.gsindex;
294 case offsetof(struct user_regs_struct, ds):
295 return child->thread.ds;
296 case offsetof(struct user_regs_struct, es):
297 return child->thread.es;
298 case offsetof(struct user_regs_struct, fs_base):
299 return child->thread.fs;
300 case offsetof(struct user_regs_struct, gs_base):
301 return child->thread.gs;
302 default:
303 regno = regno - sizeof(struct pt_regs);
304 val = get_stack_long(child, regno);
305 if (test_tsk_thread_flag(child, TIF_IA32))
306 val &= 0xffffffff;
307 return val;
308 }
309
310}
311
312long arch_ptrace(struct task_struct *child, long request, long addr, long data)
313{
314 long i, ret;
315 unsigned ui;
316
317 switch (request) {
318 /* when I and D space are separate, these will need to be fixed. */
319 case PTRACE_PEEKTEXT: /* read word at location addr. */
320 case PTRACE_PEEKDATA:
321 ret = generic_ptrace_peekdata(child, addr, data);
322 break;
323
324 /* read the word at location addr in the USER area. */
325 case PTRACE_PEEKUSR: {
326 unsigned long tmp;
327
328 ret = -EIO;
329 if ((addr & 7) ||
330 addr > sizeof(struct user) - 7)
331 break;
332
333 switch (addr) {
334 case 0 ... sizeof(struct user_regs_struct) - sizeof(long):
335 tmp = getreg(child, addr);
336 break;
337 case offsetof(struct user, u_debugreg[0]):
338 tmp = child->thread.debugreg0;
339 break;
340 case offsetof(struct user, u_debugreg[1]):
341 tmp = child->thread.debugreg1;
342 break;
343 case offsetof(struct user, u_debugreg[2]):
344 tmp = child->thread.debugreg2;
345 break;
346 case offsetof(struct user, u_debugreg[3]):
347 tmp = child->thread.debugreg3;
348 break;
349 case offsetof(struct user, u_debugreg[6]):
350 tmp = child->thread.debugreg6;
351 break;
352 case offsetof(struct user, u_debugreg[7]):
353 tmp = child->thread.debugreg7;
354 break;
355 default:
356 tmp = 0;
357 break;
358 }
359 ret = put_user(tmp,(unsigned long __user *) data);
360 break;
361 }
362
363 /* when I and D space are separate, this will have to be fixed. */
364 case PTRACE_POKETEXT: /* write the word at location addr. */
365 case PTRACE_POKEDATA:
366 ret = generic_ptrace_pokedata(child, addr, data);
367 break;
368
369 case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
370 {
371 int dsize = test_tsk_thread_flag(child, TIF_IA32) ? 3 : 7;
372 ret = -EIO;
373 if ((addr & 7) ||
374 addr > sizeof(struct user) - 7)
375 break;
376
377 switch (addr) {
378 case 0 ... sizeof(struct user_regs_struct) - sizeof(long):
379 ret = putreg(child, addr, data);
380 break;
381 /* Disallow setting a breakpoint in the vsyscall area */
382 case offsetof(struct user, u_debugreg[0]):
383 if (data >= TASK_SIZE_OF(child) - dsize) break;
384 child->thread.debugreg0 = data;
385 ret = 0;
386 break;
387 case offsetof(struct user, u_debugreg[1]):
388 if (data >= TASK_SIZE_OF(child) - dsize) break;
389 child->thread.debugreg1 = data;
390 ret = 0;
391 break;
392 case offsetof(struct user, u_debugreg[2]):
393 if (data >= TASK_SIZE_OF(child) - dsize) break;
394 child->thread.debugreg2 = data;
395 ret = 0;
396 break;
397 case offsetof(struct user, u_debugreg[3]):
398 if (data >= TASK_SIZE_OF(child) - dsize) break;
399 child->thread.debugreg3 = data;
400 ret = 0;
401 break;
402 case offsetof(struct user, u_debugreg[6]):
403 if (data >> 32)
404 break;
405 child->thread.debugreg6 = data;
406 ret = 0;
407 break;
408 case offsetof(struct user, u_debugreg[7]):
409 /* See arch/i386/kernel/ptrace.c for an explanation of
410 * this awkward check.*/
411 data &= ~DR_CONTROL_RESERVED;
412 for(i=0; i<4; i++)
413 if ((0x5554 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
414 break;
415 if (i == 4) {
416 child->thread.debugreg7 = data;
417 if (data)
418 set_tsk_thread_flag(child, TIF_DEBUG);
419 else
420 clear_tsk_thread_flag(child, TIF_DEBUG);
421 ret = 0;
422 }
423 break;
424 }
425 break;
426 }
427 case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
428 case PTRACE_CONT: /* restart after signal. */
429
430 ret = -EIO;
431 if (!valid_signal(data))
432 break;
433 if (request == PTRACE_SYSCALL)
434 set_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
435 else
436 clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
437 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
438 child->exit_code = data;
439 /* make sure the single step bit is not set. */
440 clear_singlestep(child);
441 wake_up_process(child);
442 ret = 0;
443 break;
444
445#ifdef CONFIG_IA32_EMULATION
446 /* This only makes sense with 32-bit programs. Allow a
447 64-bit debugger to fully examine them too. It is better
448 not to use it against 64-bit processes; use
449 PTRACE_ARCH_PRCTL instead. */
450 case PTRACE_SET_THREAD_AREA: {
451 struct user_desc __user *p;
452 int old;
453 p = (struct user_desc __user *)data;
454 get_user(old, &p->entry_number);
455 put_user(addr, &p->entry_number);
456 ret = do_set_thread_area(&child->thread, p);
457 put_user(old, &p->entry_number);
458 break;
459 case PTRACE_GET_THREAD_AREA:
460 p = (struct user_desc __user *)data;
461 get_user(old, &p->entry_number);
462 put_user(addr, &p->entry_number);
463 ret = do_get_thread_area(&child->thread, p);
464 put_user(old, &p->entry_number);
465 break;
466 }
467#endif
468 /* Normal 64-bit interface to access TLS data.
469 Works just like arch_prctl, except that the arguments
470 are reversed. */
471 case PTRACE_ARCH_PRCTL:
472 ret = do_arch_prctl(child, data, addr);
473 break;
474
475/*
476 * make the child exit. Best I can do is send it a sigkill.
477 * perhaps it should be put in the status that it wants to
478 * exit.
479 */
480 case PTRACE_KILL:
481 ret = 0;
482 if (child->exit_state == EXIT_ZOMBIE) /* already dead */
483 break;
484 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
485 child->exit_code = SIGKILL;
486 /* make sure the single step bit is not set. */
487 clear_singlestep(child);
488 wake_up_process(child);
489 break;
490
491 case PTRACE_SINGLESTEP: /* set the trap flag. */
492 ret = -EIO;
493 if (!valid_signal(data))
494 break;
495 clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
496 set_singlestep(child);
497 child->exit_code = data;
498 /* give it a chance to run. */
499 wake_up_process(child);
500 ret = 0;
501 break;
502
503 case PTRACE_GETREGS: { /* Get all gp regs from the child. */
504 if (!access_ok(VERIFY_WRITE, (unsigned __user *)data,
505 sizeof(struct user_regs_struct))) {
506 ret = -EIO;
507 break;
508 }
509 ret = 0;
510 for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
511 ret |= __put_user(getreg(child, ui),(unsigned long __user *) data);
512 data += sizeof(long);
513 }
514 break;
515 }
516
517 case PTRACE_SETREGS: { /* Set all gp regs in the child. */
518 unsigned long tmp;
519 if (!access_ok(VERIFY_READ, (unsigned __user *)data,
520 sizeof(struct user_regs_struct))) {
521 ret = -EIO;
522 break;
523 }
524 ret = 0;
525 for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
526 ret = __get_user(tmp, (unsigned long __user *) data);
527 if (ret)
528 break;
529 ret = putreg(child, ui, tmp);
530 if (ret)
531 break;
532 data += sizeof(long);
533 }
534 break;
535 }
536
537 case PTRACE_GETFPREGS: { /* Get the child extended FPU state. */
538 if (!access_ok(VERIFY_WRITE, (unsigned __user *)data,
539 sizeof(struct user_i387_struct))) {
540 ret = -EIO;
541 break;
542 }
543 ret = get_fpregs((struct user_i387_struct __user *)data, child);
544 break;
545 }
546
547 case PTRACE_SETFPREGS: { /* Set the child extended FPU state. */
548 if (!access_ok(VERIFY_READ, (unsigned __user *)data,
549 sizeof(struct user_i387_struct))) {
550 ret = -EIO;
551 break;
552 }
553 set_stopped_child_used_math(child);
554 ret = set_fpregs(child, (struct user_i387_struct __user *)data);
555 break;
556 }
557
558 default:
559 ret = ptrace_request(child, request, addr, data);
560 break;
561 }
562 return ret;
563}
564
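For reference, the 0x5554 test in the u_debugreg[7] case above encodes which per-breakpoint type/length nibbles DR7 may not contain: I/O breakpoints (R/W = 2) and execute breakpoints (R/W = 0) with a non-zero length. The same predicate as a freestanding sketch (naming is mine, not the kernel's):

/* Sketch: accept a DR7 value only if every type/length field is allowed. */
static int dr7_fields_ok(unsigned long dr7)
{
	int i;

	for (i = 0; i < 4; i++) {
		unsigned int field = (dr7 >> (16 + 4 * i)) & 0xf; /* LENi:R/Wi */

		/* a set bit in 0x5554 marks a disallowed combination */
		if ((0x5554 >> field) & 1)
			return 0;
	}
	return 1;
}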
565static void syscall_trace(struct pt_regs *regs)
566{
567
568#if 0
569 printk("trace %s rip %lx rsp %lx rax %d origrax %d caller %lx tiflags %x ptrace %x\n",
570 current->comm,
571 regs->rip, regs->rsp, regs->rax, regs->orig_rax, __builtin_return_address(0),
572 current_thread_info()->flags, current->ptrace);
573#endif
574
575 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
576 ? 0x80 : 0));
577 /*
578 * this isn't the same as continuing with a signal, but it will do
579 * for normal use. strace only continues with a signal if the
580 * stopping signal is not SIGTRAP. -brl
581 */
582 if (current->exit_code) {
583 send_sig(current->exit_code, current, 1);
584 current->exit_code = 0;
585 }
586}
587
588asmlinkage void syscall_trace_enter(struct pt_regs *regs)
589{
590 /* do the secure computing check first */
591 secure_computing(regs->orig_rax);
592
593 if (test_thread_flag(TIF_SYSCALL_TRACE)
594 && (current->ptrace & PT_PTRACED))
595 syscall_trace(regs);
596
597 if (unlikely(current->audit_context)) {
598 if (test_thread_flag(TIF_IA32)) {
599 audit_syscall_entry(AUDIT_ARCH_I386,
600 regs->orig_rax,
601 regs->rbx, regs->rcx,
602 regs->rdx, regs->rsi);
603 } else {
604 audit_syscall_entry(AUDIT_ARCH_X86_64,
605 regs->orig_rax,
606 regs->rdi, regs->rsi,
607 regs->rdx, regs->r10);
608 }
609 }
610}
611
612asmlinkage void syscall_trace_leave(struct pt_regs *regs)
613{
614 if (unlikely(current->audit_context))
615 audit_syscall_exit(AUDITSC_RESULT(regs->rax), regs->rax);
616
617 if ((test_thread_flag(TIF_SYSCALL_TRACE)
618 || test_thread_flag(TIF_SINGLESTEP))
619 && (current->ptrace & PT_PTRACED))
620 syscall_trace(regs);
621}
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index fab30e134836..150ba29a0d33 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -162,6 +162,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31,
162 ich_force_enable_hpet); 162 ich_force_enable_hpet);
163DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1, 163DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1,
164 ich_force_enable_hpet); 164 ich_force_enable_hpet);
165DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
166 ich_force_enable_hpet);
165 167
166 168
167static struct pci_dev *cached_dev; 169static struct pci_dev *cached_dev;
diff --git a/arch/x86/kernel/reboot_32.c b/arch/x86/kernel/reboot.c
index bb1a0f889c5e..5818dc28167d 100644
--- a/arch/x86/kernel/reboot_32.c
+++ b/arch/x86/kernel/reboot.c
@@ -1,64 +1,94 @@
1#include <linux/mm.h>
2#include <linux/module.h> 1#include <linux/module.h>
3#include <linux/delay.h>
4#include <linux/init.h> 2#include <linux/init.h>
5#include <linux/interrupt.h>
6#include <linux/mc146818rtc.h>
7#include <linux/efi.h>
8#include <linux/dmi.h>
9#include <linux/ctype.h>
10#include <linux/pm.h>
11#include <linux/reboot.h> 3#include <linux/reboot.h>
12#include <asm/uaccess.h> 4#include <linux/init.h>
5#include <linux/pm.h>
6#include <linux/efi.h>
7#include <acpi/reboot.h>
8#include <asm/io.h>
13#include <asm/apic.h> 9#include <asm/apic.h>
14#include <asm/hpet.h>
15#include <asm/desc.h> 10#include <asm/desc.h>
16#include "mach_reboot.h" 11#include <asm/hpet.h>
17#include <asm/reboot_fixups.h> 12#include <asm/reboot_fixups.h>
18#include <asm/reboot.h> 13#include <asm/reboot.h>
19 14
15#ifdef CONFIG_X86_32
16# include <linux/dmi.h>
17# include <linux/ctype.h>
18# include <linux/mc146818rtc.h>
19# include <asm/pgtable.h>
20#else
21# include <asm/iommu.h>
22#endif
23
20/* 24/*
21 * Power off function, if any 25 * Power off function, if any
22 */ 26 */
23void (*pm_power_off)(void); 27void (*pm_power_off)(void);
24EXPORT_SYMBOL(pm_power_off); 28EXPORT_SYMBOL(pm_power_off);
25 29
30static long no_idt[3];
26static int reboot_mode; 31static int reboot_mode;
27static int reboot_thru_bios; 32enum reboot_type reboot_type = BOOT_KBD;
33int reboot_force;
28 34
29#ifdef CONFIG_SMP 35#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
30static int reboot_cpu = -1; 36static int reboot_cpu = -1;
31#endif 37#endif
38
39/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old]
40 warm Don't set the cold reboot flag
41 cold Set the cold reboot flag
42 bios Reboot by jumping through the BIOS (only for X86_32)
43 smp Reboot by executing reset on BSP or other CPU (only for X86_32)
44 triple Force a triple fault (init)
45 kbd Use the keyboard controller. cold reset (default)
46 acpi Use the RESET_REG in the FADT
47 efi Use efi reset_system runtime service
48 force Avoid anything that could hang.
49 */
32static int __init reboot_setup(char *str) 50static int __init reboot_setup(char *str)
33{ 51{
34 while(1) { 52 for (;;) {
35 switch (*str) { 53 switch (*str) {
36 case 'w': /* "warm" reboot (no memory testing etc) */ 54 case 'w':
37 reboot_mode = 0x1234; 55 reboot_mode = 0x1234;
38 break; 56 break;
39 case 'c': /* "cold" reboot (with memory testing etc) */ 57
40 reboot_mode = 0x0; 58 case 'c':
41 break; 59 reboot_mode = 0;
42 case 'b': /* "bios" reboot by jumping through the BIOS */
43 reboot_thru_bios = 1;
44 break;
45 case 'h': /* "hard" reboot by toggling RESET and/or crashing the CPU */
46 reboot_thru_bios = 0;
47 break; 60 break;
61
62#ifdef CONFIG_X86_32
48#ifdef CONFIG_SMP 63#ifdef CONFIG_SMP
49 case 's': /* "smp" reboot by executing reset on BSP or other CPU*/ 64 case 's':
50 if (isdigit(*(str+1))) { 65 if (isdigit(*(str+1))) {
51 reboot_cpu = (int) (*(str+1) - '0'); 66 reboot_cpu = (int) (*(str+1) - '0');
52 if (isdigit(*(str+2))) 67 if (isdigit(*(str+2)))
53 reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0'); 68 reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0');
54 } 69 }
55 /* we will leave sorting out the final value 70 /* we will leave sorting out the final value
56 when we are ready to reboot, since we might not 71 when we are ready to reboot, since we might not
57 have set up boot_cpu_id or smp_num_cpu */ 72 have set up boot_cpu_id or smp_num_cpu */
58 break; 73 break;
74#endif /* CONFIG_SMP */
75
76 case 'b':
59#endif 77#endif
78 case 'a':
79 case 'k':
80 case 't':
81 case 'e':
82 reboot_type = *str;
83 break;
84
85 case 'f':
86 reboot_force = 1;
87 break;
60 } 88 }
61 if((str = strchr(str,',')) != NULL) 89
90 str = strchr(str, ',');
91 if (str)
62 str++; 92 str++;
63 else 93 else
64 break; 94 break;
@@ -68,18 +98,21 @@ static int __init reboot_setup(char *str)
68 98
69__setup("reboot=", reboot_setup); 99__setup("reboot=", reboot_setup);
70 100
101
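Since reboot_setup() above only looks at the first character of each comma-separated token, a command line such as reboot=acpi,force (equivalently reboot=a,f) selects the ACPI reset method and, via reboot_force, skips the orderly shutdown path before rebooting.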
102#ifdef CONFIG_X86_32
71/* 103/*
72 * Reboot options and system auto-detection code provided by 104 * Reboot options and system auto-detection code provided by
73 * Dell Inc. so their systems "just work". :-) 105 * Dell Inc. so their systems "just work". :-)
74 */ 106 */
75 107
76/* 108/*
77 * Some machines require the "reboot=b" commandline option, this quirk makes that automatic. 109 * Some machines require the "reboot=b" commandline option,
110 * this quirk makes that automatic.
78 */ 111 */
79static int __init set_bios_reboot(const struct dmi_system_id *d) 112static int __init set_bios_reboot(const struct dmi_system_id *d)
80{ 113{
81 if (!reboot_thru_bios) { 114 if (reboot_type != BOOT_BIOS) {
82 reboot_thru_bios = 1; 115 reboot_type = BOOT_BIOS;
83 printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident); 116 printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident);
84 } 117 }
85 return 0; 118 return 0;
@@ -143,7 +176,6 @@ static int __init reboot_init(void)
143 dmi_check_system(reboot_dmi_table); 176 dmi_check_system(reboot_dmi_table);
144 return 0; 177 return 0;
145} 178}
146
147core_initcall(reboot_init); 179core_initcall(reboot_init);
148 180
149/* The following code and data reboots the machine by switching to real 181/* The following code and data reboots the machine by switching to real
@@ -152,7 +184,6 @@ core_initcall(reboot_init);
152 controller to pulse the CPU reset line, which is more thorough, but 184 controller to pulse the CPU reset line, which is more thorough, but
153 doesn't work with at least one type of 486 motherboard. It is easy 185 doesn't work with at least one type of 486 motherboard. It is easy
154 to stop this code working; hence the copious comments. */ 186 to stop this code working; hence the copious comments. */
155
156static unsigned long long 187static unsigned long long
157real_mode_gdt_entries [3] = 188real_mode_gdt_entries [3] =
158{ 189{
@@ -161,11 +192,9 @@ real_mode_gdt_entries [3] =
161 0x000092000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */ 192 0x000092000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */
162}; 193};
163 194
164static struct Xgt_desc_struct 195static struct desc_ptr
165real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries }, 196real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries },
166real_mode_idt = { 0x3ff, 0 }, 197real_mode_idt = { 0x3ff, 0 };
167no_idt = { 0, 0 };
168
169 198
170/* This is 16-bit protected mode code to disable paging and the cache, 199/* This is 16-bit protected mode code to disable paging and the cache,
171 switch to real mode and jump to the BIOS reset code. 200 switch to real mode and jump to the BIOS reset code.
@@ -185,7 +214,6 @@ no_idt = { 0, 0 };
185 214
186 More could be done here to set up the registers as if a CPU reset had 215 More could be done here to set up the registers as if a CPU reset had
187 occurred; hopefully real BIOSs don't assume much. */ 216 occurred; hopefully real BIOSs don't assume much. */
188
189static unsigned char real_mode_switch [] = 217static unsigned char real_mode_switch [] =
190{ 218{
191 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */ 219 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */
@@ -223,7 +251,6 @@ void machine_real_restart(unsigned char *code, int length)
223 `outb_p' is needed instead of just `outb'. Use it to be on the 251 `outb_p' is needed instead of just `outb'. Use it to be on the
224 safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.) 252 safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.)
225 */ 253 */
226
227 spin_lock(&rtc_lock); 254 spin_lock(&rtc_lock);
228 CMOS_WRITE(0x00, 0x8f); 255 CMOS_WRITE(0x00, 0x8f);
229 spin_unlock(&rtc_lock); 256 spin_unlock(&rtc_lock);
@@ -231,9 +258,8 @@ void machine_real_restart(unsigned char *code, int length)
231 /* Remap the kernel at virtual address zero, as well as offset zero 258 /* Remap the kernel at virtual address zero, as well as offset zero
232 from the kernel segment. This assumes the kernel segment starts at 259 from the kernel segment. This assumes the kernel segment starts at
233 virtual address PAGE_OFFSET. */ 260 virtual address PAGE_OFFSET. */
234 261 memcpy(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
235 memcpy (swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, 262 sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS);
236 sizeof (swapper_pg_dir [0]) * KERNEL_PGD_PTRS);
237 263
238 /* 264 /*
239 * Use `swapper_pg_dir' as our page directory. 265 * Use `swapper_pg_dir' as our page directory.
@@ -245,7 +271,6 @@ void machine_real_restart(unsigned char *code, int length)
245 boot)". This seems like a fairly standard thing that gets set by 271 boot)". This seems like a fairly standard thing that gets set by
246 REBOOT.COM programs, and the previous reset routine did this 272 REBOOT.COM programs, and the previous reset routine did this
247 too. */ 273 too. */
248
249 *((unsigned short *)0x472) = reboot_mode; 274 *((unsigned short *)0x472) = reboot_mode;
250 275
251 /* For the switch to real mode, copy some code to low memory. It has 276 /* For the switch to real mode, copy some code to low memory. It has
@@ -253,19 +278,16 @@ void machine_real_restart(unsigned char *code, int length)
253 has to have the same physical and virtual address, because it turns 278 has to have the same physical and virtual address, because it turns
254 off paging. Copy it near the end of the first page, out of the way 279 off paging. Copy it near the end of the first page, out of the way
255 of BIOS variables. */ 280 of BIOS variables. */
256 281 memcpy((void *)(0x1000 - sizeof(real_mode_switch) - 100),
257 memcpy ((void *) (0x1000 - sizeof (real_mode_switch) - 100),
258 real_mode_switch, sizeof (real_mode_switch)); 282 real_mode_switch, sizeof (real_mode_switch));
259 memcpy ((void *) (0x1000 - 100), code, length); 283 memcpy((void *)(0x1000 - 100), code, length);
260 284
261 /* Set up the IDT for real mode. */ 285 /* Set up the IDT for real mode. */
262
263 load_idt(&real_mode_idt); 286 load_idt(&real_mode_idt);
264 287
265 /* Set up a GDT from which we can load segment descriptors for real 288 /* Set up a GDT from which we can load segment descriptors for real
266 mode. The GDT is not used in real mode; it is just needed here to 289 mode. The GDT is not used in real mode; it is just needed here to
267 prepare the descriptors. */ 290 prepare the descriptors. */
268
269 load_gdt(&real_mode_gdt); 291 load_gdt(&real_mode_gdt);
270 292
271 /* Load the data segment registers, and thus the descriptors ready for 293 /* Load the data segment registers, and thus the descriptors ready for
@@ -273,7 +295,6 @@ void machine_real_restart(unsigned char *code, int length)
273 selector value being loaded here. This is so that the segment 295 selector value being loaded here. This is so that the segment
274 registers don't have to be reloaded after switching to real mode: 296 registers don't have to be reloaded after switching to real mode:
275 the values are consistent for real mode operation already. */ 297 the values are consistent for real mode operation already. */
276
277 __asm__ __volatile__ ("movl $0x0010,%%eax\n" 298 __asm__ __volatile__ ("movl $0x0010,%%eax\n"
278 "\tmovl %%eax,%%ds\n" 299 "\tmovl %%eax,%%ds\n"
279 "\tmovl %%eax,%%es\n" 300 "\tmovl %%eax,%%es\n"
@@ -284,130 +305,147 @@ void machine_real_restart(unsigned char *code, int length)
284 /* Jump to the 16-bit code that we copied earlier. It disables paging 305 /* Jump to the 16-bit code that we copied earlier. It disables paging
285 and the cache, switches to real mode, and jumps to the BIOS reset 306 and the cache, switches to real mode, and jumps to the BIOS reset
286 entry point. */ 307 entry point. */
287
288 __asm__ __volatile__ ("ljmp $0x0008,%0" 308 __asm__ __volatile__ ("ljmp $0x0008,%0"
289 : 309 :
290 : "i" ((void *) (0x1000 - sizeof (real_mode_switch) - 100))); 310 : "i" ((void *)(0x1000 - sizeof (real_mode_switch) - 100)));
291} 311}
292#ifdef CONFIG_APM_MODULE 312#ifdef CONFIG_APM_MODULE
293EXPORT_SYMBOL(machine_real_restart); 313EXPORT_SYMBOL(machine_real_restart);
294#endif 314#endif
295 315
296static void native_machine_shutdown(void) 316#endif /* CONFIG_X86_32 */
317
318static inline void kb_wait(void)
319{
320 int i;
321
322 for (i = 0; i < 0x10000; i++) {
323 if ((inb(0x64) & 0x02) == 0)
324 break;
325 udelay(2);
326 }
327}
328
329void machine_emergency_restart(void)
330{
331 int i;
332
333 /* Tell the BIOS if we want cold or warm reboot */
334 *((unsigned short *)__va(0x472)) = reboot_mode;
335
336 for (;;) {
337 /* Could also try the reset bit in the Hammer NB */
338 switch (reboot_type) {
339 case BOOT_KBD:
340 for (i = 0; i < 10; i++) {
341 kb_wait();
342 udelay(50);
343 outb(0xfe, 0x64); /* pulse reset low */
344 udelay(50);
345 }
346
347 case BOOT_TRIPLE:
348 load_idt((const struct desc_ptr *)&no_idt);
349 __asm__ __volatile__("int3");
350
351 reboot_type = BOOT_KBD;
352 break;
353
354#ifdef CONFIG_X86_32
355 case BOOT_BIOS:
356 machine_real_restart(jump_to_bios, sizeof(jump_to_bios));
357
358 reboot_type = BOOT_KBD;
359 break;
360#endif
361
362 case BOOT_ACPI:
363 acpi_reboot();
364 reboot_type = BOOT_KBD;
365 break;
366
367
368 case BOOT_EFI:
369 if (efi_enabled)
370 efi.reset_system(reboot_mode ? EFI_RESET_WARM : EFI_RESET_COLD,
371 EFI_SUCCESS, 0, NULL);
372
373 reboot_type = BOOT_KBD;
374 break;
375 }
376 }
377}
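Note that the BOOT_KBD case above has no break: if ten pulses of the keyboard-controller reset line fail to reset the machine, control falls through to BOOT_TRIPLE and a triple fault is attempted; each fallback then sets reboot_type back to BOOT_KBD, so the surrounding for (;;) keeps cycling through the methods until one of them takes effect.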
378
379void machine_shutdown(void)
297{ 380{
381 /* Stop the cpus and apics */
298#ifdef CONFIG_SMP 382#ifdef CONFIG_SMP
299 int reboot_cpu_id; 383 int reboot_cpu_id;
300 384
301 /* The boot cpu is always logical cpu 0 */ 385 /* The boot cpu is always logical cpu 0 */
302 reboot_cpu_id = 0; 386 reboot_cpu_id = 0;
303 387
388#ifdef CONFIG_X86_32
304 /* See if there has been given a command line override */ 389 /* See if there has been given a command line override */
305 if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) && 390 if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) &&
306 cpu_isset(reboot_cpu, cpu_online_map)) { 391 cpu_isset(reboot_cpu, cpu_online_map))
307 reboot_cpu_id = reboot_cpu; 392 reboot_cpu_id = reboot_cpu;
308 } 393#endif
309 394
310 /* Make certain the cpu I'm rebooting on is online */ 395 /* Make certain the cpu I'm about to reboot on is online */
311 if (!cpu_isset(reboot_cpu_id, cpu_online_map)) { 396 if (!cpu_isset(reboot_cpu_id, cpu_online_map))
312 reboot_cpu_id = smp_processor_id(); 397 reboot_cpu_id = smp_processor_id();
313 }
314 398
315 /* Make certain I only run on the appropriate processor */ 399 /* Make certain I only run on the appropriate processor */
316 set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id)); 400 set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));
317 401
318 /* O.K. Now that I'm on the appropriate processor, stop 402 /* O.K Now that I'm on the appropriate processor,
319 * all of the others, and disable their local APICs. 403 * stop all of the others.
320 */ 404 */
321
322 smp_send_stop(); 405 smp_send_stop();
323#endif /* CONFIG_SMP */ 406#endif
324 407
325 lapic_shutdown(); 408 lapic_shutdown();
326 409
327#ifdef CONFIG_X86_IO_APIC 410#ifdef CONFIG_X86_IO_APIC
328 disable_IO_APIC(); 411 disable_IO_APIC();
329#endif 412#endif
413
330#ifdef CONFIG_HPET_TIMER 414#ifdef CONFIG_HPET_TIMER
331 hpet_disable(); 415 hpet_disable();
332#endif 416#endif
333}
334 417
335void __attribute__((weak)) mach_reboot_fixups(void) 418#ifdef CONFIG_X86_64
336{ 419 pci_iommu_shutdown();
420#endif
337} 421}
338 422
339static void native_machine_emergency_restart(void) 423void machine_restart(char *__unused)
340{ 424{
341 if (!reboot_thru_bios) { 425 printk("machine restart\n");
342 if (efi_enabled) {
343 efi.reset_system(EFI_RESET_COLD, EFI_SUCCESS, 0, NULL);
344 load_idt(&no_idt);
345 __asm__ __volatile__("int3");
346 }
347 /* rebooting needs to touch the page at absolute addr 0 */
348 *((unsigned short *)__va(0x472)) = reboot_mode;
349 for (;;) {
350 mach_reboot_fixups(); /* for board specific fixups */
351 mach_reboot();
352 /* That didn't work - force a triple fault.. */
353 load_idt(&no_idt);
354 __asm__ __volatile__("int3");
355 }
356 }
357 if (efi_enabled)
358 efi.reset_system(EFI_RESET_WARM, EFI_SUCCESS, 0, NULL);
359 426
360 machine_real_restart(jump_to_bios, sizeof(jump_to_bios)); 427 if (!reboot_force)
361} 428 machine_shutdown();
362
363static void native_machine_restart(char * __unused)
364{
365 machine_shutdown();
366 machine_emergency_restart(); 429 machine_emergency_restart();
367} 430}
368 431
369static void native_machine_halt(void) 432void machine_halt(void)
370{ 433{
371} 434}
372 435
373static void native_machine_power_off(void) 436void machine_power_off(void)
374{ 437{
375 if (pm_power_off) { 438 if (pm_power_off) {
376 machine_shutdown(); 439 if (!reboot_force)
440 machine_shutdown();
377 pm_power_off(); 441 pm_power_off();
378 } 442 }
379} 443}
380 444
381
382struct machine_ops machine_ops = { 445struct machine_ops machine_ops = {
383 .power_off = native_machine_power_off, 446 .power_off = machine_power_off,
384 .shutdown = native_machine_shutdown, 447 .shutdown = machine_shutdown,
385 .emergency_restart = native_machine_emergency_restart, 448 .emergency_restart = machine_emergency_restart,
386 .restart = native_machine_restart, 449 .restart = machine_restart,
387 .halt = native_machine_halt, 450 .halt = machine_halt
388}; 451};
389
390void machine_power_off(void)
391{
392 machine_ops.power_off();
393}
394
395void machine_shutdown(void)
396{
397 machine_ops.shutdown();
398}
399
400void machine_emergency_restart(void)
401{
402 machine_ops.emergency_restart();
403}
404
405void machine_restart(char *cmd)
406{
407 machine_ops.restart(cmd);
408}
409
410void machine_halt(void)
411{
412 machine_ops.halt();
413}
diff --git a/arch/x86/kernel/reboot_64.c b/arch/x86/kernel/reboot_64.c
deleted file mode 100644
index 53620a92a8fd..000000000000
--- a/arch/x86/kernel/reboot_64.c
+++ /dev/null
@@ -1,176 +0,0 @@
1/* Various gunk just to reboot the machine. */
2#include <linux/module.h>
3#include <linux/reboot.h>
4#include <linux/init.h>
5#include <linux/smp.h>
6#include <linux/kernel.h>
7#include <linux/ctype.h>
8#include <linux/string.h>
9#include <linux/pm.h>
10#include <linux/kdebug.h>
11#include <linux/sched.h>
12#include <asm/io.h>
13#include <asm/delay.h>
14#include <asm/desc.h>
15#include <asm/hw_irq.h>
16#include <asm/system.h>
17#include <asm/pgtable.h>
18#include <asm/tlbflush.h>
19#include <asm/apic.h>
20#include <asm/hpet.h>
21#include <asm/gart.h>
22
23/*
24 * Power off function, if any
25 */
26void (*pm_power_off)(void);
27EXPORT_SYMBOL(pm_power_off);
28
29static long no_idt[3];
30static enum {
31 BOOT_TRIPLE = 't',
32 BOOT_KBD = 'k'
33} reboot_type = BOOT_KBD;
34static int reboot_mode = 0;
35int reboot_force;
36
37/* reboot=t[riple] | k[bd] [, [w]arm | [c]old]
38 warm Don't set the cold reboot flag
39 cold Set the cold reboot flag
40 triple Force a triple fault (init)
41 kbd Use the keyboard controller. cold reset (default)
42 force Avoid anything that could hang.
43 */
44static int __init reboot_setup(char *str)
45{
46 for (;;) {
47 switch (*str) {
48 case 'w':
49 reboot_mode = 0x1234;
50 break;
51
52 case 'c':
53 reboot_mode = 0;
54 break;
55
56 case 't':
57 case 'b':
58 case 'k':
59 reboot_type = *str;
60 break;
61 case 'f':
62 reboot_force = 1;
63 break;
64 }
65 if((str = strchr(str,',')) != NULL)
66 str++;
67 else
68 break;
69 }
70 return 1;
71}
72
73__setup("reboot=", reboot_setup);
74
75static inline void kb_wait(void)
76{
77 int i;
78
79 for (i=0; i<0x10000; i++)
80 if ((inb_p(0x64) & 0x02) == 0)
81 break;
82}
83
84void machine_shutdown(void)
85{
86 unsigned long flags;
87
88 /* Stop the cpus and apics */
89#ifdef CONFIG_SMP
90 int reboot_cpu_id;
91
92 /* The boot cpu is always logical cpu 0 */
93 reboot_cpu_id = 0;
94
95 /* Make certain the cpu I'm about to reboot on is online */
96 if (!cpu_isset(reboot_cpu_id, cpu_online_map)) {
97 reboot_cpu_id = smp_processor_id();
98 }
99
100 /* Make certain I only run on the appropriate processor */
101 set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));
102
103 /* O.K Now that I'm on the appropriate processor,
104 * stop all of the others.
105 */
106 smp_send_stop();
107#endif
108
109 local_irq_save(flags);
110
111#ifndef CONFIG_SMP
112 disable_local_APIC();
113#endif
114
115 disable_IO_APIC();
116
117#ifdef CONFIG_HPET_TIMER
118 hpet_disable();
119#endif
120 local_irq_restore(flags);
121
122 pci_iommu_shutdown();
123}
124
125void machine_emergency_restart(void)
126{
127 int i;
128
129 /* Tell the BIOS if we want cold or warm reboot */
130 *((unsigned short *)__va(0x472)) = reboot_mode;
131
132 for (;;) {
133 /* Could also try the reset bit in the Hammer NB */
134 switch (reboot_type) {
135 case BOOT_KBD:
136 for (i=0; i<10; i++) {
137 kb_wait();
138 udelay(50);
139 outb(0xfe,0x64); /* pulse reset low */
140 udelay(50);
141 }
142
143 case BOOT_TRIPLE:
144 load_idt((const struct desc_ptr *)&no_idt);
145 __asm__ __volatile__("int3");
146
147 reboot_type = BOOT_KBD;
148 break;
149 }
150 }
151}
152
153void machine_restart(char * __unused)
154{
155 printk("machine restart\n");
156
157 if (!reboot_force) {
158 machine_shutdown();
159 }
160 machine_emergency_restart();
161}
162
163void machine_halt(void)
164{
165}
166
167void machine_power_off(void)
168{
169 if (pm_power_off) {
170 if (!reboot_force) {
171 machine_shutdown();
172 }
173 pm_power_off();
174 }
175}
176
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
index f452726c0fe2..dec0b5ec25c2 100644
--- a/arch/x86/kernel/reboot_fixups_32.c
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -30,6 +30,19 @@ static void cs5536_warm_reset(struct pci_dev *dev)
30 udelay(50); /* shouldn't get here but be safe and spin a while */ 30 udelay(50); /* shouldn't get here but be safe and spin a while */
31} 31}
32 32
33static void rdc321x_reset(struct pci_dev *dev)
34{
35 unsigned i;
36 /* Voluntarily reset the watchdog timer */
37 outl(0x80003840, 0xCF8);
38 /* Generate a CPU reset on next tick */
39 i = inl(0xCFC);
40 /* Use the minimum timer resolution */
41 i |= 0x1600;
42 outl(i, 0xCFC);
43 outb(1, 0x92);
44}
45
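The two magic ports in rdc321x_reset() are the legacy type-1 PCI configuration mechanism: the address dword goes to 0xCF8 and the selected register is read or written at 0xCFC. The constant 0x80003840 corresponds to bus 0, device 7, function 0, register 0x40, as this sketch of the address encoding shows:

/* Sketch: type-1 PCI configuration address as written to port 0xCF8. */
static unsigned int pci_conf1_addr(unsigned int bus, unsigned int dev,
				   unsigned int fn, unsigned int reg)
{
	return 0x80000000U | (bus << 16) | (dev << 11) | (fn << 8) | (reg & 0xfc);
}
/* pci_conf1_addr(0, 7, 0, 0x40) == 0x80003840 */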
33struct device_fixup { 46struct device_fixup {
34 unsigned int vendor; 47 unsigned int vendor;
35 unsigned int device; 48 unsigned int device;
@@ -40,6 +53,7 @@ static struct device_fixup fixups_table[] = {
40{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset }, 53{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset },
41{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset }, 54{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset },
42{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset }, 55{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset },
56{ PCI_VENDOR_ID_RDC, PCI_DEVICE_ID_RDC_R6030, rdc321x_reset },
43}; 57};
44 58
45/* 59/*
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
new file mode 100644
index 000000000000..eb9b1a198f5e
--- /dev/null
+++ b/arch/x86/kernel/rtc.c
@@ -0,0 +1,204 @@
1/*
2 * RTC related functions
3 */
4#include <linux/acpi.h>
5#include <linux/bcd.h>
6#include <linux/mc146818rtc.h>
7
8#include <asm/time.h>
9#include <asm/vsyscall.h>
10
11#ifdef CONFIG_X86_32
12# define CMOS_YEARS_OFFS 1900
13/*
14 * This is a special lock that is owned by the CPU and holds the index
15 * register we are working with. It is required for NMI access to the
16 * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
17 */
18volatile unsigned long cmos_lock = 0;
19EXPORT_SYMBOL(cmos_lock);
20#else
21/*
22 * x86-64 systems have only existed since 2002.
23 * This will work up to Dec 31, 2100
24 */
25# define CMOS_YEARS_OFFS 2000
26#endif
27
28DEFINE_SPINLOCK(rtc_lock);
29EXPORT_SYMBOL(rtc_lock);
30
31/*
32 * In order to set the CMOS clock precisely, set_rtc_mmss has to be
33 * called 500 ms after the second nowtime has started, because when
34 * nowtime is written into the registers of the CMOS clock, it will
35 * jump to the next second precisely 500 ms later. Check the Motorola
36 * MC146818A or Dallas DS12887 data sheet for details.
37 *
38 * BUG: This routine does not handle hour overflow properly; it just
39 * sets the minutes. Usually you'll only notice that after reboot!
40 */
41int mach_set_rtc_mmss(unsigned long nowtime)
42{
43 int retval = 0;
44 int real_seconds, real_minutes, cmos_minutes;
45 unsigned char save_control, save_freq_select;
46
47 /* tell the clock it's being set */
48 save_control = CMOS_READ(RTC_CONTROL);
49 CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL);
50
51 /* stop and reset prescaler */
52 save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
53 CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT);
54
55 cmos_minutes = CMOS_READ(RTC_MINUTES);
56 if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
57 BCD_TO_BIN(cmos_minutes);
58
59 /*
60 * since we're only adjusting minutes and seconds,
61 * don't interfere with hour overflow. This avoids
62 * messing with unknown time zones but requires your
63 * RTC not to be off by more than 15 minutes
64 */
65 real_seconds = nowtime % 60;
66 real_minutes = nowtime / 60;
67 /* correct for half hour time zone */
68 if (((abs(real_minutes - cmos_minutes) + 15)/30) & 1)
69 real_minutes += 30;
70 real_minutes %= 60;
71
72 if (abs(real_minutes - cmos_minutes) < 30) {
73 if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
74 BIN_TO_BCD(real_seconds);
75 BIN_TO_BCD(real_minutes);
76 }
77 CMOS_WRITE(real_seconds,RTC_SECONDS);
78 CMOS_WRITE(real_minutes,RTC_MINUTES);
79 } else {
80 printk(KERN_WARNING
81 "set_rtc_mmss: can't update from %d to %d\n",
82 cmos_minutes, real_minutes);
83 retval = -1;
84 }
85
86 /* The following flags have to be released exactly in this order,
87 * otherwise the DS12887 (popular MC146818A clone with integrated
88 * battery and quartz) will not reset the oscillator and will not
89 * update precisely 500 ms later. You won't find this mentioned in
90 * the Dallas Semiconductor data sheets, but who believes data
91 * sheets anyway ... -- Markus Kuhn
92 */
93 CMOS_WRITE(save_control, RTC_CONTROL);
94 CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
95
96 return retval;
97}
98
99unsigned long mach_get_cmos_time(void)
100{
101 unsigned int year, mon, day, hour, min, sec, century = 0;
102
103 /*
104 * If UIP is clear, then we have >= 244 microseconds before
105 * RTC registers will be updated. Spec sheet says that this
106 * is the reliable way to read RTC - registers. If UIP is set
107 * then the register access might be invalid.
108 */
109 while ((CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP))
110 cpu_relax();
111
112 sec = CMOS_READ(RTC_SECONDS);
113 min = CMOS_READ(RTC_MINUTES);
114 hour = CMOS_READ(RTC_HOURS);
115 day = CMOS_READ(RTC_DAY_OF_MONTH);
116 mon = CMOS_READ(RTC_MONTH);
117 year = CMOS_READ(RTC_YEAR);
118
119#if defined(CONFIG_ACPI) && defined(CONFIG_X86_64)
120 /* CHECKME: Is this really 64bit only ??? */
121 if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
122 acpi_gbl_FADT.century)
123 century = CMOS_READ(acpi_gbl_FADT.century);
124#endif
125
126 if (RTC_ALWAYS_BCD || !(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY)) {
127 BCD_TO_BIN(sec);
128 BCD_TO_BIN(min);
129 BCD_TO_BIN(hour);
130 BCD_TO_BIN(day);
131 BCD_TO_BIN(mon);
132 BCD_TO_BIN(year);
133 }
134
135 if (century) {
136 BCD_TO_BIN(century);
137 year += century * 100;
138 printk(KERN_INFO "Extended CMOS year: %d\n", century * 100);
139 } else {
140 year += CMOS_YEARS_OFFS;
141 if (year < 1970)
142 year += 100;
143 }
144
145 return mktime(year, mon, day, hour, min, sec);
146}
147
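The BCD_TO_BIN/BIN_TO_BCD helpers used above come from linux/bcd.h; their effect is the usual packed-BCD conversion, roughly equivalent to this sketch (the kernel macros modify their argument in place):

/* Sketch: packed-BCD <-> binary, matching how the bcd.h helpers behave. */
static unsigned int bcd_to_bin(unsigned int val)
{
	return (val & 0x0f) + (val >> 4) * 10;
}

static unsigned int bin_to_bcd(unsigned int val)
{
	return ((val / 10) << 4) | (val % 10);
}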
148/* Routines for accessing the CMOS RAM/RTC. */
149unsigned char rtc_cmos_read(unsigned char addr)
150{
151 unsigned char val;
152
153 lock_cmos_prefix(addr);
154 outb_p(addr, RTC_PORT(0));
155 val = inb_p(RTC_PORT(1));
156 lock_cmos_suffix(addr);
157 return val;
158}
159EXPORT_SYMBOL(rtc_cmos_read);
160
161void rtc_cmos_write(unsigned char val, unsigned char addr)
162{
163 lock_cmos_prefix(addr);
164 outb_p(addr, RTC_PORT(0));
165 outb_p(val, RTC_PORT(1));
166 lock_cmos_suffix(addr);
167}
168EXPORT_SYMBOL(rtc_cmos_write);
169
170static int set_rtc_mmss(unsigned long nowtime)
171{
172 int retval;
173 unsigned long flags;
174
175 spin_lock_irqsave(&rtc_lock, flags);
176 retval = set_wallclock(nowtime);
177 spin_unlock_irqrestore(&rtc_lock, flags);
178
179 return retval;
180}
181
182/* not static: needed by APM */
183unsigned long read_persistent_clock(void)
184{
185 unsigned long retval, flags;
186
187 spin_lock_irqsave(&rtc_lock, flags);
188 retval = get_wallclock();
189 spin_unlock_irqrestore(&rtc_lock, flags);
190
191 return retval;
192}
193
194int update_persistent_clock(struct timespec now)
195{
196 return set_rtc_mmss(now.tv_sec);
197}
198
199unsigned long long native_read_tsc(void)
200{
201 return __native_read_tsc();
202}
203EXPORT_SYMBOL(native_read_tsc);
204
diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c
index 3558ac78c926..309366f8f603 100644
--- a/arch/x86/kernel/setup64.c
+++ b/arch/x86/kernel/setup64.c
@@ -24,7 +24,11 @@
24#include <asm/sections.h> 24#include <asm/sections.h>
25#include <asm/setup.h> 25#include <asm/setup.h>
26 26
27#ifndef CONFIG_DEBUG_BOOT_PARAMS
27struct boot_params __initdata boot_params; 28struct boot_params __initdata boot_params;
29#else
30struct boot_params boot_params;
31#endif
28 32
29cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; 33cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
30 34
@@ -37,6 +41,8 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
37char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); 41char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
38 42
39unsigned long __supported_pte_mask __read_mostly = ~0UL; 43unsigned long __supported_pte_mask __read_mostly = ~0UL;
44EXPORT_SYMBOL_GPL(__supported_pte_mask);
45
40static int do_not_nx __cpuinitdata = 0; 46static int do_not_nx __cpuinitdata = 0;
41 47
42/* noexec=on|off 48/* noexec=on|off
@@ -80,6 +86,43 @@ static int __init nonx32_setup(char *str)
80__setup("noexec32=", nonx32_setup); 86__setup("noexec32=", nonx32_setup);
81 87
82/* 88/*
89 * Copy data used in early init routines from the initial arrays to the
90 * per cpu data areas. These arrays then become expendable and the
91 * *_early_ptr's are zeroed indicating that the static arrays are gone.
92 */
93static void __init setup_per_cpu_maps(void)
94{
95 int cpu;
96
97 for_each_possible_cpu(cpu) {
98#ifdef CONFIG_SMP
99 if (per_cpu_offset(cpu)) {
100#endif
101 per_cpu(x86_cpu_to_apicid, cpu) =
102 x86_cpu_to_apicid_init[cpu];
103 per_cpu(x86_bios_cpu_apicid, cpu) =
104 x86_bios_cpu_apicid_init[cpu];
105#ifdef CONFIG_NUMA
106 per_cpu(x86_cpu_to_node_map, cpu) =
107 x86_cpu_to_node_map_init[cpu];
108#endif
109#ifdef CONFIG_SMP
110 }
111 else
112 printk(KERN_NOTICE "per_cpu_offset zero for cpu %d\n",
113 cpu);
114#endif
115 }
116
117 /* indicate the early static arrays will soon be gone */
118 x86_cpu_to_apicid_early_ptr = NULL;
119 x86_bios_cpu_apicid_early_ptr = NULL;
120#ifdef CONFIG_NUMA
121 x86_cpu_to_node_map_early_ptr = NULL;
122#endif
123}
124
125/*
83 * Great future plan: 126 * Great future plan:
84 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. 127 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
85 * Always point %gs to its beginning 128 * Always point %gs to its beginning
@@ -100,18 +143,21 @@ void __init setup_per_cpu_areas(void)
100 for_each_cpu_mask (i, cpu_possible_map) { 143 for_each_cpu_mask (i, cpu_possible_map) {
101 char *ptr; 144 char *ptr;
102 145
103 if (!NODE_DATA(cpu_to_node(i))) { 146 if (!NODE_DATA(early_cpu_to_node(i))) {
104 printk("cpu with no node %d, num_online_nodes %d\n", 147 printk("cpu with no node %d, num_online_nodes %d\n",
105 i, num_online_nodes()); 148 i, num_online_nodes());
106 ptr = alloc_bootmem_pages(size); 149 ptr = alloc_bootmem_pages(size);
107 } else { 150 } else {
108 ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size); 151 ptr = alloc_bootmem_pages_node(NODE_DATA(early_cpu_to_node(i)), size);
109 } 152 }
110 if (!ptr) 153 if (!ptr)
111 panic("Cannot allocate cpu data for CPU %d\n", i); 154 panic("Cannot allocate cpu data for CPU %d\n", i);
112 cpu_pda(i)->data_offset = ptr - __per_cpu_start; 155 cpu_pda(i)->data_offset = ptr - __per_cpu_start;
113 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); 156 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
114 } 157 }
158
159 /* setup percpu data maps early */
160 setup_per_cpu_maps();
115} 161}
116 162
117void pda_init(int cpu) 163void pda_init(int cpu)
@@ -169,7 +215,8 @@ void syscall_init(void)
169#endif 215#endif
170 216
171 /* Flags to clear on syscall */ 217 /* Flags to clear on syscall */
172 wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); 218 wrmsrl(MSR_SYSCALL_MASK,
219 X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
173} 220}
174 221
175void __cpuinit check_efer(void) 222void __cpuinit check_efer(void)
@@ -227,7 +274,7 @@ void __cpuinit cpu_init (void)
227 * and set up the GDT descriptor: 274 * and set up the GDT descriptor:
228 */ 275 */
229 if (cpu) 276 if (cpu)
230 memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE); 277 memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE);
231 278
232 cpu_gdt_descr[cpu].size = GDT_SIZE; 279 cpu_gdt_descr[cpu].size = GDT_SIZE;
233 load_gdt((const struct desc_ptr *)&cpu_gdt_descr[cpu]); 280 load_gdt((const struct desc_ptr *)&cpu_gdt_descr[cpu]);
@@ -257,10 +304,10 @@ void __cpuinit cpu_init (void)
257 v, cpu); 304 v, cpu);
258 } 305 }
259 estacks += PAGE_SIZE << order[v]; 306 estacks += PAGE_SIZE << order[v];
260 orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks; 307 orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
261 } 308 }
262 309
263 t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap); 310 t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
264 /* 311 /*
265 * <= is required because the CPU will access up to 312 * <= is required because the CPU will access up to
266 * 8 bits beyond the end of the IO permission bitmap. 313 * 8 bits beyond the end of the IO permission bitmap.
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 9c24b45b513c..62adc5f20be5 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -44,9 +44,12 @@
44#include <linux/crash_dump.h> 44#include <linux/crash_dump.h>
45#include <linux/dmi.h> 45#include <linux/dmi.h>
46#include <linux/pfn.h> 46#include <linux/pfn.h>
47#include <linux/pci.h>
48#include <linux/init_ohci1394_dma.h>
47 49
48#include <video/edid.h> 50#include <video/edid.h>
49 51
52#include <asm/mtrr.h>
50#include <asm/apic.h> 53#include <asm/apic.h>
51#include <asm/e820.h> 54#include <asm/e820.h>
52#include <asm/mpspec.h> 55#include <asm/mpspec.h>
@@ -67,14 +70,83 @@
67 address, and must not be in the .bss segment! */ 70 address, and must not be in the .bss segment! */
68unsigned long init_pg_tables_end __initdata = ~0UL; 71unsigned long init_pg_tables_end __initdata = ~0UL;
69 72
70int disable_pse __cpuinitdata = 0;
71
72/* 73/*
73 * Machine setup.. 74 * Machine setup..
74 */ 75 */
75extern struct resource code_resource; 76static struct resource data_resource = {
76extern struct resource data_resource; 77 .name = "Kernel data",
77extern struct resource bss_resource; 78 .start = 0,
79 .end = 0,
80 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
81};
82
83static struct resource code_resource = {
84 .name = "Kernel code",
85 .start = 0,
86 .end = 0,
87 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
88};
89
90static struct resource bss_resource = {
91 .name = "Kernel bss",
92 .start = 0,
93 .end = 0,
94 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
95};
96
97static struct resource video_ram_resource = {
98 .name = "Video RAM area",
99 .start = 0xa0000,
100 .end = 0xbffff,
101 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
102};
103
104static struct resource standard_io_resources[] = { {
105 .name = "dma1",
106 .start = 0x0000,
107 .end = 0x001f,
108 .flags = IORESOURCE_BUSY | IORESOURCE_IO
109}, {
110 .name = "pic1",
111 .start = 0x0020,
112 .end = 0x0021,
113 .flags = IORESOURCE_BUSY | IORESOURCE_IO
114}, {
115 .name = "timer0",
116 .start = 0x0040,
117 .end = 0x0043,
118 .flags = IORESOURCE_BUSY | IORESOURCE_IO
119}, {
120 .name = "timer1",
121 .start = 0x0050,
122 .end = 0x0053,
123 .flags = IORESOURCE_BUSY | IORESOURCE_IO
124}, {
125 .name = "keyboard",
126 .start = 0x0060,
127 .end = 0x006f,
128 .flags = IORESOURCE_BUSY | IORESOURCE_IO
129}, {
130 .name = "dma page reg",
131 .start = 0x0080,
132 .end = 0x008f,
133 .flags = IORESOURCE_BUSY | IORESOURCE_IO
134}, {
135 .name = "pic2",
136 .start = 0x00a0,
137 .end = 0x00a1,
138 .flags = IORESOURCE_BUSY | IORESOURCE_IO
139}, {
140 .name = "dma2",
141 .start = 0x00c0,
142 .end = 0x00df,
143 .flags = IORESOURCE_BUSY | IORESOURCE_IO
144}, {
145 .name = "fpu",
146 .start = 0x00f0,
147 .end = 0x00ff,
148 .flags = IORESOURCE_BUSY | IORESOURCE_IO
149} };
78 150
79/* cpu data as detected by the assembly code in head.S */ 151/* cpu data as detected by the assembly code in head.S */
80struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; 152struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
@@ -116,13 +188,17 @@ extern int root_mountflags;
116 188
117unsigned long saved_videomode; 189unsigned long saved_videomode;
118 190
119#define RAMDISK_IMAGE_START_MASK 0x07FF 191#define RAMDISK_IMAGE_START_MASK 0x07FF
120#define RAMDISK_PROMPT_FLAG 0x8000 192#define RAMDISK_PROMPT_FLAG 0x8000
121#define RAMDISK_LOAD_FLAG 0x4000 193#define RAMDISK_LOAD_FLAG 0x4000
122 194
123static char __initdata command_line[COMMAND_LINE_SIZE]; 195static char __initdata command_line[COMMAND_LINE_SIZE];
124 196
197#ifndef CONFIG_DEBUG_BOOT_PARAMS
125struct boot_params __initdata boot_params; 198struct boot_params __initdata boot_params;
199#else
200struct boot_params boot_params;
201#endif
126 202
127#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) 203#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
128struct edd edd; 204struct edd edd;
@@ -166,8 +242,7 @@ static int __init parse_mem(char *arg)
166 return -EINVAL; 242 return -EINVAL;
167 243
168 if (strcmp(arg, "nopentium") == 0) { 244 if (strcmp(arg, "nopentium") == 0) {
169 clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); 245 setup_clear_cpu_cap(X86_FEATURE_PSE);
170 disable_pse = 1;
171 } else { 246 } else {
172 /* If the user specifies memory size, we 247 /* If the user specifies memory size, we
173 * limit the BIOS-provided memory map to 248 * limit the BIOS-provided memory map to
@@ -176,7 +251,7 @@ static int __init parse_mem(char *arg)
176 * trim the existing memory map. 251 * trim the existing memory map.
177 */ 252 */
178 unsigned long long mem_size; 253 unsigned long long mem_size;
179 254
180 mem_size = memparse(arg, &arg); 255 mem_size = memparse(arg, &arg);
181 limit_regions(mem_size); 256 limit_regions(mem_size);
182 user_defined_memmap = 1; 257 user_defined_memmap = 1;
@@ -315,7 +390,7 @@ static void __init reserve_ebda_region(void)
315 unsigned int addr; 390 unsigned int addr;
316 addr = get_bios_ebda(); 391 addr = get_bios_ebda();
317 if (addr) 392 if (addr)
318 reserve_bootmem(addr, PAGE_SIZE); 393 reserve_bootmem(addr, PAGE_SIZE);
319} 394}
320 395
321#ifndef CONFIG_NEED_MULTIPLE_NODES 396#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -420,6 +495,100 @@ static inline void __init reserve_crashkernel(void)
420{} 495{}
421#endif 496#endif
422 497
498#ifdef CONFIG_BLK_DEV_INITRD
499
500static bool do_relocate_initrd = false;
501
502static void __init reserve_initrd(void)
503{
504 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
505 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
506 unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
507 unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
508 unsigned long ramdisk_here;
509
510 initrd_start = 0;
511
512 if (!boot_params.hdr.type_of_loader ||
513 !ramdisk_image || !ramdisk_size)
514 return; /* No initrd provided by bootloader */
515
516 if (ramdisk_end < ramdisk_image) {
517 printk(KERN_ERR "initrd wraps around end of memory, "
518 "disabling initrd\n");
519 return;
520 }
521 if (ramdisk_size >= end_of_lowmem/2) {
522 printk(KERN_ERR "initrd too large to handle, "
523 "disabling initrd\n");
524 return;
525 }
526 if (ramdisk_end <= end_of_lowmem) {
527 /* All in lowmem, easy case */
528 reserve_bootmem(ramdisk_image, ramdisk_size);
529 initrd_start = ramdisk_image + PAGE_OFFSET;
530 initrd_end = initrd_start+ramdisk_size;
531 return;
532 }
533
534 /* We need to move the initrd down into lowmem */
535 ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK;
536
537 /* Note: this includes all the lowmem currently occupied by
538 the initrd, we rely on that fact to keep the data intact. */
539 reserve_bootmem(ramdisk_here, ramdisk_size);
540 initrd_start = ramdisk_here + PAGE_OFFSET;
541 initrd_end = initrd_start + ramdisk_size;
542
543 do_relocate_initrd = true;
544}
545
546#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
547
548static void __init relocate_initrd(void)
549{
550 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
551 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
552 unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
553 unsigned long ramdisk_here;
554 unsigned long slop, clen, mapaddr;
555 char *p, *q;
556
557 if (!do_relocate_initrd)
558 return;
559
560 ramdisk_here = initrd_start - PAGE_OFFSET;
561
562 q = (char *)initrd_start;
563
564 /* Copy any lowmem portion of the initrd */
565 if (ramdisk_image < end_of_lowmem) {
566 clen = end_of_lowmem - ramdisk_image;
567 p = (char *)__va(ramdisk_image);
568 memcpy(q, p, clen);
569 q += clen;
570 ramdisk_image += clen;
571 ramdisk_size -= clen;
572 }
573
574 /* Copy the highmem portion of the initrd */
575 while (ramdisk_size) {
576 slop = ramdisk_image & ~PAGE_MASK;
577 clen = ramdisk_size;
578 if (clen > MAX_MAP_CHUNK-slop)
579 clen = MAX_MAP_CHUNK-slop;
580 mapaddr = ramdisk_image & PAGE_MASK;
581 p = early_ioremap(mapaddr, clen+slop);
582 memcpy(q, p+slop, clen);
583 early_iounmap(p, clen+slop);
584 q += clen;
585 ramdisk_image += clen;
586 ramdisk_size -= clen;
587 }
588}
589
590#endif /* CONFIG_BLK_DEV_INITRD */
591
423void __init setup_bootmem_allocator(void) 592void __init setup_bootmem_allocator(void)
424{ 593{
425 unsigned long bootmap_size; 594 unsigned long bootmap_size;
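The reserve_initrd()/relocate_initrd() pair above handles a ramdisk that the boot loader left outside lowmem: the target area just below end_of_lowmem is reserved first, and the copy is later done in chunks of at most MAX_MAP_CHUNK bytes through a temporary early_ioremap() window, with slop covering a source address that is not page aligned. A minimal user-space sketch of the same chunking arithmetic (illustrative only; a plain buffer stands in for the ioremap window):

    #include <stdio.h>
    #include <string.h>

    #define PAGE_SIZE     4096UL
    #define PAGE_MASK     (~(PAGE_SIZE - 1))
    #define MAX_MAP_CHUNK (16 * PAGE_SIZE)   /* stand-in for NR_FIX_BTMAPS << PAGE_SHIFT */

    /* Copy 'size' bytes starting at offset 'src' of 'mem' into 'dst',
     * never "mapping" more than MAX_MAP_CHUNK at a time and always
     * mapping from a page-aligned address, like relocate_initrd(). */
    static void chunked_copy(char *dst, const char *mem, unsigned long src,
                             unsigned long size)
    {
        while (size) {
            unsigned long slop = src & ~PAGE_MASK;   /* offset into first page */
            unsigned long clen = size;
            const char *window;

            if (clen > MAX_MAP_CHUNK - slop)
                clen = MAX_MAP_CHUNK - slop;

            window = mem + (src & PAGE_MASK);        /* early_ioremap() stand-in */
            memcpy(dst, window + slop, clen);
            dst += clen;
            src += clen;
            size -= clen;
        }
    }

    int main(void)
    {
        static char mem[1 << 20], out[1 << 20];
        unsigned long start = 3 * PAGE_SIZE + 123;   /* deliberately unaligned */
        unsigned long len = 200 * 1024;

        memset(mem + start, 0xAA, len);
        chunked_copy(out, mem, start, len);
        printf("copied %lu bytes, first/last ok: %d/%d\n", len,
               out[0] == (char)0xAA, out[len - 1] == (char)0xAA);
        return 0;
    }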
@@ -475,26 +644,10 @@ void __init setup_bootmem_allocator(void)
475 */ 644 */
476 find_smp_config(); 645 find_smp_config();
477#endif 646#endif
478 numa_kva_reserve();
479#ifdef CONFIG_BLK_DEV_INITRD 647#ifdef CONFIG_BLK_DEV_INITRD
480 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { 648 reserve_initrd();
481 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
482 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
483 unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
484 unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
485
486 if (ramdisk_end <= end_of_lowmem) {
487 reserve_bootmem(ramdisk_image, ramdisk_size);
488 initrd_start = ramdisk_image + PAGE_OFFSET;
489 initrd_end = initrd_start+ramdisk_size;
490 } else {
491 printk(KERN_ERR "initrd extends beyond end of memory "
492 "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
493 ramdisk_end, end_of_lowmem);
494 initrd_start = 0;
495 }
496 }
497#endif 649#endif
650 numa_kva_reserve();
498 reserve_crashkernel(); 651 reserve_crashkernel();
499} 652}
500 653
@@ -545,17 +698,11 @@ void __init setup_arch(char **cmdline_p)
545 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); 698 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
546 pre_setup_arch_hook(); 699 pre_setup_arch_hook();
547 early_cpu_init(); 700 early_cpu_init();
701 early_ioremap_init();
548 702
549 /*
550 * FIXME: This isn't an official loader_type right
551 * now but does currently work with elilo.
552 * If we were configured as an EFI kernel, check to make
553 * sure that we were loaded correctly from elilo and that
554 * the system table is valid. If not, then initialize normally.
555 */
556#ifdef CONFIG_EFI 703#ifdef CONFIG_EFI
557 if ((boot_params.hdr.type_of_loader == 0x50) && 704 if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
558 boot_params.efi_info.efi_systab) 705 "EL32", 4))
559 efi_enabled = 1; 706 efi_enabled = 1;
560#endif 707#endif
561 708
@@ -579,12 +726,9 @@ void __init setup_arch(char **cmdline_p)
579 rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0); 726 rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
580#endif 727#endif
581 ARCH_SETUP 728 ARCH_SETUP
582 if (efi_enabled) 729
583 efi_init(); 730 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
584 else { 731 print_memory_map(memory_setup());
585 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
586 print_memory_map(memory_setup());
587 }
588 732
589 copy_edd(); 733 copy_edd();
590 734
@@ -612,8 +756,16 @@ void __init setup_arch(char **cmdline_p)
612 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); 756 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
613 *cmdline_p = command_line; 757 *cmdline_p = command_line;
614 758
759 if (efi_enabled)
760 efi_init();
761
615 max_low_pfn = setup_memory(); 762 max_low_pfn = setup_memory();
616 763
764 /* update e820 for memory not covered by WB MTRRs */
765 mtrr_bp_init();
766 if (mtrr_trim_uncached_memory(max_pfn))
767 max_low_pfn = setup_memory();
768
617#ifdef CONFIG_VMI 769#ifdef CONFIG_VMI
618 /* 770 /*
619 * Must be after max_low_pfn is determined, and before kernel 771 * Must be after max_low_pfn is determined, and before kernel
@@ -636,6 +788,16 @@ void __init setup_arch(char **cmdline_p)
636 smp_alloc_memory(); /* AP processor realmode stacks in low memory*/ 788 smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
637#endif 789#endif
638 paging_init(); 790 paging_init();
791
792 /*
793 * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
794 */
795
796#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
797 if (init_ohci1394_dma_early)
798 init_ohci1394_dma_on_all_controllers();
799#endif
800
639 remapped_pgdat_init(); 801 remapped_pgdat_init();
640 sparse_init(); 802 sparse_init();
641 zone_sizes_init(); 803 zone_sizes_init();
@@ -644,15 +806,19 @@ void __init setup_arch(char **cmdline_p)
644 * NOTE: at this point the bootmem allocator is fully available. 806 * NOTE: at this point the bootmem allocator is fully available.
645 */ 807 */
646 808
809#ifdef CONFIG_BLK_DEV_INITRD
810 relocate_initrd();
811#endif
812
647 paravirt_post_allocator_init(); 813 paravirt_post_allocator_init();
648 814
649 dmi_scan_machine(); 815 dmi_scan_machine();
650 816
817 io_delay_init();
818
651#ifdef CONFIG_X86_GENERICARCH 819#ifdef CONFIG_X86_GENERICARCH
652 generic_apic_probe(); 820 generic_apic_probe();
653#endif 821#endif
654 if (efi_enabled)
655 efi_map_memmap();
656 822
657#ifdef CONFIG_ACPI 823#ifdef CONFIG_ACPI
658 /* 824 /*
@@ -661,9 +827,7 @@ void __init setup_arch(char **cmdline_p)
661 acpi_boot_table_init(); 827 acpi_boot_table_init();
662#endif 828#endif
663 829
664#ifdef CONFIG_PCI
665 early_quirks(); 830 early_quirks();
666#endif
667 831
668#ifdef CONFIG_ACPI 832#ifdef CONFIG_ACPI
669 acpi_boot_init(); 833 acpi_boot_init();
@@ -692,3 +856,26 @@ void __init setup_arch(char **cmdline_p)
692#endif 856#endif
693#endif 857#endif
694} 858}
859
860/*
861 * Request address space for all standard resources
862 *
863 * This is called just before pcibios_init(), which is also a
864 * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
865 */
866static int __init request_standard_resources(void)
867{
868 int i;
869
870 printk(KERN_INFO "Setting up standard PCI resources\n");
871 init_iomem_resources(&code_resource, &data_resource, &bss_resource);
872
873 request_resource(&iomem_resource, &video_ram_resource);
874
875 /* request I/O space for devices used on all i[345]86 PCs */
876 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
877 request_resource(&ioport_resource, &standard_io_resources[i]);
878 return 0;
879}
880
881subsys_initcall(request_standard_resources);
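request_standard_resources() above only walks static tables and inserts each entry into the ioport/iomem trees; the names and ranges are what later appear in /proc/ioports and /proc/iomem. A stripped-down illustration of the table-plus-loop pattern, printing the legacy PC port ranges in /proc/ioports style (hypothetical io_range type, not the kernel's struct resource):

    #include <stdio.h>

    /* Hypothetical stand-in for the kernel's struct resource */
    struct io_range {
        const char   *name;
        unsigned long start;
        unsigned long end;
    };

    /* Mirrors the legacy PC port ranges registered in setup_32.c */
    static const struct io_range standard_io[] = {
        { "dma1",         0x0000, 0x001f },
        { "pic1",         0x0020, 0x0021 },
        { "timer0",       0x0040, 0x0043 },
        { "timer1",       0x0050, 0x0053 },
        { "keyboard",     0x0060, 0x006f },
        { "dma page reg", 0x0080, 0x008f },
        { "pic2",         0x00a0, 0x00a1 },
        { "dma2",         0x00c0, 0x00df },
        { "fpu",          0x00f0, 0x00ff },
    };

    int main(void)
    {
        /* Print the table in the /proc/ioports style: "start-end : name" */
        for (unsigned i = 0; i < sizeof(standard_io) / sizeof(standard_io[0]); i++)
            printf("%04lx-%04lx : %s\n", standard_io[i].start,
                   standard_io[i].end, standard_io[i].name);
        return 0;
    }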
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 30d94d1d5f5f..77fb87bf6e5a 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -30,6 +30,7 @@
30#include <linux/crash_dump.h> 30#include <linux/crash_dump.h>
31#include <linux/root_dev.h> 31#include <linux/root_dev.h>
32#include <linux/pci.h> 32#include <linux/pci.h>
33#include <linux/efi.h>
33#include <linux/acpi.h> 34#include <linux/acpi.h>
34#include <linux/kallsyms.h> 35#include <linux/kallsyms.h>
35#include <linux/edd.h> 36#include <linux/edd.h>
@@ -39,10 +40,13 @@
39#include <linux/dmi.h> 40#include <linux/dmi.h>
40#include <linux/dma-mapping.h> 41#include <linux/dma-mapping.h>
41#include <linux/ctype.h> 42#include <linux/ctype.h>
43#include <linux/uaccess.h>
44#include <linux/init_ohci1394_dma.h>
42 45
43#include <asm/mtrr.h> 46#include <asm/mtrr.h>
44#include <asm/uaccess.h> 47#include <asm/uaccess.h>
45#include <asm/system.h> 48#include <asm/system.h>
49#include <asm/vsyscall.h>
46#include <asm/io.h> 50#include <asm/io.h>
47#include <asm/smp.h> 51#include <asm/smp.h>
48#include <asm/msr.h> 52#include <asm/msr.h>
@@ -50,6 +54,7 @@
50#include <video/edid.h> 54#include <video/edid.h>
51#include <asm/e820.h> 55#include <asm/e820.h>
52#include <asm/dma.h> 56#include <asm/dma.h>
57#include <asm/gart.h>
53#include <asm/mpspec.h> 58#include <asm/mpspec.h>
54#include <asm/mmu_context.h> 59#include <asm/mmu_context.h>
55#include <asm/proto.h> 60#include <asm/proto.h>
@@ -59,6 +64,15 @@
59#include <asm/sections.h> 64#include <asm/sections.h>
60#include <asm/dmi.h> 65#include <asm/dmi.h>
61#include <asm/cacheflush.h> 66#include <asm/cacheflush.h>
67#include <asm/mce.h>
68#include <asm/ds.h>
69#include <asm/topology.h>
70
71#ifdef CONFIG_PARAVIRT
72#include <asm/paravirt.h>
73#else
74#define ARCH_SETUP
75#endif
62 76
63/* 77/*
64 * Machine setup.. 78 * Machine setup..
@@ -67,6 +81,8 @@
67struct cpuinfo_x86 boot_cpu_data __read_mostly; 81struct cpuinfo_x86 boot_cpu_data __read_mostly;
68EXPORT_SYMBOL(boot_cpu_data); 82EXPORT_SYMBOL(boot_cpu_data);
69 83
84__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
85
70unsigned long mmu_cr4_features; 86unsigned long mmu_cr4_features;
71 87
72/* Boot loader ID as an integer, for the benefit of proc_dointvec */ 88/* Boot loader ID as an integer, for the benefit of proc_dointvec */
@@ -76,7 +92,7 @@ unsigned long saved_video_mode;
76 92
77int force_mwait __cpuinitdata; 93int force_mwait __cpuinitdata;
78 94
79/* 95/*
80 * Early DMI memory 96 * Early DMI memory
81 */ 97 */
82int dmi_alloc_index; 98int dmi_alloc_index;
@@ -122,25 +138,27 @@ struct resource standard_io_resources[] = {
122 138
123#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM) 139#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
124 140
125struct resource data_resource = { 141static struct resource data_resource = {
126 .name = "Kernel data", 142 .name = "Kernel data",
127 .start = 0, 143 .start = 0,
128 .end = 0, 144 .end = 0,
129 .flags = IORESOURCE_RAM, 145 .flags = IORESOURCE_RAM,
130}; 146};
131struct resource code_resource = { 147static struct resource code_resource = {
132 .name = "Kernel code", 148 .name = "Kernel code",
133 .start = 0, 149 .start = 0,
134 .end = 0, 150 .end = 0,
135 .flags = IORESOURCE_RAM, 151 .flags = IORESOURCE_RAM,
136}; 152};
137struct resource bss_resource = { 153static struct resource bss_resource = {
138 .name = "Kernel bss", 154 .name = "Kernel bss",
139 .start = 0, 155 .start = 0,
140 .end = 0, 156 .end = 0,
141 .flags = IORESOURCE_RAM, 157 .flags = IORESOURCE_RAM,
142}; 158};
143 159
160static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
161
144#ifdef CONFIG_PROC_VMCORE 162#ifdef CONFIG_PROC_VMCORE
145/* elfcorehdr= specifies the location of elf core header 163/* elfcorehdr= specifies the location of elf core header
146 * stored by the crashed kernel. This option will be passed 164 * stored by the crashed kernel. This option will be passed
@@ -166,12 +184,12 @@ contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
166 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; 184 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
167 bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size); 185 bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
168 if (bootmap == -1L) 186 if (bootmap == -1L)
169 panic("Cannot find bootmem map of size %ld\n",bootmap_size); 187 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
170 bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); 188 bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
171 e820_register_active_regions(0, start_pfn, end_pfn); 189 e820_register_active_regions(0, start_pfn, end_pfn);
172 free_bootmem_with_active_regions(0, end_pfn); 190 free_bootmem_with_active_regions(0, end_pfn);
173 reserve_bootmem(bootmap, bootmap_size); 191 reserve_bootmem(bootmap, bootmap_size);
174} 192}
175#endif 193#endif
176 194
177#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) 195#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
@@ -205,7 +223,8 @@ static void __init reserve_crashkernel(void)
205 unsigned long long crash_size, crash_base; 223 unsigned long long crash_size, crash_base;
206 int ret; 224 int ret;
207 225
208 free_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT; 226 free_mem =
227 ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
209 228
210 ret = parse_crashkernel(boot_command_line, free_mem, 229 ret = parse_crashkernel(boot_command_line, free_mem,
211 &crash_size, &crash_base); 230 &crash_size, &crash_base);
@@ -229,33 +248,21 @@ static inline void __init reserve_crashkernel(void)
229{} 248{}
230#endif 249#endif
231 250
232#define EBDA_ADDR_POINTER 0x40E 251/* Overridden in paravirt.c if CONFIG_PARAVIRT */
233 252void __attribute__((weak)) __init memory_setup(void)
234unsigned __initdata ebda_addr;
235unsigned __initdata ebda_size;
236
237static void discover_ebda(void)
238{ 253{
239 /* 254 machine_specific_memory_setup();
240 * there is a real-mode segmented pointer pointing to the
241 * 4K EBDA area at 0x40E
242 */
243 ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
244 ebda_addr <<= 4;
245
246 ebda_size = *(unsigned short *)__va(ebda_addr);
247
248 /* Round EBDA up to pages */
249 if (ebda_size == 0)
250 ebda_size = 1;
251 ebda_size <<= 10;
252 ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
253 if (ebda_size > 64*1024)
254 ebda_size = 64*1024;
255} 255}
256 256
257/*
258 * setup_arch - architecture-specific boot-time initializations
259 *
260 * Note: On x86_64, fixmaps are ready for use even before this is called.
261 */
257void __init setup_arch(char **cmdline_p) 262void __init setup_arch(char **cmdline_p)
258{ 263{
264 unsigned i;
265
259 printk(KERN_INFO "Command line: %s\n", boot_command_line); 266 printk(KERN_INFO "Command line: %s\n", boot_command_line);
260 267
261 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); 268 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
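discover_ebda() moves out of setup_64.c, and memory_setup() becomes a weak default that simply calls machine_specific_memory_setup(); with CONFIG_PARAVIRT a strong definition in paravirt.c overrides it at link time. A stand-alone illustration of the weak-symbol pattern used here (GCC/Clang attribute; hypothetical function name):

    #include <stdio.h>

    /* Weak default: used only if no other object file provides
     * a strong definition of memory_setup_hook(). */
    void __attribute__((weak)) memory_setup_hook(void)
    {
        printf("default memory setup\n");
    }

    int main(void)
    {
        /* Linking another .o that defines a non-weak memory_setup_hook()
         * would silently replace the call target below. */
        memory_setup_hook();
        return 0;
    }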
@@ -269,7 +276,15 @@ void __init setup_arch(char **cmdline_p)
269 rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0); 276 rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
270 rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0); 277 rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
271#endif 278#endif
272 setup_memory_region(); 279#ifdef CONFIG_EFI
280 if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
281 "EL64", 4))
282 efi_enabled = 1;
283#endif
284
285 ARCH_SETUP
286
287 memory_setup();
273 copy_edd(); 288 copy_edd();
274 289
275 if (!boot_params.hdr.root_flags) 290 if (!boot_params.hdr.root_flags)
@@ -293,27 +308,47 @@ void __init setup_arch(char **cmdline_p)
293 308
294 parse_early_param(); 309 parse_early_param();
295 310
311#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
312 if (init_ohci1394_dma_early)
313 init_ohci1394_dma_on_all_controllers();
314#endif
315
296 finish_e820_parsing(); 316 finish_e820_parsing();
297 317
318 early_gart_iommu_check();
319
298 e820_register_active_regions(0, 0, -1UL); 320 e820_register_active_regions(0, 0, -1UL);
299 /* 321 /*
300 * partially used pages are not usable - thus 322 * partially used pages are not usable - thus
301 * we are rounding upwards: 323 * we are rounding upwards:
302 */ 324 */
303 end_pfn = e820_end_of_ram(); 325 end_pfn = e820_end_of_ram();
326 /* update e820 for memory not covered by WB MTRRs */
327 mtrr_bp_init();
328 if (mtrr_trim_uncached_memory(end_pfn)) {
329 e820_register_active_regions(0, 0, -1UL);
330 end_pfn = e820_end_of_ram();
331 }
332
304 num_physpages = end_pfn; 333 num_physpages = end_pfn;
305 334
306 check_efer(); 335 check_efer();
307 336
308 discover_ebda();
309
310 init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT)); 337 init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
338 if (efi_enabled)
339 efi_init();
311 340
312 dmi_scan_machine(); 341 dmi_scan_machine();
313 342
343 io_delay_init();
344
314#ifdef CONFIG_SMP 345#ifdef CONFIG_SMP
315 /* setup to use the static apicid table during kernel startup */ 346 /* setup to use the early static init tables during kernel startup */
316 x86_cpu_to_apicid_ptr = (void *)&x86_cpu_to_apicid_init; 347 x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
348 x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
349#ifdef CONFIG_NUMA
350 x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
351#endif
317#endif 352#endif
318 353
319#ifdef CONFIG_ACPI 354#ifdef CONFIG_ACPI
@@ -340,48 +375,26 @@ void __init setup_arch(char **cmdline_p)
340#endif 375#endif
341 376
342#ifdef CONFIG_NUMA 377#ifdef CONFIG_NUMA
343 numa_initmem_init(0, end_pfn); 378 numa_initmem_init(0, end_pfn);
344#else 379#else
345 contig_initmem_init(0, end_pfn); 380 contig_initmem_init(0, end_pfn);
346#endif 381#endif
347 382
348 /* Reserve direct mapping */ 383 early_res_to_bootmem();
349 reserve_bootmem_generic(table_start << PAGE_SHIFT,
350 (table_end - table_start) << PAGE_SHIFT);
351
352 /* reserve kernel */
353 reserve_bootmem_generic(__pa_symbol(&_text),
354 __pa_symbol(&_end) - __pa_symbol(&_text));
355 384
385#ifdef CONFIG_ACPI_SLEEP
356 /* 386 /*
357 * reserve physical page 0 - it's a special BIOS page on many boxes, 387 * Reserve low memory region for sleep support.
358 * enabling clean reboots, SMP operation, laptop functions.
359 */ 388 */
360 reserve_bootmem_generic(0, PAGE_SIZE); 389 acpi_reserve_bootmem();
361
362 /* reserve ebda region */
363 if (ebda_addr)
364 reserve_bootmem_generic(ebda_addr, ebda_size);
365#ifdef CONFIG_NUMA
366 /* reserve nodemap region */
367 if (nodemap_addr)
368 reserve_bootmem_generic(nodemap_addr, nodemap_size);
369#endif 390#endif
370 391
371#ifdef CONFIG_SMP 392 if (efi_enabled)
372 /* Reserve SMP trampoline */ 393 efi_reserve_bootmem();
373 reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE);
374#endif
375 394
376#ifdef CONFIG_ACPI_SLEEP
377 /* 395 /*
378 * Reserve low memory region for sleep support. 396 * Find and reserve possible boot-time SMP configuration:
379 */ 397 */
380 acpi_reserve_bootmem();
381#endif
382 /*
383 * Find and reserve possible boot-time SMP configuration:
384 */
385 find_smp_config(); 398 find_smp_config();
386#ifdef CONFIG_BLK_DEV_INITRD 399#ifdef CONFIG_BLK_DEV_INITRD
387 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { 400 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
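The scattered reserve_bootmem_generic() calls for the kernel image, page 0, the EBDA, the NUMA nodemap and the SMP trampoline are replaced by one early_res_to_bootmem() call, which replays reservations recorded before the bootmem allocator existed. The idea, sketched with a hypothetical fixed-size table (the kernel's actual early-reservation bookkeeping differs in detail):

    #include <stdio.h>

    struct early_res { unsigned long start, end; const char *name; };

    /* Reservations noted before any real allocator is up. */
    static struct early_res early_res[8];
    static int nr_early_res;

    static void reserve_early(unsigned long start, unsigned long end, const char *name)
    {
        if (nr_early_res < 8)
            early_res[nr_early_res++] = (struct early_res){ start, end, name };
    }

    /* Once the allocator exists, hand every recorded range over to it. */
    static void early_res_to_bootmem(void)
    {
        for (int i = 0; i < nr_early_res; i++)
            printf("reserve_bootmem(%#lx, %#lx)  /* %s */\n",
                   early_res[i].start,
                   early_res[i].end - early_res[i].start,
                   early_res[i].name);
    }

    int main(void)
    {
        /* Example addresses only */
        reserve_early(0x0, 0x1000, "BIOS page 0");
        reserve_early(0x9fc00, 0xa0000, "EBDA");
        early_res_to_bootmem();
        return 0;
    }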
@@ -395,6 +408,8 @@ void __init setup_arch(char **cmdline_p)
395 initrd_start = ramdisk_image + PAGE_OFFSET; 408 initrd_start = ramdisk_image + PAGE_OFFSET;
396 initrd_end = initrd_start+ramdisk_size; 409 initrd_end = initrd_start+ramdisk_size;
397 } else { 410 } else {
411 /* Assumes everything on node 0 */
412 free_bootmem(ramdisk_image, ramdisk_size);
398 printk(KERN_ERR "initrd extends beyond end of memory " 413 printk(KERN_ERR "initrd extends beyond end of memory "
399 "(0x%08lx > 0x%08lx)\ndisabling initrd\n", 414 "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
400 ramdisk_end, end_of_mem); 415 ramdisk_end, end_of_mem);
@@ -404,17 +419,10 @@ void __init setup_arch(char **cmdline_p)
404#endif 419#endif
405 reserve_crashkernel(); 420 reserve_crashkernel();
406 paging_init(); 421 paging_init();
422 map_vsyscall();
407 423
408#ifdef CONFIG_PCI
409 early_quirks(); 424 early_quirks();
410#endif
411 425
412 /*
413 * set this early, so we dont allocate cpu0
414 * if MADT list doesnt list BSP first
415 * mpparse.c/MP_processor_info() allocates logical cpu numbers.
416 */
417 cpu_set(0, cpu_present_map);
418#ifdef CONFIG_ACPI 426#ifdef CONFIG_ACPI
419 /* 427 /*
420 * Read APIC and some other early information from ACPI tables. 428 * Read APIC and some other early information from ACPI tables.
@@ -430,25 +438,24 @@ void __init setup_arch(char **cmdline_p)
430 if (smp_found_config) 438 if (smp_found_config)
431 get_smp_config(); 439 get_smp_config();
432 init_apic_mappings(); 440 init_apic_mappings();
441 ioapic_init_mappings();
433 442
434 /* 443 /*
435 * We trust e820 completely. No explicit ROM probing in memory. 444 * We trust e820 completely. No explicit ROM probing in memory.
436 */ 445 */
437 e820_reserve_resources(); 446 e820_reserve_resources(&code_resource, &data_resource, &bss_resource);
438 e820_mark_nosave_regions(); 447 e820_mark_nosave_regions();
439 448
440 {
441 unsigned i;
442 /* request I/O space for devices used on all i[345]86 PCs */ 449 /* request I/O space for devices used on all i[345]86 PCs */
443 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) 450 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
444 request_resource(&ioport_resource, &standard_io_resources[i]); 451 request_resource(&ioport_resource, &standard_io_resources[i]);
445 }
446 452
447 e820_setup_gap(); 453 e820_setup_gap();
448 454
449#ifdef CONFIG_VT 455#ifdef CONFIG_VT
450#if defined(CONFIG_VGA_CONSOLE) 456#if defined(CONFIG_VGA_CONSOLE)
451 conswitchp = &vga_con; 457 if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
458 conswitchp = &vga_con;
452#elif defined(CONFIG_DUMMY_CONSOLE) 459#elif defined(CONFIG_DUMMY_CONSOLE)
453 conswitchp = &dummy_con; 460 conswitchp = &dummy_con;
454#endif 461#endif
@@ -479,9 +486,10 @@ static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
479 486
480 if (n >= 0x80000005) { 487 if (n >= 0x80000005) {
481 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); 488 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
482 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", 489 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
483 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); 490 "D cache %dK (%d bytes/line)\n",
484 c->x86_cache_size=(ecx>>24)+(edx>>24); 491 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
492 c->x86_cache_size = (ecx>>24) + (edx>>24);
485 /* On K8 L1 TLB is inclusive, so don't count it */ 493 /* On K8 L1 TLB is inclusive, so don't count it */
486 c->x86_tlbsize = 0; 494 c->x86_tlbsize = 0;
487 } 495 }
@@ -495,11 +503,8 @@ static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
495 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", 503 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
496 c->x86_cache_size, ecx & 0xFF); 504 c->x86_cache_size, ecx & 0xFF);
497 } 505 }
498
499 if (n >= 0x80000007)
500 cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power);
501 if (n >= 0x80000008) { 506 if (n >= 0x80000008) {
502 cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); 507 cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
503 c->x86_virt_bits = (eax >> 8) & 0xff; 508 c->x86_virt_bits = (eax >> 8) & 0xff;
504 c->x86_phys_bits = eax & 0xff; 509 c->x86_phys_bits = eax & 0xff;
505 } 510 }
@@ -508,14 +513,15 @@ static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
508#ifdef CONFIG_NUMA 513#ifdef CONFIG_NUMA
509static int nearby_node(int apicid) 514static int nearby_node(int apicid)
510{ 515{
511 int i; 516 int i, node;
517
512 for (i = apicid - 1; i >= 0; i--) { 518 for (i = apicid - 1; i >= 0; i--) {
513 int node = apicid_to_node[i]; 519 node = apicid_to_node[i];
514 if (node != NUMA_NO_NODE && node_online(node)) 520 if (node != NUMA_NO_NODE && node_online(node))
515 return node; 521 return node;
516 } 522 }
517 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { 523 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
518 int node = apicid_to_node[i]; 524 node = apicid_to_node[i];
519 if (node != NUMA_NO_NODE && node_online(node)) 525 if (node != NUMA_NO_NODE && node_online(node))
520 return node; 526 return node;
521 } 527 }
@@ -527,7 +533,7 @@ static int nearby_node(int apicid)
527 * On an AMD dual core setup the lower bits of the APIC id distinguish the cores. 533 * On an AMD dual core setup the lower bits of the APIC id distinguish the cores.
527 * On an AMD dual core setup the lower bits of the APIC id distinguish the cores. 533 * On an AMD dual core setup the lower bits of the APIC id distinguish the cores.
528 * Assumes number of cores is a power of two. 534 * Assumes number of cores is a power of two.
529 */ 535 */
530static void __init amd_detect_cmp(struct cpuinfo_x86 *c) 536static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
531{ 537{
532#ifdef CONFIG_SMP 538#ifdef CONFIG_SMP
533 unsigned bits; 539 unsigned bits;
@@ -536,7 +542,54 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
536 int node = 0; 542 int node = 0;
537 unsigned apicid = hard_smp_processor_id(); 543 unsigned apicid = hard_smp_processor_id();
538#endif 544#endif
539 unsigned ecx = cpuid_ecx(0x80000008); 545 bits = c->x86_coreid_bits;
546
547 /* Low order bits define the core id (index of core in socket) */
548 c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
549 /* Convert the APIC ID into the socket ID */
550 c->phys_proc_id = phys_pkg_id(bits);
551
552#ifdef CONFIG_NUMA
553 node = c->phys_proc_id;
554 if (apicid_to_node[apicid] != NUMA_NO_NODE)
555 node = apicid_to_node[apicid];
556 if (!node_online(node)) {
557 /* Two possibilities here:
558 - The CPU is missing memory and no node was created.
559 In that case try picking one from a nearby CPU
560 - The APIC IDs differ from the HyperTransport node IDs
561 which the K8 northbridge parsing fills in.
562 Assume they are all increased by a constant offset,
563 but in the same order as the HT nodeids.
564 If that doesn't result in a usable node fall back to the
565 path for the previous case. */
566
567 int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits);
568
569 if (ht_nodeid >= 0 &&
570 apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
571 node = apicid_to_node[ht_nodeid];
572 /* Pick a nearby node */
573 if (!node_online(node))
574 node = nearby_node(apicid);
575 }
576 numa_set_node(cpu, node);
577
578 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
579#endif
580#endif
581}
582
583static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
584{
585#ifdef CONFIG_SMP
586 unsigned bits, ecx;
587
588 /* Multi core CPU? */
589 if (c->extended_cpuid_level < 0x80000008)
590 return;
591
592 ecx = cpuid_ecx(0x80000008);
540 593
541 c->x86_max_cores = (ecx & 0xff) + 1; 594 c->x86_max_cores = (ecx & 0xff) + 1;
542 595
@@ -549,37 +602,8 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
549 bits++; 602 bits++;
550 } 603 }
551 604
552 /* Low order bits define the core id (index of core in socket) */ 605 c->x86_coreid_bits = bits;
553 c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
554 /* Convert the APIC ID into the socket ID */
555 c->phys_proc_id = phys_pkg_id(bits);
556
557#ifdef CONFIG_NUMA
558 node = c->phys_proc_id;
559 if (apicid_to_node[apicid] != NUMA_NO_NODE)
560 node = apicid_to_node[apicid];
561 if (!node_online(node)) {
562 /* Two possibilities here:
563 - The CPU is missing memory and no node was created.
564 In that case try picking one from a nearby CPU
565 - The APIC IDs differ from the HyperTransport node IDs
566 which the K8 northbridge parsing fills in.
567 Assume they are all increased by a constant offset,
568 but in the same order as the HT nodeids.
569 If that doesn't result in a usable node fall back to the
570 path for the previous case. */
571 int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits);
572 if (ht_nodeid >= 0 &&
573 apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
574 node = apicid_to_node[ht_nodeid];
575 /* Pick a nearby node */
576 if (!node_online(node))
577 node = nearby_node(apicid);
578 }
579 numa_set_node(cpu, node);
580 606
581 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
582#endif
583#endif 607#endif
584} 608}
585 609
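amd_detect_cmp() is split in two: early_init_amd_mc() derives x86_coreid_bits once (from CPUID 0x80000008, falling back to rounding the core count up to a power of two), and the later pass only masks and shifts the APIC ID with those bits to obtain the core id and socket id. The arithmetic in isolation, with made-up example values rather than real CPUID output:

    #include <stdio.h>

    /* Round max_cores up to a power of two and return the bit width,
     * as the fallback path in early_init_amd_mc() does. */
    static unsigned coreid_bits(unsigned max_cores)
    {
        unsigned bits = 0;
        while ((1u << bits) < max_cores)
            bits++;
        return bits;
    }

    int main(void)
    {
        unsigned max_cores = 4;                  /* e.g. (ecx & 0xff) + 1 */
        unsigned bits = coreid_bits(max_cores);  /* 2 */
        unsigned apicid = 0x0b;                  /* example initial APIC ID */

        unsigned core_id = apicid & ((1u << bits) - 1);  /* low bits: core in socket */
        unsigned pkg_id  = apicid >> bits;               /* remaining bits: socket  */

        printf("apicid %#x -> socket %u, core %u (coreid bits %u)\n",
               apicid, pkg_id, core_id, bits);
        return 0;
    }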
@@ -595,8 +619,8 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
595/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */ 619/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
596static __cpuinit int amd_apic_timer_broken(void) 620static __cpuinit int amd_apic_timer_broken(void)
597{ 621{
598 u32 lo, hi; 622 u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
599 u32 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); 623
600 switch (eax & CPUID_XFAM) { 624 switch (eax & CPUID_XFAM) {
601 case CPUID_XFAM_K8: 625 case CPUID_XFAM_K8:
602 if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F) 626 if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
@@ -614,6 +638,15 @@ static __cpuinit int amd_apic_timer_broken(void)
614 return 0; 638 return 0;
615} 639}
616 640
641static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
642{
643 early_init_amd_mc(c);
644
645 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
646 if (c->x86_power & (1<<8))
647 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
648}
649
617static void __cpuinit init_amd(struct cpuinfo_x86 *c) 650static void __cpuinit init_amd(struct cpuinfo_x86 *c)
618{ 651{
619 unsigned level; 652 unsigned level;
@@ -624,7 +657,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
624 /* 657 /*
625 * Disable TLB flush filter by setting HWCR.FFDIS on K8 658 * Disable TLB flush filter by setting HWCR.FFDIS on K8
626 * bit 6 of msr C001_0015 659 * bit 6 of msr C001_0015
627 * 660 *
628 * Errata 63 for SH-B3 steppings 661 * Errata 63 for SH-B3 steppings
629 * Errata 122 for all steppings (F+ have it disabled by default) 662 * Errata 122 for all steppings (F+ have it disabled by default)
630 */ 663 */
@@ -637,35 +670,32 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
637 670
638 /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; 671 /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
639 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ 672 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
640 clear_bit(0*32+31, &c->x86_capability); 673 clear_bit(0*32+31, (unsigned long *)&c->x86_capability);
641 674
642 /* On C+ stepping K8 rep microcode works well for copy/memset */ 675 /* On C+ stepping K8 rep microcode works well for copy/memset */
643 level = cpuid_eax(1); 676 level = cpuid_eax(1);
644 if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)) 677 if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) ||
645 set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); 678 level >= 0x0f58))
679 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
646 if (c->x86 == 0x10 || c->x86 == 0x11) 680 if (c->x86 == 0x10 || c->x86 == 0x11)
647 set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); 681 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
648 682
649 /* Enable workaround for FXSAVE leak */ 683 /* Enable workaround for FXSAVE leak */
650 if (c->x86 >= 6) 684 if (c->x86 >= 6)
651 set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability); 685 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
652 686
653 level = get_model_name(c); 687 level = get_model_name(c);
654 if (!level) { 688 if (!level) {
655 switch (c->x86) { 689 switch (c->x86) {
656 case 15: 690 case 15:
657 /* Should distinguish Models here, but this is only 691 /* Should distinguish Models here, but this is only
658 a fallback anyways. */ 692 a fallback anyways. */
659 strcpy(c->x86_model_id, "Hammer"); 693 strcpy(c->x86_model_id, "Hammer");
660 break; 694 break;
661 } 695 }
662 } 696 }
663 display_cacheinfo(c); 697 display_cacheinfo(c);
664 698
665 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
666 if (c->x86_power & (1<<8))
667 set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
668
669 /* Multi core CPU? */ 699 /* Multi core CPU? */
670 if (c->extended_cpuid_level >= 0x80000008) 700 if (c->extended_cpuid_level >= 0x80000008)
671 amd_detect_cmp(c); 701 amd_detect_cmp(c);
@@ -677,41 +707,38 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
677 num_cache_leaves = 3; 707 num_cache_leaves = 3;
678 708
679 if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11) 709 if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
680 set_bit(X86_FEATURE_K8, &c->x86_capability); 710 set_cpu_cap(c, X86_FEATURE_K8);
681
682 /* RDTSC can be speculated around */
683 clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
684 711
685 /* Family 10 doesn't support C states in MWAIT so don't use it */ 712 /* MFENCE stops RDTSC speculation */
686 if (c->x86 == 0x10 && !force_mwait) 713 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
687 clear_bit(X86_FEATURE_MWAIT, &c->x86_capability);
688 714
689 if (amd_apic_timer_broken()) 715 if (amd_apic_timer_broken())
690 disable_apic_timer = 1; 716 disable_apic_timer = 1;
691} 717}
692 718
693static void __cpuinit detect_ht(struct cpuinfo_x86 *c) 719void __cpuinit detect_ht(struct cpuinfo_x86 *c)
694{ 720{
695#ifdef CONFIG_SMP 721#ifdef CONFIG_SMP
696 u32 eax, ebx, ecx, edx; 722 u32 eax, ebx, ecx, edx;
697 int index_msb, core_bits; 723 int index_msb, core_bits;
698 724
699 cpuid(1, &eax, &ebx, &ecx, &edx); 725 cpuid(1, &eax, &ebx, &ecx, &edx);
700 726
701 727
702 if (!cpu_has(c, X86_FEATURE_HT)) 728 if (!cpu_has(c, X86_FEATURE_HT))
703 return; 729 return;
704 if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) 730 if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
705 goto out; 731 goto out;
706 732
707 smp_num_siblings = (ebx & 0xff0000) >> 16; 733 smp_num_siblings = (ebx & 0xff0000) >> 16;
708 734
709 if (smp_num_siblings == 1) { 735 if (smp_num_siblings == 1) {
710 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); 736 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
711 } else if (smp_num_siblings > 1 ) { 737 } else if (smp_num_siblings > 1) {
712 738
713 if (smp_num_siblings > NR_CPUS) { 739 if (smp_num_siblings > NR_CPUS) {
714 printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings); 740 printk(KERN_WARNING "CPU: Unsupported number of "
741 "siblings %d", smp_num_siblings);
715 smp_num_siblings = 1; 742 smp_num_siblings = 1;
716 return; 743 return;
717 } 744 }
@@ -721,7 +748,7 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
721 748
722 smp_num_siblings = smp_num_siblings / c->x86_max_cores; 749 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
723 750
724 index_msb = get_count_order(smp_num_siblings) ; 751 index_msb = get_count_order(smp_num_siblings);
725 752
726 core_bits = get_count_order(c->x86_max_cores); 753 core_bits = get_count_order(c->x86_max_cores);
727 754
@@ -730,8 +757,10 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
730 } 757 }
731out: 758out:
732 if ((c->x86_max_cores * smp_num_siblings) > 1) { 759 if ((c->x86_max_cores * smp_num_siblings) > 1) {
733 printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id); 760 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
734 printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id); 761 c->phys_proc_id);
762 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
763 c->cpu_core_id);
735 } 764 }
736 765
737#endif 766#endif
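detect_ht(), now non-static, sizes the thread and core fields of the APIC ID with get_count_order(), which effectively computes ceil(log2(n)). A user-space equivalent with a few sample values (the helper's exact kernel implementation is assumed here):

    #include <stdio.h>

    /* ceil(log2(n)) for n >= 1 */
    static int count_order(unsigned int n)
    {
        int order = 0;
        while ((1u << order) < n)
            order++;
        return order;
    }

    int main(void)
    {
        unsigned v[] = { 1, 2, 3, 4, 6, 8 };
        for (unsigned i = 0; i < sizeof(v) / sizeof(v[0]); i++)
            printf("get_count_order(%u) = %d\n", v[i], count_order(v[i]));
        return 0;   /* prints 0 1 2 2 3 3 */
    }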
@@ -773,28 +802,39 @@ static void srat_detect_node(void)
773#endif 802#endif
774} 803}
775 804
805static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
806{
807 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
808 (c->x86 == 0x6 && c->x86_model >= 0x0e))
809 set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
810}
811
776static void __cpuinit init_intel(struct cpuinfo_x86 *c) 812static void __cpuinit init_intel(struct cpuinfo_x86 *c)
777{ 813{
778 /* Cache sizes */ 814 /* Cache sizes */
779 unsigned n; 815 unsigned n;
780 816
781 init_intel_cacheinfo(c); 817 init_intel_cacheinfo(c);
782 if (c->cpuid_level > 9 ) { 818 if (c->cpuid_level > 9) {
783 unsigned eax = cpuid_eax(10); 819 unsigned eax = cpuid_eax(10);
784 /* Check for version and the number of counters */ 820 /* Check for version and the number of counters */
785 if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) 821 if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
786 set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability); 822 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
787 } 823 }
788 824
789 if (cpu_has_ds) { 825 if (cpu_has_ds) {
790 unsigned int l1, l2; 826 unsigned int l1, l2;
791 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); 827 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
792 if (!(l1 & (1<<11))) 828 if (!(l1 & (1<<11)))
793 set_bit(X86_FEATURE_BTS, c->x86_capability); 829 set_cpu_cap(c, X86_FEATURE_BTS);
794 if (!(l1 & (1<<12))) 830 if (!(l1 & (1<<12)))
795 set_bit(X86_FEATURE_PEBS, c->x86_capability); 831 set_cpu_cap(c, X86_FEATURE_PEBS);
796 } 832 }
797 833
834
835 if (cpu_has_bts)
836 ds_init_intel(c);
837
798 n = c->extended_cpuid_level; 838 n = c->extended_cpuid_level;
799 if (n >= 0x80000008) { 839 if (n >= 0x80000008) {
800 unsigned eax = cpuid_eax(0x80000008); 840 unsigned eax = cpuid_eax(0x80000008);
@@ -811,14 +851,11 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
811 c->x86_cache_alignment = c->x86_clflush_size * 2; 851 c->x86_cache_alignment = c->x86_clflush_size * 2;
812 if ((c->x86 == 0xf && c->x86_model >= 0x03) || 852 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
813 (c->x86 == 0x6 && c->x86_model >= 0x0e)) 853 (c->x86 == 0x6 && c->x86_model >= 0x0e))
814 set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); 854 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
815 if (c->x86 == 6) 855 if (c->x86 == 6)
816 set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); 856 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
817 if (c->x86 == 15) 857 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
818 set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); 858 c->x86_max_cores = intel_num_cpu_cores(c);
819 else
820 clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
821 c->x86_max_cores = intel_num_cpu_cores(c);
822 859
823 srat_detect_node(); 860 srat_detect_node();
824} 861}
@@ -835,18 +872,12 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
835 c->x86_vendor = X86_VENDOR_UNKNOWN; 872 c->x86_vendor = X86_VENDOR_UNKNOWN;
836} 873}
837 874
838struct cpu_model_info {
839 int vendor;
840 int family;
841 char *model_names[16];
842};
843
844/* Do some early cpuid on the boot CPU to get some parameter that are 875/* Do some early cpuid on the boot CPU to get some parameter that are
845 needed before check_bugs. Everything advanced is in identify_cpu 876 needed before check_bugs. Everything advanced is in identify_cpu
846 below. */ 877 below. */
847void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) 878static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
848{ 879{
849 u32 tfms; 880 u32 tfms, xlvl;
850 881
851 c->loops_per_jiffy = loops_per_jiffy; 882 c->loops_per_jiffy = loops_per_jiffy;
852 c->x86_cache_size = -1; 883 c->x86_cache_size = -1;
@@ -857,6 +888,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
857 c->x86_clflush_size = 64; 888 c->x86_clflush_size = 64;
858 c->x86_cache_alignment = c->x86_clflush_size; 889 c->x86_cache_alignment = c->x86_clflush_size;
859 c->x86_max_cores = 1; 890 c->x86_max_cores = 1;
891 c->x86_coreid_bits = 0;
860 c->extended_cpuid_level = 0; 892 c->extended_cpuid_level = 0;
861 memset(&c->x86_capability, 0, sizeof c->x86_capability); 893 memset(&c->x86_capability, 0, sizeof c->x86_capability);
862 894
@@ -865,7 +897,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
865 (unsigned int *)&c->x86_vendor_id[0], 897 (unsigned int *)&c->x86_vendor_id[0],
866 (unsigned int *)&c->x86_vendor_id[8], 898 (unsigned int *)&c->x86_vendor_id[8],
867 (unsigned int *)&c->x86_vendor_id[4]); 899 (unsigned int *)&c->x86_vendor_id[4]);
868 900
869 get_cpu_vendor(c); 901 get_cpu_vendor(c);
870 902
871 /* Initialize the standard set of capabilities */ 903 /* Initialize the standard set of capabilities */
@@ -883,7 +915,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
883 c->x86 += (tfms >> 20) & 0xff; 915 c->x86 += (tfms >> 20) & 0xff;
884 if (c->x86 >= 0x6) 916 if (c->x86 >= 0x6)
885 c->x86_model += ((tfms >> 16) & 0xF) << 4; 917 c->x86_model += ((tfms >> 16) & 0xF) << 4;
886 if (c->x86_capability[0] & (1<<19)) 918 if (c->x86_capability[0] & (1<<19))
887 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; 919 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
888 } else { 920 } else {
889 /* Have CPUID level 0 only - unheard of */ 921 /* Have CPUID level 0 only - unheard of */
@@ -893,18 +925,6 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
893#ifdef CONFIG_SMP 925#ifdef CONFIG_SMP
894 c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff; 926 c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
895#endif 927#endif
896}
897
898/*
899 * This does the hard work of actually picking apart the CPU stuff...
900 */
901void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
902{
903 int i;
904 u32 xlvl;
905
906 early_identify_cpu(c);
907
908 /* AMD-defined flags: level 0x80000001 */ 928 /* AMD-defined flags: level 0x80000001 */
909 xlvl = cpuid_eax(0x80000000); 929 xlvl = cpuid_eax(0x80000000);
910 c->extended_cpuid_level = xlvl; 930 c->extended_cpuid_level = xlvl;
@@ -925,6 +945,30 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
925 c->x86_capability[2] = cpuid_edx(0x80860001); 945 c->x86_capability[2] = cpuid_edx(0x80860001);
926 } 946 }
927 947
948 c->extended_cpuid_level = cpuid_eax(0x80000000);
949 if (c->extended_cpuid_level >= 0x80000007)
950 c->x86_power = cpuid_edx(0x80000007);
951
952 switch (c->x86_vendor) {
953 case X86_VENDOR_AMD:
954 early_init_amd(c);
955 break;
956 case X86_VENDOR_INTEL:
957 early_init_intel(c);
958 break;
959 }
960
961}
962
963/*
964 * This does the hard work of actually picking apart the CPU stuff...
965 */
966void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
967{
968 int i;
969
970 early_identify_cpu(c);
971
928 init_scattered_cpuid_features(c); 972 init_scattered_cpuid_features(c);
929 973
930 c->apicid = phys_pkg_id(0); 974 c->apicid = phys_pkg_id(0);
@@ -954,8 +998,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
954 break; 998 break;
955 } 999 }
956 1000
957 select_idle_routine(c); 1001 detect_ht(c);
958 detect_ht(c);
959 1002
960 /* 1003 /*
961 * On SMP, boot_cpu_data holds the common feature set between 1004 * On SMP, boot_cpu_data holds the common feature set between
@@ -965,32 +1008,56 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
965 */ 1008 */
966 if (c != &boot_cpu_data) { 1009 if (c != &boot_cpu_data) {
967 /* AND the already accumulated flags with these */ 1010 /* AND the already accumulated flags with these */
968 for (i = 0 ; i < NCAPINTS ; i++) 1011 for (i = 0; i < NCAPINTS; i++)
969 boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; 1012 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
970 } 1013 }
971 1014
1015 /* Clear all flags overridden by options */
1016 for (i = 0; i < NCAPINTS; i++)
1017 c->x86_capability[i] ^= cleared_cpu_caps[i];
1018
972#ifdef CONFIG_X86_MCE 1019#ifdef CONFIG_X86_MCE
973 mcheck_init(c); 1020 mcheck_init(c);
974#endif 1021#endif
1022 select_idle_routine(c);
1023
975 if (c != &boot_cpu_data) 1024 if (c != &boot_cpu_data)
976 mtrr_ap_init(); 1025 mtrr_ap_init();
977#ifdef CONFIG_NUMA 1026#ifdef CONFIG_NUMA
978 numa_add_cpu(smp_processor_id()); 1027 numa_add_cpu(smp_processor_id());
979#endif 1028#endif
1029
1030}
1031
1032static __init int setup_noclflush(char *arg)
1033{
1034 setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
1035 return 1;
980} 1036}
981 1037__setup("noclflush", setup_noclflush);
982 1038
983void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) 1039void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
984{ 1040{
985 if (c->x86_model_id[0]) 1041 if (c->x86_model_id[0])
986 printk("%s", c->x86_model_id); 1042 printk(KERN_INFO "%s", c->x86_model_id);
987 1043
988 if (c->x86_mask || c->cpuid_level >= 0) 1044 if (c->x86_mask || c->cpuid_level >= 0)
989 printk(" stepping %02x\n", c->x86_mask); 1045 printk(KERN_CONT " stepping %02x\n", c->x86_mask);
990 else 1046 else
991 printk("\n"); 1047 printk(KERN_CONT "\n");
992} 1048}
993 1049
1050static __init int setup_disablecpuid(char *arg)
1051{
1052 int bit;
1053 if (get_option(&arg, &bit) && bit < NCAPINTS*32)
1054 setup_clear_cpu_cap(bit);
1055 else
1056 return 0;
1057 return 1;
1058}
1059__setup("clearcpuid=", setup_disablecpuid);
1060
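Two new boot parameters are wired up above: noclflush clears the CLFLUSH feature bit, and clearcpuid=<N> hands an arbitrary capability-bit index (word*32 + bit) to setup_clear_cpu_cap(); the new cleared_cpu_caps[] array carries such clearings into identify_cpu(), which applies them to every CPU's x86_capability words. The index-to-word/bit mapping, with one example value (index 4 is the "tsc" entry in the x86_cap_flags table printed by show_cpuinfo below):

    #include <stdio.h>

    #define NCAPINTS 8   /* capability words in this era's cpufeature.h (assumed) */

    int main(void)
    {
        unsigned bit = 4;                 /* e.g. clearcpuid=4 -> the "tsc" flag */

        if (bit < NCAPINTS * 32) {
            unsigned word = bit / 32;     /* which x86_capability[] word */
            unsigned mask = 1u << (bit % 32);
            printf("clearcpuid=%u clears word %u, mask 0x%08x\n", bit, word, mask);
        }
        return 0;
    }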
994/* 1061/*
995 * Get CPU information for use by the procfs. 1062 * Get CPU information for use by the procfs.
996 */ 1063 */
@@ -998,9 +1065,9 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
998static int show_cpuinfo(struct seq_file *m, void *v) 1065static int show_cpuinfo(struct seq_file *m, void *v)
999{ 1066{
1000 struct cpuinfo_x86 *c = v; 1067 struct cpuinfo_x86 *c = v;
1001 int cpu = 0; 1068 int cpu = 0, i;
1002 1069
1003 /* 1070 /*
1004 * These flag bits must match the definitions in <asm/cpufeature.h>. 1071 * These flag bits must match the definitions in <asm/cpufeature.h>.
1005 * NULL means this bit is undefined or reserved; either way it doesn't 1072 * NULL means this bit is undefined or reserved; either way it doesn't
1006 * have meaning as far as Linux is concerned. Note that it's important 1073 * have meaning as far as Linux is concerned. Note that it's important
@@ -1010,10 +1077,10 @@ static int show_cpuinfo(struct seq_file *m, void *v)
1010 */ 1077 */
1011 static const char *const x86_cap_flags[] = { 1078 static const char *const x86_cap_flags[] = {
1012 /* Intel-defined */ 1079 /* Intel-defined */
1013 "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", 1080 "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
1014 "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", 1081 "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
1015 "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", 1082 "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
1016 "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe", 1083 "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
1017 1084
1018 /* AMD-defined */ 1085 /* AMD-defined */
1019 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 1086 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
@@ -1080,34 +1147,35 @@ static int show_cpuinfo(struct seq_file *m, void *v)
1080 cpu = c->cpu_index; 1147 cpu = c->cpu_index;
1081#endif 1148#endif
1082 1149
1083 seq_printf(m,"processor\t: %u\n" 1150 seq_printf(m, "processor\t: %u\n"
1084 "vendor_id\t: %s\n" 1151 "vendor_id\t: %s\n"
1085 "cpu family\t: %d\n" 1152 "cpu family\t: %d\n"
1086 "model\t\t: %d\n" 1153 "model\t\t: %d\n"
1087 "model name\t: %s\n", 1154 "model name\t: %s\n",
1088 (unsigned)cpu, 1155 (unsigned)cpu,
1089 c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown", 1156 c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
1090 c->x86, 1157 c->x86,
1091 (int)c->x86_model, 1158 (int)c->x86_model,
1092 c->x86_model_id[0] ? c->x86_model_id : "unknown"); 1159 c->x86_model_id[0] ? c->x86_model_id : "unknown");
1093 1160
1094 if (c->x86_mask || c->cpuid_level >= 0) 1161 if (c->x86_mask || c->cpuid_level >= 0)
1095 seq_printf(m, "stepping\t: %d\n", c->x86_mask); 1162 seq_printf(m, "stepping\t: %d\n", c->x86_mask);
1096 else 1163 else
1097 seq_printf(m, "stepping\t: unknown\n"); 1164 seq_printf(m, "stepping\t: unknown\n");
1098 1165
1099 if (cpu_has(c,X86_FEATURE_TSC)) { 1166 if (cpu_has(c, X86_FEATURE_TSC)) {
1100 unsigned int freq = cpufreq_quick_get((unsigned)cpu); 1167 unsigned int freq = cpufreq_quick_get((unsigned)cpu);
1168
1101 if (!freq) 1169 if (!freq)
1102 freq = cpu_khz; 1170 freq = cpu_khz;
1103 seq_printf(m, "cpu MHz\t\t: %u.%03u\n", 1171 seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
1104 freq / 1000, (freq % 1000)); 1172 freq / 1000, (freq % 1000));
1105 } 1173 }
1106 1174
1107 /* Cache size */ 1175 /* Cache size */
1108 if (c->x86_cache_size >= 0) 1176 if (c->x86_cache_size >= 0)
1109 seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size); 1177 seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
1110 1178
1111#ifdef CONFIG_SMP 1179#ifdef CONFIG_SMP
1112 if (smp_num_siblings * c->x86_max_cores > 1) { 1180 if (smp_num_siblings * c->x86_max_cores > 1) {
1113 seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); 1181 seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
@@ -1116,48 +1184,43 @@ static int show_cpuinfo(struct seq_file *m, void *v)
1116 seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); 1184 seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
1117 seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); 1185 seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
1118 } 1186 }
1119#endif 1187#endif
1120 1188
1121 seq_printf(m, 1189 seq_printf(m,
1122 "fpu\t\t: yes\n" 1190 "fpu\t\t: yes\n"
1123 "fpu_exception\t: yes\n" 1191 "fpu_exception\t: yes\n"
1124 "cpuid level\t: %d\n" 1192 "cpuid level\t: %d\n"
1125 "wp\t\t: yes\n" 1193 "wp\t\t: yes\n"
1126 "flags\t\t:", 1194 "flags\t\t:",
1127 c->cpuid_level); 1195 c->cpuid_level);
1128 1196
1129 { 1197 for (i = 0; i < 32*NCAPINTS; i++)
1130 int i; 1198 if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
1131 for ( i = 0 ; i < 32*NCAPINTS ; i++ ) 1199 seq_printf(m, " %s", x86_cap_flags[i]);
1132 if (cpu_has(c, i) && x86_cap_flags[i] != NULL) 1200
1133 seq_printf(m, " %s", x86_cap_flags[i]);
1134 }
1135
1136 seq_printf(m, "\nbogomips\t: %lu.%02lu\n", 1201 seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
1137 c->loops_per_jiffy/(500000/HZ), 1202 c->loops_per_jiffy/(500000/HZ),
1138 (c->loops_per_jiffy/(5000/HZ)) % 100); 1203 (c->loops_per_jiffy/(5000/HZ)) % 100);
1139 1204
1140 if (c->x86_tlbsize > 0) 1205 if (c->x86_tlbsize > 0)
1141 seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); 1206 seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
1142 seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size); 1207 seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
1143 seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); 1208 seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
1144 1209
1145 seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", 1210 seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
1146 c->x86_phys_bits, c->x86_virt_bits); 1211 c->x86_phys_bits, c->x86_virt_bits);
1147 1212
1148 seq_printf(m, "power management:"); 1213 seq_printf(m, "power management:");
1149 { 1214 for (i = 0; i < 32; i++) {
1150 unsigned i; 1215 if (c->x86_power & (1 << i)) {
1151 for (i = 0; i < 32; i++) 1216 if (i < ARRAY_SIZE(x86_power_flags) &&
1152 if (c->x86_power & (1 << i)) { 1217 x86_power_flags[i])
1153 if (i < ARRAY_SIZE(x86_power_flags) && 1218 seq_printf(m, "%s%s",
1154 x86_power_flags[i]) 1219 x86_power_flags[i][0]?" ":"",
1155 seq_printf(m, "%s%s", 1220 x86_power_flags[i]);
1156 x86_power_flags[i][0]?" ":"", 1221 else
1157 x86_power_flags[i]); 1222 seq_printf(m, " [%d]", i);
1158 else 1223 }
1159 seq_printf(m, " [%d]", i);
1160 }
1161 } 1224 }
1162 1225
1163 seq_printf(m, "\n\n"); 1226 seq_printf(m, "\n\n");
@@ -1184,8 +1247,8 @@ static void c_stop(struct seq_file *m, void *v)
1184{ 1247{
1185} 1248}
1186 1249
1187struct seq_operations cpuinfo_op = { 1250const struct seq_operations cpuinfo_op = {
1188 .start =c_start, 1251 .start = c_start,
1189 .next = c_next, 1252 .next = c_next,
1190 .stop = c_stop, 1253 .stop = c_stop,
1191 .show = show_cpuinfo, 1254 .show = show_cpuinfo,
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index 20f29e4c1d33..caee1f002fed 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -23,6 +23,7 @@
23#include <asm/ucontext.h> 23#include <asm/ucontext.h>
24#include <asm/uaccess.h> 24#include <asm/uaccess.h>
25#include <asm/i387.h> 25#include <asm/i387.h>
26#include <asm/vdso.h>
26#include "sigframe_32.h" 27#include "sigframe_32.h"
27 28
28#define DEBUG_SIG 0 29#define DEBUG_SIG 0
@@ -81,14 +82,14 @@ sys_sigaction(int sig, const struct old_sigaction __user *act,
81} 82}
82 83
83asmlinkage int 84asmlinkage int
84sys_sigaltstack(unsigned long ebx) 85sys_sigaltstack(unsigned long bx)
85{ 86{
86 /* This is needed to make gcc realize it doesn't own the "struct pt_regs" */ 87 /* This is needed to make gcc realize it doesn't own the "struct pt_regs" */
87 struct pt_regs *regs = (struct pt_regs *)&ebx; 88 struct pt_regs *regs = (struct pt_regs *)&bx;
88 const stack_t __user *uss = (const stack_t __user *)ebx; 89 const stack_t __user *uss = (const stack_t __user *)bx;
89 stack_t __user *uoss = (stack_t __user *)regs->ecx; 90 stack_t __user *uoss = (stack_t __user *)regs->cx;
90 91
91 return do_sigaltstack(uss, uoss, regs->esp); 92 return do_sigaltstack(uss, uoss, regs->sp);
92} 93}
93 94
94 95
@@ -109,12 +110,12 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax
109#define COPY_SEG(seg) \ 110#define COPY_SEG(seg) \
110 { unsigned short tmp; \ 111 { unsigned short tmp; \
111 err |= __get_user(tmp, &sc->seg); \ 112 err |= __get_user(tmp, &sc->seg); \
112 regs->x##seg = tmp; } 113 regs->seg = tmp; }
113 114
114#define COPY_SEG_STRICT(seg) \ 115#define COPY_SEG_STRICT(seg) \
115 { unsigned short tmp; \ 116 { unsigned short tmp; \
116 err |= __get_user(tmp, &sc->seg); \ 117 err |= __get_user(tmp, &sc->seg); \
117 regs->x##seg = tmp|3; } 118 regs->seg = tmp|3; }
118 119
119#define GET_SEG(seg) \ 120#define GET_SEG(seg) \
120 { unsigned short tmp; \ 121 { unsigned short tmp; \
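The two macro hunks above drop the x##seg token paste: before the register unification the 32-bit pt_regs named its segment fields xfs/xes/xds/xcs/xss, so COPY_SEG had to glue an "x" onto the sigcontext member name; with the unified names both structures agree and a plain regs->seg assignment works. A reduced, self-contained sketch of the post-rename expansion (stand-in structs, with the __get_user error accumulation folded away) looks like this:

/* Not kernel code: minimal stand-ins just to show how COPY_SEG reads
 * once pt_regs and sigcontext use the same member names. */
#include <stdio.h>

struct sigcontext_stub { unsigned short fs, es, ds; };
struct pt_regs_stub    { unsigned long  fs, es, ds; };

#define COPY_SEG(seg)                          \
        { unsigned short tmp = sc->seg;        \
          regs->seg = tmp; }

static void restore_segments(struct pt_regs_stub *regs,
                             const struct sigcontext_stub *sc)
{
        COPY_SEG(fs);
        COPY_SEG(es);
        COPY_SEG(ds);
}

int main(void)
{
        struct sigcontext_stub sc = { .fs = 0x33, .es = 0x2b, .ds = 0x2b };
        struct pt_regs_stub regs = { 0 };

        restore_segments(&regs, &sc);
        printf("fs=%#lx es=%#lx ds=%#lx\n", regs.fs, regs.es, regs.ds);
        return 0;
}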
@@ -130,22 +131,22 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax
130 COPY_SEG(fs); 131 COPY_SEG(fs);
131 COPY_SEG(es); 132 COPY_SEG(es);
132 COPY_SEG(ds); 133 COPY_SEG(ds);
133 COPY(edi); 134 COPY(di);
134 COPY(esi); 135 COPY(si);
135 COPY(ebp); 136 COPY(bp);
136 COPY(esp); 137 COPY(sp);
137 COPY(ebx); 138 COPY(bx);
138 COPY(edx); 139 COPY(dx);
139 COPY(ecx); 140 COPY(cx);
140 COPY(eip); 141 COPY(ip);
141 COPY_SEG_STRICT(cs); 142 COPY_SEG_STRICT(cs);
142 COPY_SEG_STRICT(ss); 143 COPY_SEG_STRICT(ss);
143 144
144 { 145 {
145 unsigned int tmpflags; 146 unsigned int tmpflags;
146 err |= __get_user(tmpflags, &sc->eflags); 147 err |= __get_user(tmpflags, &sc->flags);
147 regs->eflags = (regs->eflags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); 148 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
148 regs->orig_eax = -1; /* disable syscall checks */ 149 regs->orig_ax = -1; /* disable syscall checks */
149 } 150 }
150 151
151 { 152 {
@@ -164,7 +165,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax
164 } 165 }
165 } 166 }
166 167
167 err |= __get_user(*peax, &sc->eax); 168 err |= __get_user(*peax, &sc->ax);
168 return err; 169 return err;
169 170
170badframe: 171badframe:
@@ -174,9 +175,9 @@ badframe:
174asmlinkage int sys_sigreturn(unsigned long __unused) 175asmlinkage int sys_sigreturn(unsigned long __unused)
175{ 176{
176 struct pt_regs *regs = (struct pt_regs *) &__unused; 177 struct pt_regs *regs = (struct pt_regs *) &__unused;
177 struct sigframe __user *frame = (struct sigframe __user *)(regs->esp - 8); 178 struct sigframe __user *frame = (struct sigframe __user *)(regs->sp - 8);
178 sigset_t set; 179 sigset_t set;
179 int eax; 180 int ax;
180 181
181 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) 182 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
182 goto badframe; 183 goto badframe;
@@ -192,17 +193,20 @@ asmlinkage int sys_sigreturn(unsigned long __unused)
192 recalc_sigpending(); 193 recalc_sigpending();
193 spin_unlock_irq(&current->sighand->siglock); 194 spin_unlock_irq(&current->sighand->siglock);
194 195
195 if (restore_sigcontext(regs, &frame->sc, &eax)) 196 if (restore_sigcontext(regs, &frame->sc, &ax))
196 goto badframe; 197 goto badframe;
197 return eax; 198 return ax;
198 199
199badframe: 200badframe:
200 if (show_unhandled_signals && printk_ratelimit()) 201 if (show_unhandled_signals && printk_ratelimit()) {
201 printk("%s%s[%d] bad frame in sigreturn frame:%p eip:%lx" 202 printk("%s%s[%d] bad frame in sigreturn frame:%p ip:%lx"
202 " esp:%lx oeax:%lx\n", 203 " sp:%lx oeax:%lx",
203 task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, 204 task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
204 current->comm, task_pid_nr(current), frame, regs->eip, 205 current->comm, task_pid_nr(current), frame, regs->ip,
205 regs->esp, regs->orig_eax); 206 regs->sp, regs->orig_ax);
207 print_vma_addr(" in ", regs->ip);
208 printk("\n");
209 }
206 210
207 force_sig(SIGSEGV, current); 211 force_sig(SIGSEGV, current);
208 return 0; 212 return 0;
@@ -211,9 +215,9 @@ badframe:
211asmlinkage int sys_rt_sigreturn(unsigned long __unused) 215asmlinkage int sys_rt_sigreturn(unsigned long __unused)
212{ 216{
213 struct pt_regs *regs = (struct pt_regs *) &__unused; 217 struct pt_regs *regs = (struct pt_regs *) &__unused;
214 struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(regs->esp - 4); 218 struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(regs->sp - 4);
215 sigset_t set; 219 sigset_t set;
216 int eax; 220 int ax;
217 221
218 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) 222 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
219 goto badframe; 223 goto badframe;
@@ -226,13 +230,13 @@ asmlinkage int sys_rt_sigreturn(unsigned long __unused)
226 recalc_sigpending(); 230 recalc_sigpending();
227 spin_unlock_irq(&current->sighand->siglock); 231 spin_unlock_irq(&current->sighand->siglock);
228 232
229 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) 233 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
230 goto badframe; 234 goto badframe;
231 235
232 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->esp) == -EFAULT) 236 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT)
233 goto badframe; 237 goto badframe;
234 238
235 return eax; 239 return ax;
236 240
237badframe: 241badframe:
238 force_sig(SIGSEGV, current); 242 force_sig(SIGSEGV, current);
@@ -249,27 +253,27 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
249{ 253{
250 int tmp, err = 0; 254 int tmp, err = 0;
251 255
252 err |= __put_user(regs->xfs, (unsigned int __user *)&sc->fs); 256 err |= __put_user(regs->fs, (unsigned int __user *)&sc->fs);
253 savesegment(gs, tmp); 257 savesegment(gs, tmp);
254 err |= __put_user(tmp, (unsigned int __user *)&sc->gs); 258 err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
255 259
256 err |= __put_user(regs->xes, (unsigned int __user *)&sc->es); 260 err |= __put_user(regs->es, (unsigned int __user *)&sc->es);
257 err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds); 261 err |= __put_user(regs->ds, (unsigned int __user *)&sc->ds);
258 err |= __put_user(regs->edi, &sc->edi); 262 err |= __put_user(regs->di, &sc->di);
259 err |= __put_user(regs->esi, &sc->esi); 263 err |= __put_user(regs->si, &sc->si);
260 err |= __put_user(regs->ebp, &sc->ebp); 264 err |= __put_user(regs->bp, &sc->bp);
261 err |= __put_user(regs->esp, &sc->esp); 265 err |= __put_user(regs->sp, &sc->sp);
262 err |= __put_user(regs->ebx, &sc->ebx); 266 err |= __put_user(regs->bx, &sc->bx);
263 err |= __put_user(regs->edx, &sc->edx); 267 err |= __put_user(regs->dx, &sc->dx);
264 err |= __put_user(regs->ecx, &sc->ecx); 268 err |= __put_user(regs->cx, &sc->cx);
265 err |= __put_user(regs->eax, &sc->eax); 269 err |= __put_user(regs->ax, &sc->ax);
266 err |= __put_user(current->thread.trap_no, &sc->trapno); 270 err |= __put_user(current->thread.trap_no, &sc->trapno);
267 err |= __put_user(current->thread.error_code, &sc->err); 271 err |= __put_user(current->thread.error_code, &sc->err);
268 err |= __put_user(regs->eip, &sc->eip); 272 err |= __put_user(regs->ip, &sc->ip);
269 err |= __put_user(regs->xcs, (unsigned int __user *)&sc->cs); 273 err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs);
270 err |= __put_user(regs->eflags, &sc->eflags); 274 err |= __put_user(regs->flags, &sc->flags);
271 err |= __put_user(regs->esp, &sc->esp_at_signal); 275 err |= __put_user(regs->sp, &sc->sp_at_signal);
272 err |= __put_user(regs->xss, (unsigned int __user *)&sc->ss); 276 err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss);
273 277
274 tmp = save_i387(fpstate); 278 tmp = save_i387(fpstate);
275 if (tmp < 0) 279 if (tmp < 0)
@@ -290,29 +294,36 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
290static inline void __user * 294static inline void __user *
291get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) 295get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size)
292{ 296{
293 unsigned long esp; 297 unsigned long sp;
294 298
295 /* Default to using normal stack */ 299 /* Default to using normal stack */
296 esp = regs->esp; 300 sp = regs->sp;
301
302 /*
303 * If we are on the alternate signal stack and would overflow it, don't.
304 * Return an always-bogus address instead so we will die with SIGSEGV.
305 */
306 if (on_sig_stack(sp) && !likely(on_sig_stack(sp - frame_size)))
307 return (void __user *) -1L;
297 308
298 /* This is the X/Open sanctioned signal stack switching. */ 309 /* This is the X/Open sanctioned signal stack switching. */
299 if (ka->sa.sa_flags & SA_ONSTACK) { 310 if (ka->sa.sa_flags & SA_ONSTACK) {
300 if (sas_ss_flags(esp) == 0) 311 if (sas_ss_flags(sp) == 0)
301 esp = current->sas_ss_sp + current->sas_ss_size; 312 sp = current->sas_ss_sp + current->sas_ss_size;
302 } 313 }
303 314
304 /* This is the legacy signal stack switching. */ 315 /* This is the legacy signal stack switching. */
305 else if ((regs->xss & 0xffff) != __USER_DS && 316 else if ((regs->ss & 0xffff) != __USER_DS &&
306 !(ka->sa.sa_flags & SA_RESTORER) && 317 !(ka->sa.sa_flags & SA_RESTORER) &&
307 ka->sa.sa_restorer) { 318 ka->sa.sa_restorer) {
308 esp = (unsigned long) ka->sa.sa_restorer; 319 sp = (unsigned long) ka->sa.sa_restorer;
309 } 320 }
310 321
311 esp -= frame_size; 322 sp -= frame_size;
312 /* Align the stack pointer according to the i386 ABI, 323 /* Align the stack pointer according to the i386 ABI,
313 * i.e. so that on function entry ((sp + 4) & 15) == 0. */ 324 * i.e. so that on function entry ((sp + 4) & 15) == 0. */
314 esp = ((esp + 4) & -16ul) - 4; 325 sp = ((sp + 4) & -16ul) - 4;
315 return (void __user *) esp; 326 return (void __user *) sp;
316} 327}
317 328
318/* These symbols are defined with the addresses in the vsyscall page. 329/* These symbols are defined with the addresses in the vsyscall page.
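The i386 ABI alignment noted in the hunk above ("so that on function entry ((sp + 4) & 15) == 0") is worth a quick numeric check. A hedged standalone snippet (the sample stack values are arbitrary) verifying that the ((sp + 4) & -16ul) - 4 rounding never raises the stack pointer and always lands on the required alignment:

#include <assert.h>
#include <stdio.h>

int main(void)
{
        unsigned long sp;

        for (sp = 0xbffff000ul; sp < 0xbffff040ul; sp++) {
                unsigned long aligned = ((sp + 4) & -16ul) - 4;

                assert(((aligned + 4) & 15) == 0); /* entry alignment holds */
                assert(aligned <= sp);             /* frame only grows downward */
        }
        printf("((sp + 4) & -16ul) - 4 keeps (sp + 4) 16-byte aligned\n");
        return 0;
}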
@@ -355,9 +366,9 @@ static int setup_frame(int sig, struct k_sigaction *ka,
355 } 366 }
356 367
357 if (current->binfmt->hasvdso) 368 if (current->binfmt->hasvdso)
358 restorer = (void *)VDSO_SYM(&__kernel_sigreturn); 369 restorer = VDSO32_SYMBOL(current->mm->context.vdso, sigreturn);
359 else 370 else
360 restorer = (void *)&frame->retcode; 371 restorer = &frame->retcode;
361 if (ka->sa.sa_flags & SA_RESTORER) 372 if (ka->sa.sa_flags & SA_RESTORER)
362 restorer = ka->sa.sa_restorer; 373 restorer = ka->sa.sa_restorer;
363 374
@@ -379,16 +390,16 @@ static int setup_frame(int sig, struct k_sigaction *ka,
379 goto give_sigsegv; 390 goto give_sigsegv;
380 391
381 /* Set up registers for signal handler */ 392 /* Set up registers for signal handler */
382 regs->esp = (unsigned long) frame; 393 regs->sp = (unsigned long) frame;
383 regs->eip = (unsigned long) ka->sa.sa_handler; 394 regs->ip = (unsigned long) ka->sa.sa_handler;
384 regs->eax = (unsigned long) sig; 395 regs->ax = (unsigned long) sig;
385 regs->edx = (unsigned long) 0; 396 regs->dx = (unsigned long) 0;
386 regs->ecx = (unsigned long) 0; 397 regs->cx = (unsigned long) 0;
387 398
388 regs->xds = __USER_DS; 399 regs->ds = __USER_DS;
389 regs->xes = __USER_DS; 400 regs->es = __USER_DS;
390 regs->xss = __USER_DS; 401 regs->ss = __USER_DS;
391 regs->xcs = __USER_CS; 402 regs->cs = __USER_CS;
392 403
393 /* 404 /*
394 * Clear TF when entering the signal handler, but 405 * Clear TF when entering the signal handler, but
@@ -396,13 +407,13 @@ static int setup_frame(int sig, struct k_sigaction *ka,
396 * The tracer may want to single-step inside the 407 * The tracer may want to single-step inside the
397 * handler too. 408 * handler too.
398 */ 409 */
399 regs->eflags &= ~TF_MASK; 410 regs->flags &= ~TF_MASK;
400 if (test_thread_flag(TIF_SINGLESTEP)) 411 if (test_thread_flag(TIF_SINGLESTEP))
401 ptrace_notify(SIGTRAP); 412 ptrace_notify(SIGTRAP);
402 413
403#if DEBUG_SIG 414#if DEBUG_SIG
404 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", 415 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
405 current->comm, current->pid, frame, regs->eip, frame->pretcode); 416 current->comm, current->pid, frame, regs->ip, frame->pretcode);
406#endif 417#endif
407 418
408 return 0; 419 return 0;
@@ -442,7 +453,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
442 err |= __put_user(0, &frame->uc.uc_flags); 453 err |= __put_user(0, &frame->uc.uc_flags);
443 err |= __put_user(0, &frame->uc.uc_link); 454 err |= __put_user(0, &frame->uc.uc_link);
444 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); 455 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
445 err |= __put_user(sas_ss_flags(regs->esp), 456 err |= __put_user(sas_ss_flags(regs->sp),
446 &frame->uc.uc_stack.ss_flags); 457 &frame->uc.uc_stack.ss_flags);
447 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); 458 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
448 err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, 459 err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate,
@@ -452,13 +463,13 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
452 goto give_sigsegv; 463 goto give_sigsegv;
453 464
454 /* Set up to return from userspace. */ 465 /* Set up to return from userspace. */
455 restorer = (void *)VDSO_SYM(&__kernel_rt_sigreturn); 466 restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
456 if (ka->sa.sa_flags & SA_RESTORER) 467 if (ka->sa.sa_flags & SA_RESTORER)
457 restorer = ka->sa.sa_restorer; 468 restorer = ka->sa.sa_restorer;
458 err |= __put_user(restorer, &frame->pretcode); 469 err |= __put_user(restorer, &frame->pretcode);
459 470
460 /* 471 /*
461 * This is movl $,%eax ; int $0x80 472 * This is movl $,%ax ; int $0x80
462 * 473 *
463 * WE DO NOT USE IT ANY MORE! It's only left here for historical 474 * WE DO NOT USE IT ANY MORE! It's only left here for historical
464 * reasons and because gdb uses it as a signature to notice 475 * reasons and because gdb uses it as a signature to notice
@@ -472,16 +483,16 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
472 goto give_sigsegv; 483 goto give_sigsegv;
473 484
474 /* Set up registers for signal handler */ 485 /* Set up registers for signal handler */
475 regs->esp = (unsigned long) frame; 486 regs->sp = (unsigned long) frame;
476 regs->eip = (unsigned long) ka->sa.sa_handler; 487 regs->ip = (unsigned long) ka->sa.sa_handler;
477 regs->eax = (unsigned long) usig; 488 regs->ax = (unsigned long) usig;
478 regs->edx = (unsigned long) &frame->info; 489 regs->dx = (unsigned long) &frame->info;
479 regs->ecx = (unsigned long) &frame->uc; 490 regs->cx = (unsigned long) &frame->uc;
480 491
481 regs->xds = __USER_DS; 492 regs->ds = __USER_DS;
482 regs->xes = __USER_DS; 493 regs->es = __USER_DS;
483 regs->xss = __USER_DS; 494 regs->ss = __USER_DS;
484 regs->xcs = __USER_CS; 495 regs->cs = __USER_CS;
485 496
486 /* 497 /*
487 * Clear TF when entering the signal handler, but 498 * Clear TF when entering the signal handler, but
@@ -489,13 +500,13 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
489 * The tracer may want to single-step inside the 500 * The tracer may want to single-step inside the
490 * handler too. 501 * handler too.
491 */ 502 */
492 regs->eflags &= ~TF_MASK; 503 regs->flags &= ~TF_MASK;
493 if (test_thread_flag(TIF_SINGLESTEP)) 504 if (test_thread_flag(TIF_SINGLESTEP))
494 ptrace_notify(SIGTRAP); 505 ptrace_notify(SIGTRAP);
495 506
496#if DEBUG_SIG 507#if DEBUG_SIG
497 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", 508 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
498 current->comm, current->pid, frame, regs->eip, frame->pretcode); 509 current->comm, current->pid, frame, regs->ip, frame->pretcode);
499#endif 510#endif
500 511
501 return 0; 512 return 0;
@@ -516,35 +527,33 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
516 int ret; 527 int ret;
517 528
518 /* Are we from a system call? */ 529 /* Are we from a system call? */
519 if (regs->orig_eax >= 0) { 530 if (regs->orig_ax >= 0) {
520 /* If so, check system call restarting.. */ 531 /* If so, check system call restarting.. */
521 switch (regs->eax) { 532 switch (regs->ax) {
522 case -ERESTART_RESTARTBLOCK: 533 case -ERESTART_RESTARTBLOCK:
523 case -ERESTARTNOHAND: 534 case -ERESTARTNOHAND:
524 regs->eax = -EINTR; 535 regs->ax = -EINTR;
525 break; 536 break;
526 537
527 case -ERESTARTSYS: 538 case -ERESTARTSYS:
528 if (!(ka->sa.sa_flags & SA_RESTART)) { 539 if (!(ka->sa.sa_flags & SA_RESTART)) {
529 regs->eax = -EINTR; 540 regs->ax = -EINTR;
530 break; 541 break;
531 } 542 }
532 /* fallthrough */ 543 /* fallthrough */
533 case -ERESTARTNOINTR: 544 case -ERESTARTNOINTR:
534 regs->eax = regs->orig_eax; 545 regs->ax = regs->orig_ax;
535 regs->eip -= 2; 546 regs->ip -= 2;
536 } 547 }
537 } 548 }
538 549
539 /* 550 /*
540 * If TF is set due to a debugger (PT_DTRACE), clear the TF flag so 551 * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF
541 * that register information in the sigcontext is correct. 552 * flag so that register information in the sigcontext is correct.
542 */ 553 */
543 if (unlikely(regs->eflags & TF_MASK) 554 if (unlikely(regs->flags & X86_EFLAGS_TF) &&
544 && likely(current->ptrace & PT_DTRACE)) { 555 likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
545 current->ptrace &= ~PT_DTRACE; 556 regs->flags &= ~X86_EFLAGS_TF;
546 regs->eflags &= ~TF_MASK;
547 }
548 557
549 /* Set up the stack frame */ 558 /* Set up the stack frame */
550 if (ka->sa.sa_flags & SA_SIGINFO) 559 if (ka->sa.sa_flags & SA_SIGINFO)
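The restart logic in handle_signal() above decides, per error code, whether an interrupted system call returns -EINTR to the handler or is re-issued by rewinding the instruction pointer over the two-byte system-call instruction (regs->ip -= 2). A compact sketch of that decision table follows; the kernel-internal ERESTART* values are reproduced from memory for illustration and should be treated as assumptions.

#include <stdio.h>

#define EINTR                   4
#define ERESTARTSYS           512
#define ERESTARTNOINTR        513
#define ERESTARTNOHAND        514
#define ERESTART_RESTARTBLOCK 516

/* Returns the value left in ax; a return equal to orig_ax means
 * "rewind ip by 2 and re-issue the call". */
static long signal_restart_policy(long ax, long orig_ax, int sa_restart)
{
        switch (ax) {
        case -ERESTART_RESTARTBLOCK:
        case -ERESTARTNOHAND:
                return -EINTR;          /* never restarted across a handler */
        case -ERESTARTSYS:
                if (!sa_restart)
                        return -EINTR;  /* handler was not installed with SA_RESTART */
                /* fall through */
        case -ERESTARTNOINTR:
                return orig_ax;         /* restart the interrupted call */
        default:
                return ax;              /* not a restart code: leave it alone */
        }
}

int main(void)
{
        printf("%ld %ld\n",
               signal_restart_policy(-ERESTARTSYS, 3, 0),   /* -> -EINTR */
               signal_restart_policy(-ERESTARTSYS, 3, 1));  /* -> 3, i.e. restarted */
        return 0;
}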
@@ -569,7 +578,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
569 * want to handle. Thus you cannot kill init even with a SIGKILL even by 578 * want to handle. Thus you cannot kill init even with a SIGKILL even by
570 * mistake. 579 * mistake.
571 */ 580 */
572static void fastcall do_signal(struct pt_regs *regs) 581static void do_signal(struct pt_regs *regs)
573{ 582{
574 siginfo_t info; 583 siginfo_t info;
575 int signr; 584 int signr;
@@ -599,8 +608,8 @@ static void fastcall do_signal(struct pt_regs *regs)
599 * have been cleared if the watchpoint triggered 608 * have been cleared if the watchpoint triggered
600 * inside the kernel. 609 * inside the kernel.
601 */ 610 */
602 if (unlikely(current->thread.debugreg[7])) 611 if (unlikely(current->thread.debugreg7))
603 set_debugreg(current->thread.debugreg[7], 7); 612 set_debugreg(current->thread.debugreg7, 7);
604 613
605 /* Whee! Actually deliver the signal. */ 614 /* Whee! Actually deliver the signal. */
606 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { 615 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
@@ -616,19 +625,19 @@ static void fastcall do_signal(struct pt_regs *regs)
616 } 625 }
617 626
618 /* Did we come from a system call? */ 627 /* Did we come from a system call? */
619 if (regs->orig_eax >= 0) { 628 if (regs->orig_ax >= 0) {
620 /* Restart the system call - no handlers present */ 629 /* Restart the system call - no handlers present */
621 switch (regs->eax) { 630 switch (regs->ax) {
622 case -ERESTARTNOHAND: 631 case -ERESTARTNOHAND:
623 case -ERESTARTSYS: 632 case -ERESTARTSYS:
624 case -ERESTARTNOINTR: 633 case -ERESTARTNOINTR:
625 regs->eax = regs->orig_eax; 634 regs->ax = regs->orig_ax;
626 regs->eip -= 2; 635 regs->ip -= 2;
627 break; 636 break;
628 637
629 case -ERESTART_RESTARTBLOCK: 638 case -ERESTART_RESTARTBLOCK:
630 regs->eax = __NR_restart_syscall; 639 regs->ax = __NR_restart_syscall;
631 regs->eip -= 2; 640 regs->ip -= 2;
632 break; 641 break;
633 } 642 }
634 } 643 }
@@ -651,7 +660,7 @@ void do_notify_resume(struct pt_regs *regs, void *_unused,
651{ 660{
652 /* Pending single-step? */ 661 /* Pending single-step? */
653 if (thread_info_flags & _TIF_SINGLESTEP) { 662 if (thread_info_flags & _TIF_SINGLESTEP) {
654 regs->eflags |= TF_MASK; 663 regs->flags |= TF_MASK;
655 clear_thread_flag(TIF_SINGLESTEP); 664 clear_thread_flag(TIF_SINGLESTEP);
656 } 665 }
657 666
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index 38d806467c0f..7347bb14e306 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -39,7 +39,7 @@ asmlinkage long
39sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, 39sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
40 struct pt_regs *regs) 40 struct pt_regs *regs)
41{ 41{
42 return do_sigaltstack(uss, uoss, regs->rsp); 42 return do_sigaltstack(uss, uoss, regs->sp);
43} 43}
44 44
45 45
@@ -64,8 +64,8 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned
64 64
65#define COPY(x) err |= __get_user(regs->x, &sc->x) 65#define COPY(x) err |= __get_user(regs->x, &sc->x)
66 66
67 COPY(rdi); COPY(rsi); COPY(rbp); COPY(rsp); COPY(rbx); 67 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
68 COPY(rdx); COPY(rcx); COPY(rip); 68 COPY(dx); COPY(cx); COPY(ip);
69 COPY(r8); 69 COPY(r8);
70 COPY(r9); 70 COPY(r9);
71 COPY(r10); 71 COPY(r10);
@@ -86,9 +86,9 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned
86 86
87 { 87 {
88 unsigned int tmpflags; 88 unsigned int tmpflags;
89 err |= __get_user(tmpflags, &sc->eflags); 89 err |= __get_user(tmpflags, &sc->flags);
90 regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5); 90 regs->flags = (regs->flags & ~0x40DD5) | (tmpflags & 0x40DD5);
91 regs->orig_rax = -1; /* disable syscall checks */ 91 regs->orig_ax = -1; /* disable syscall checks */
92 } 92 }
93 93
94 { 94 {
@@ -108,7 +108,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned
108 } 108 }
109 } 109 }
110 110
111 err |= __get_user(*prax, &sc->rax); 111 err |= __get_user(*prax, &sc->ax);
112 return err; 112 return err;
113 113
114badframe: 114badframe:
@@ -119,9 +119,9 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
119{ 119{
120 struct rt_sigframe __user *frame; 120 struct rt_sigframe __user *frame;
121 sigset_t set; 121 sigset_t set;
122 unsigned long eax; 122 unsigned long ax;
123 123
124 frame = (struct rt_sigframe __user *)(regs->rsp - 8); 124 frame = (struct rt_sigframe __user *)(regs->sp - 8);
125 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) { 125 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) {
126 goto badframe; 126 goto badframe;
127 } 127 }
@@ -135,17 +135,17 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
135 recalc_sigpending(); 135 recalc_sigpending();
136 spin_unlock_irq(&current->sighand->siglock); 136 spin_unlock_irq(&current->sighand->siglock);
137 137
138 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) 138 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
139 goto badframe; 139 goto badframe;
140 140
141#ifdef DEBUG_SIG 141#ifdef DEBUG_SIG
142 printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs->rip,regs->rsp,frame,eax); 142 printk("%d sigreturn ip:%lx sp:%lx frame:%p ax:%lx\n",current->pid,regs->ip,regs->sp,frame,ax);
143#endif 143#endif
144 144
145 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->rsp) == -EFAULT) 145 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT)
146 goto badframe; 146 goto badframe;
147 147
148 return eax; 148 return ax;
149 149
150badframe: 150badframe:
151 signal_fault(regs,frame,"sigreturn"); 151 signal_fault(regs,frame,"sigreturn");
@@ -165,14 +165,14 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned lo
165 err |= __put_user(0, &sc->gs); 165 err |= __put_user(0, &sc->gs);
166 err |= __put_user(0, &sc->fs); 166 err |= __put_user(0, &sc->fs);
167 167
168 err |= __put_user(regs->rdi, &sc->rdi); 168 err |= __put_user(regs->di, &sc->di);
169 err |= __put_user(regs->rsi, &sc->rsi); 169 err |= __put_user(regs->si, &sc->si);
170 err |= __put_user(regs->rbp, &sc->rbp); 170 err |= __put_user(regs->bp, &sc->bp);
171 err |= __put_user(regs->rsp, &sc->rsp); 171 err |= __put_user(regs->sp, &sc->sp);
172 err |= __put_user(regs->rbx, &sc->rbx); 172 err |= __put_user(regs->bx, &sc->bx);
173 err |= __put_user(regs->rdx, &sc->rdx); 173 err |= __put_user(regs->dx, &sc->dx);
174 err |= __put_user(regs->rcx, &sc->rcx); 174 err |= __put_user(regs->cx, &sc->cx);
175 err |= __put_user(regs->rax, &sc->rax); 175 err |= __put_user(regs->ax, &sc->ax);
176 err |= __put_user(regs->r8, &sc->r8); 176 err |= __put_user(regs->r8, &sc->r8);
177 err |= __put_user(regs->r9, &sc->r9); 177 err |= __put_user(regs->r9, &sc->r9);
178 err |= __put_user(regs->r10, &sc->r10); 178 err |= __put_user(regs->r10, &sc->r10);
@@ -183,8 +183,8 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned lo
183 err |= __put_user(regs->r15, &sc->r15); 183 err |= __put_user(regs->r15, &sc->r15);
184 err |= __put_user(me->thread.trap_no, &sc->trapno); 184 err |= __put_user(me->thread.trap_no, &sc->trapno);
185 err |= __put_user(me->thread.error_code, &sc->err); 185 err |= __put_user(me->thread.error_code, &sc->err);
186 err |= __put_user(regs->rip, &sc->rip); 186 err |= __put_user(regs->ip, &sc->ip);
187 err |= __put_user(regs->eflags, &sc->eflags); 187 err |= __put_user(regs->flags, &sc->flags);
188 err |= __put_user(mask, &sc->oldmask); 188 err |= __put_user(mask, &sc->oldmask);
189 err |= __put_user(me->thread.cr2, &sc->cr2); 189 err |= __put_user(me->thread.cr2, &sc->cr2);
190 190
@@ -198,18 +198,18 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned lo
198static void __user * 198static void __user *
199get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size) 199get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
200{ 200{
201 unsigned long rsp; 201 unsigned long sp;
202 202
203 /* Default to using normal stack - redzone*/ 203 /* Default to using normal stack - redzone*/
204 rsp = regs->rsp - 128; 204 sp = regs->sp - 128;
205 205
206 /* This is the X/Open sanctioned signal stack switching. */ 206 /* This is the X/Open sanctioned signal stack switching. */
207 if (ka->sa.sa_flags & SA_ONSTACK) { 207 if (ka->sa.sa_flags & SA_ONSTACK) {
208 if (sas_ss_flags(rsp) == 0) 208 if (sas_ss_flags(sp) == 0)
209 rsp = current->sas_ss_sp + current->sas_ss_size; 209 sp = current->sas_ss_sp + current->sas_ss_size;
210 } 210 }
211 211
212 return (void __user *)round_down(rsp - size, 16); 212 return (void __user *)round_down(sp - size, 16);
213} 213}
214 214
215static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 215static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
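get_stack() above places the 64-bit signal frame below the 128-byte red zone that the x86-64 ABI reserves beneath sp for leaf code, then rounds down to 16 bytes. A small check of that arithmetic for the default (non-SA_ONSTACK) path; the sample sp and frame size here are made up:

#include <assert.h>
#include <stdio.h>

static unsigned long place_frame(unsigned long sp, unsigned long size)
{
        sp -= 128;                      /* skip the ABI red zone */
        return (sp - size) & ~15ul;     /* round_down(sp - size, 16) */
}

int main(void)
{
        unsigned long sp = 0x7fffffffe123ul;
        unsigned long size = 440;       /* illustrative frame size */
        unsigned long frame = place_frame(sp, size);

        assert((frame & 15) == 0);              /* 16-byte aligned */
        assert(frame + size + 128 <= sp);       /* red zone left untouched */
        printf("frame at %#lx\n", frame);
        return 0;
}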
@@ -246,7 +246,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
246 err |= __put_user(0, &frame->uc.uc_flags); 246 err |= __put_user(0, &frame->uc.uc_flags);
247 err |= __put_user(0, &frame->uc.uc_link); 247 err |= __put_user(0, &frame->uc.uc_link);
248 err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); 248 err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
249 err |= __put_user(sas_ss_flags(regs->rsp), 249 err |= __put_user(sas_ss_flags(regs->sp),
250 &frame->uc.uc_stack.ss_flags); 250 &frame->uc.uc_stack.ss_flags);
251 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); 251 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
252 err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me); 252 err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
@@ -271,21 +271,21 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
271 goto give_sigsegv; 271 goto give_sigsegv;
272 272
273#ifdef DEBUG_SIG 273#ifdef DEBUG_SIG
274 printk("%d old rip %lx old rsp %lx old rax %lx\n", current->pid,regs->rip,regs->rsp,regs->rax); 274 printk("%d old ip %lx old sp %lx old ax %lx\n", current->pid,regs->ip,regs->sp,regs->ax);
275#endif 275#endif
276 276
277 /* Set up registers for signal handler */ 277 /* Set up registers for signal handler */
278 regs->rdi = sig; 278 regs->di = sig;
279 /* In case the signal handler was declared without prototypes */ 279 /* In case the signal handler was declared without prototypes */
280 regs->rax = 0; 280 regs->ax = 0;
281 281
282 /* This also works for non SA_SIGINFO handlers because they expect the 282 /* This also works for non SA_SIGINFO handlers because they expect the
283 next argument after the signal number on the stack. */ 283 next argument after the signal number on the stack. */
284 regs->rsi = (unsigned long)&frame->info; 284 regs->si = (unsigned long)&frame->info;
285 regs->rdx = (unsigned long)&frame->uc; 285 regs->dx = (unsigned long)&frame->uc;
286 regs->rip = (unsigned long) ka->sa.sa_handler; 286 regs->ip = (unsigned long) ka->sa.sa_handler;
287 287
288 regs->rsp = (unsigned long)frame; 288 regs->sp = (unsigned long)frame;
289 289
290 /* Set up the CS register to run signal handlers in 64-bit mode, 290 /* Set up the CS register to run signal handlers in 64-bit mode,
291 even if the handler happens to be interrupting 32-bit code. */ 291 even if the handler happens to be interrupting 32-bit code. */
@@ -295,12 +295,12 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
295 see include/asm-x86_64/uaccess.h for details. */ 295 see include/asm-x86_64/uaccess.h for details. */
296 set_fs(USER_DS); 296 set_fs(USER_DS);
297 297
298 regs->eflags &= ~TF_MASK; 298 regs->flags &= ~X86_EFLAGS_TF;
299 if (test_thread_flag(TIF_SINGLESTEP)) 299 if (test_thread_flag(TIF_SINGLESTEP))
300 ptrace_notify(SIGTRAP); 300 ptrace_notify(SIGTRAP);
301#ifdef DEBUG_SIG 301#ifdef DEBUG_SIG
302 printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%p\n", 302 printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%p\n",
303 current->comm, current->pid, frame, regs->rip, frame->pretcode); 303 current->comm, current->pid, frame, regs->ip, frame->pretcode);
304#endif 304#endif
305 305
306 return 0; 306 return 0;
@@ -321,44 +321,40 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
321 int ret; 321 int ret;
322 322
323#ifdef DEBUG_SIG 323#ifdef DEBUG_SIG
324 printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n", 324 printk("handle_signal pid:%d sig:%lu ip:%lx sp:%lx regs=%p\n",
325 current->pid, sig, 325 current->pid, sig,
326 regs->rip, regs->rsp, regs); 326 regs->ip, regs->sp, regs);
327#endif 327#endif
328 328
329 /* Are we from a system call? */ 329 /* Are we from a system call? */
330 if ((long)regs->orig_rax >= 0) { 330 if ((long)regs->orig_ax >= 0) {
331 /* If so, check system call restarting.. */ 331 /* If so, check system call restarting.. */
332 switch (regs->rax) { 332 switch (regs->ax) {
333 case -ERESTART_RESTARTBLOCK: 333 case -ERESTART_RESTARTBLOCK:
334 case -ERESTARTNOHAND: 334 case -ERESTARTNOHAND:
335 regs->rax = -EINTR; 335 regs->ax = -EINTR;
336 break; 336 break;
337 337
338 case -ERESTARTSYS: 338 case -ERESTARTSYS:
339 if (!(ka->sa.sa_flags & SA_RESTART)) { 339 if (!(ka->sa.sa_flags & SA_RESTART)) {
340 regs->rax = -EINTR; 340 regs->ax = -EINTR;
341 break; 341 break;
342 } 342 }
343 /* fallthrough */ 343 /* fallthrough */
344 case -ERESTARTNOINTR: 344 case -ERESTARTNOINTR:
345 regs->rax = regs->orig_rax; 345 regs->ax = regs->orig_ax;
346 regs->rip -= 2; 346 regs->ip -= 2;
347 break; 347 break;
348 } 348 }
349 } 349 }
350 350
351 /* 351 /*
352 * If TF is set due to a debugger (PT_DTRACE), clear the TF 352 * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF
353 * flag so that register information in the sigcontext is 353 * flag so that register information in the sigcontext is correct.
354 * correct.
355 */ 354 */
356 if (unlikely(regs->eflags & TF_MASK)) { 355 if (unlikely(regs->flags & X86_EFLAGS_TF) &&
357 if (likely(current->ptrace & PT_DTRACE)) { 356 likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
358 current->ptrace &= ~PT_DTRACE; 357 regs->flags &= ~X86_EFLAGS_TF;
359 regs->eflags &= ~TF_MASK;
360 }
361 }
362 358
363#ifdef CONFIG_IA32_EMULATION 359#ifdef CONFIG_IA32_EMULATION
364 if (test_thread_flag(TIF_IA32)) { 360 if (test_thread_flag(TIF_IA32)) {
@@ -430,21 +426,21 @@ static void do_signal(struct pt_regs *regs)
430 } 426 }
431 427
432 /* Did we come from a system call? */ 428 /* Did we come from a system call? */
433 if ((long)regs->orig_rax >= 0) { 429 if ((long)regs->orig_ax >= 0) {
434 /* Restart the system call - no handlers present */ 430 /* Restart the system call - no handlers present */
435 long res = regs->rax; 431 long res = regs->ax;
436 switch (res) { 432 switch (res) {
437 case -ERESTARTNOHAND: 433 case -ERESTARTNOHAND:
438 case -ERESTARTSYS: 434 case -ERESTARTSYS:
439 case -ERESTARTNOINTR: 435 case -ERESTARTNOINTR:
440 regs->rax = regs->orig_rax; 436 regs->ax = regs->orig_ax;
441 regs->rip -= 2; 437 regs->ip -= 2;
442 break; 438 break;
443 case -ERESTART_RESTARTBLOCK: 439 case -ERESTART_RESTARTBLOCK:
444 regs->rax = test_thread_flag(TIF_IA32) ? 440 regs->ax = test_thread_flag(TIF_IA32) ?
445 __NR_ia32_restart_syscall : 441 __NR_ia32_restart_syscall :
446 __NR_restart_syscall; 442 __NR_restart_syscall;
447 regs->rip -= 2; 443 regs->ip -= 2;
448 break; 444 break;
449 } 445 }
450 } 446 }
@@ -461,13 +457,13 @@ void
461do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 457do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
462{ 458{
463#ifdef DEBUG_SIG 459#ifdef DEBUG_SIG
464 printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%p pending:%x\n", 460 printk("do_notify_resume flags:%x ip:%lx sp:%lx caller:%p pending:%x\n",
465 thread_info_flags, regs->rip, regs->rsp, __builtin_return_address(0),signal_pending(current)); 461 thread_info_flags, regs->ip, regs->sp, __builtin_return_address(0),signal_pending(current));
466#endif 462#endif
467 463
468 /* Pending single-step? */ 464 /* Pending single-step? */
469 if (thread_info_flags & _TIF_SINGLESTEP) { 465 if (thread_info_flags & _TIF_SINGLESTEP) {
470 regs->eflags |= TF_MASK; 466 regs->flags |= X86_EFLAGS_TF;
471 clear_thread_flag(TIF_SINGLESTEP); 467 clear_thread_flag(TIF_SINGLESTEP);
472 } 468 }
473 469
@@ -488,9 +484,12 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
488void signal_fault(struct pt_regs *regs, void __user *frame, char *where) 484void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
489{ 485{
490 struct task_struct *me = current; 486 struct task_struct *me = current;
491 if (show_unhandled_signals && printk_ratelimit()) 487 if (show_unhandled_signals && printk_ratelimit()) {
492 printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n", 488 printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
493 me->comm,me->pid,where,frame,regs->rip,regs->rsp,regs->orig_rax); 489 me->comm,me->pid,where,frame,regs->ip,regs->sp,regs->orig_ax);
490 print_vma_addr(" in ", regs->ip);
491 printk("\n");
492 }
494 493
495 force_sig(SIGSEGV, me); 494 force_sig(SIGSEGV, me);
496} 495}
diff --git a/arch/x86/kernel/smp_32.c b/arch/x86/kernel/smp_32.c
index fcaa026eb807..dc0cde9d16fb 100644
--- a/arch/x86/kernel/smp_32.c
+++ b/arch/x86/kernel/smp_32.c
@@ -159,7 +159,7 @@ void __send_IPI_shortcut(unsigned int shortcut, int vector)
159 apic_write_around(APIC_ICR, cfg); 159 apic_write_around(APIC_ICR, cfg);
160} 160}
161 161
162void fastcall send_IPI_self(int vector) 162void send_IPI_self(int vector)
163{ 163{
164 __send_IPI_shortcut(APIC_DEST_SELF, vector); 164 __send_IPI_shortcut(APIC_DEST_SELF, vector);
165} 165}
@@ -223,7 +223,7 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector)
223 */ 223 */
224 224
225 local_irq_save(flags); 225 local_irq_save(flags);
226 for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) { 226 for_each_possible_cpu(query_cpu) {
227 if (cpu_isset(query_cpu, mask)) { 227 if (cpu_isset(query_cpu, mask)) {
228 __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), 228 __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu),
229 vector); 229 vector);
@@ -256,13 +256,14 @@ static DEFINE_SPINLOCK(tlbstate_lock);
256 * We need to reload %cr3 since the page tables may be going 256 * We need to reload %cr3 since the page tables may be going
257 * away from under us.. 257 * away from under us..
258 */ 258 */
259void leave_mm(unsigned long cpu) 259void leave_mm(int cpu)
260{ 260{
261 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) 261 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
262 BUG(); 262 BUG();
263 cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask); 263 cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
264 load_cr3(swapper_pg_dir); 264 load_cr3(swapper_pg_dir);
265} 265}
266EXPORT_SYMBOL_GPL(leave_mm);
266 267
267/* 268/*
268 * 269 *
@@ -310,7 +311,7 @@ void leave_mm(unsigned long cpu)
310 * 2) Leave the mm if we are in the lazy tlb mode. 311 * 2) Leave the mm if we are in the lazy tlb mode.
311 */ 312 */
312 313
313fastcall void smp_invalidate_interrupt(struct pt_regs *regs) 314void smp_invalidate_interrupt(struct pt_regs *regs)
314{ 315{
315 unsigned long cpu; 316 unsigned long cpu;
316 317
@@ -638,13 +639,13 @@ static void native_smp_send_stop(void)
638 * all the work is done automatically when 639 * all the work is done automatically when
639 * we return from the interrupt. 640 * we return from the interrupt.
640 */ 641 */
641fastcall void smp_reschedule_interrupt(struct pt_regs *regs) 642void smp_reschedule_interrupt(struct pt_regs *regs)
642{ 643{
643 ack_APIC_irq(); 644 ack_APIC_irq();
644 __get_cpu_var(irq_stat).irq_resched_count++; 645 __get_cpu_var(irq_stat).irq_resched_count++;
645} 646}
646 647
647fastcall void smp_call_function_interrupt(struct pt_regs *regs) 648void smp_call_function_interrupt(struct pt_regs *regs)
648{ 649{
649 void (*func) (void *info) = call_data->func; 650 void (*func) (void *info) = call_data->func;
650 void *info = call_data->info; 651 void *info = call_data->info;
@@ -675,7 +676,7 @@ static int convert_apicid_to_cpu(int apic_id)
675{ 676{
676 int i; 677 int i;
677 678
678 for (i = 0; i < NR_CPUS; i++) { 679 for_each_possible_cpu(i) {
679 if (per_cpu(x86_cpu_to_apicid, i) == apic_id) 680 if (per_cpu(x86_cpu_to_apicid, i) == apic_id)
680 return i; 681 return i;
681 } 682 }
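The two hunks above replace open-coded "for (i = 0; i < NR_CPUS; i++)" scans with for_each_possible_cpu(), which only visits CPUs present in cpu_possible_map. A toy, stand-alone rendering of that iterator (the mask is a plain 64-bit word here, not the kernel's cpumask_t):

#include <stdio.h>

#define NR_CPUS 64

/* Stand-in for cpu_possible_map: a simple bitmask of possible CPUs. */
static unsigned long long cpu_possible_map = 0xfull;    /* CPUs 0-3 */

static int next_possible_cpu(int cpu)
{
        for (cpu++; cpu < NR_CPUS; cpu++)
                if (cpu_possible_map & (1ull << cpu))
                        return cpu;
        return NR_CPUS;
}

#define for_each_possible_cpu(cpu) \
        for ((cpu) = next_possible_cpu(-1); (cpu) < NR_CPUS; (cpu) = next_possible_cpu(cpu))

int main(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                printf("visiting cpu %d\n", cpu);
        return 0;
}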
diff --git a/arch/x86/kernel/smp_64.c b/arch/x86/kernel/smp_64.c
index 03fa6ed559c6..2fd74b06db67 100644
--- a/arch/x86/kernel/smp_64.c
+++ b/arch/x86/kernel/smp_64.c
@@ -29,7 +29,7 @@
29#include <asm/idle.h> 29#include <asm/idle.h>
30 30
31/* 31/*
32 * Smarter SMP flushing macros. 32 * Smarter SMP flushing macros.
33 * c/o Linus Torvalds. 33 * c/o Linus Torvalds.
34 * 34 *
35 * These mean you can really definitely utterly forget about 35 * These mean you can really definitely utterly forget about
@@ -37,15 +37,15 @@
37 * 37 *
38 * Optimizations Manfred Spraul <manfred@colorfullife.com> 38 * Optimizations Manfred Spraul <manfred@colorfullife.com>
39 * 39 *
40 * More scalable flush, from Andi Kleen 40 * More scalable flush, from Andi Kleen
41 * 41 *
42 * To avoid global state use 8 different call vectors. 42 * To avoid global state use 8 different call vectors.
43 * Each CPU uses a specific vector to trigger flushes on other 43 * Each CPU uses a specific vector to trigger flushes on other
44 * CPUs. Depending on the received vector the target CPUs look into 44 * CPUs. Depending on the received vector the target CPUs look into
45 * the right per cpu variable for the flush data. 45 * the right per cpu variable for the flush data.
46 * 46 *
47 * With more than 8 CPUs they are hashed to the 8 available 47 * With more than 8 CPUs they are hashed to the 8 available
48 * vectors. The limited global vector space forces us to this right now. 48 * vectors. The limited global vector space forces us to this right now.
49 * In future when interrupts are split into per CPU domains this could be 49 * In future when interrupts are split into per CPU domains this could be
50 * fixed, at the cost of triggering multiple IPIs in some cases. 50 * fixed, at the cost of triggering multiple IPIs in some cases.
51 */ 51 */
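The comment block above explains that TLB shootdowns use eight invalidate vectors so the flush state can be kept per vector rather than behind one global lock, with senders hashed onto a vector once there are more than eight CPUs. The hash is just a modulo, as the sender selection later in this file shows; a trivial sketch, assuming NUM_INVALIDATE_TLB_VECTORS is 8:

#include <stdio.h>

#define NUM_INVALIDATE_TLB_VECTORS 8

static int flush_vector_for_cpu(int cpu)
{
        return cpu % NUM_INVALIDATE_TLB_VECTORS;
}

int main(void)
{
        int cpu;

        for (cpu = 0; cpu < 16; cpu++)
                printf("cpu %2d -> invalidate vector slot %d\n",
                       cpu, flush_vector_for_cpu(cpu));
        return 0;
}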
@@ -55,7 +55,6 @@ union smp_flush_state {
55 cpumask_t flush_cpumask; 55 cpumask_t flush_cpumask;
56 struct mm_struct *flush_mm; 56 struct mm_struct *flush_mm;
57 unsigned long flush_va; 57 unsigned long flush_va;
58#define FLUSH_ALL -1ULL
59 spinlock_t tlbstate_lock; 58 spinlock_t tlbstate_lock;
60 }; 59 };
61 char pad[SMP_CACHE_BYTES]; 60 char pad[SMP_CACHE_BYTES];
@@ -67,16 +66,17 @@ union smp_flush_state {
67static DEFINE_PER_CPU(union smp_flush_state, flush_state); 66static DEFINE_PER_CPU(union smp_flush_state, flush_state);
68 67
69/* 68/*
70 * We cannot call mmdrop() because we are in interrupt context, 69 * We cannot call mmdrop() because we are in interrupt context,
71 * instead update mm->cpu_vm_mask. 70 * instead update mm->cpu_vm_mask.
72 */ 71 */
73static inline void leave_mm(int cpu) 72void leave_mm(int cpu)
74{ 73{
75 if (read_pda(mmu_state) == TLBSTATE_OK) 74 if (read_pda(mmu_state) == TLBSTATE_OK)
76 BUG(); 75 BUG();
77 cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask); 76 cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
78 load_cr3(swapper_pg_dir); 77 load_cr3(swapper_pg_dir);
79} 78}
79EXPORT_SYMBOL_GPL(leave_mm);
80 80
81/* 81/*
82 * 82 *
@@ -85,25 +85,25 @@ static inline void leave_mm(int cpu)
85 * 1) switch_mm() either 1a) or 1b) 85 * 1) switch_mm() either 1a) or 1b)
86 * 1a) thread switch to a different mm 86 * 1a) thread switch to a different mm
87 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); 87 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
88 * Stop ipi delivery for the old mm. This is not synchronized with 88 * Stop ipi delivery for the old mm. This is not synchronized with
89 * the other cpus, but smp_invalidate_interrupt ignore flush ipis 89 * the other cpus, but smp_invalidate_interrupt ignore flush ipis
90 * for the wrong mm, and in the worst case we perform a superfluous 90 * for the wrong mm, and in the worst case we perform a superfluous
91 * tlb flush. 91 * tlb flush.
92 * 1a2) set cpu mmu_state to TLBSTATE_OK 92 * 1a2) set cpu mmu_state to TLBSTATE_OK
93 * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 93 * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
94 * was in lazy tlb mode. 94 * was in lazy tlb mode.
95 * 1a3) update cpu active_mm 95 * 1a3) update cpu active_mm
96 * Now cpu0 accepts tlb flushes for the new mm. 96 * Now cpu0 accepts tlb flushes for the new mm.
97 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); 97 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
98 * Now the other cpus will send tlb flush ipis. 98 * Now the other cpus will send tlb flush ipis.
99 * 1a4) change cr3. 99 * 1a4) change cr3.
100 * 1b) thread switch without mm change 100 * 1b) thread switch without mm change
101 * cpu active_mm is correct, cpu0 already handles 101 * cpu active_mm is correct, cpu0 already handles
102 * flush ipis. 102 * flush ipis.
103 * 1b1) set cpu mmu_state to TLBSTATE_OK 103 * 1b1) set cpu mmu_state to TLBSTATE_OK
104 * 1b2) test_and_set the cpu bit in cpu_vm_mask. 104 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
105 * Atomically set the bit [other cpus will start sending flush ipis], 105 * Atomically set the bit [other cpus will start sending flush ipis],
106 * and test the bit. 106 * and test the bit.
107 * 1b3) if the bit was 0: leave_mm was called, flush the tlb. 107 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
108 * 2) switch %%esp, ie current 108 * 2) switch %%esp, ie current
109 * 109 *
@@ -137,12 +137,12 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
137 * orig_rax contains the negated interrupt vector. 137 * orig_rax contains the negated interrupt vector.
138 * Use that to determine where the sender put the data. 138 * Use that to determine where the sender put the data.
139 */ 139 */
140 sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START; 140 sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
141 f = &per_cpu(flush_state, sender); 141 f = &per_cpu(flush_state, sender);
142 142
143 if (!cpu_isset(cpu, f->flush_cpumask)) 143 if (!cpu_isset(cpu, f->flush_cpumask))
144 goto out; 144 goto out;
145 /* 145 /*
146 * This was a BUG() but until someone can quote me the 146 * This was a BUG() but until someone can quote me the
147 * line from the intel manual that guarantees an IPI to 147 * line from the intel manual that guarantees an IPI to
148 * multiple CPUs is retried _only_ on the erroring CPUs 148 * multiple CPUs is retried _only_ on the erroring CPUs
@@ -150,10 +150,10 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
150 * 150 *
151 * BUG(); 151 * BUG();
152 */ 152 */
153 153
154 if (f->flush_mm == read_pda(active_mm)) { 154 if (f->flush_mm == read_pda(active_mm)) {
155 if (read_pda(mmu_state) == TLBSTATE_OK) { 155 if (read_pda(mmu_state) == TLBSTATE_OK) {
156 if (f->flush_va == FLUSH_ALL) 156 if (f->flush_va == TLB_FLUSH_ALL)
157 local_flush_tlb(); 157 local_flush_tlb();
158 else 158 else
159 __flush_tlb_one(f->flush_va); 159 __flush_tlb_one(f->flush_va);
@@ -166,19 +166,22 @@ out:
166 add_pda(irq_tlb_count, 1); 166 add_pda(irq_tlb_count, 1);
167} 167}
168 168
169static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, 169void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
170 unsigned long va) 170 unsigned long va)
171{ 171{
172 int sender; 172 int sender;
173 union smp_flush_state *f; 173 union smp_flush_state *f;
174 cpumask_t cpumask = *cpumaskp;
174 175
175 /* Caller has disabled preemption */ 176 /* Caller has disabled preemption */
176 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; 177 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
177 f = &per_cpu(flush_state, sender); 178 f = &per_cpu(flush_state, sender);
178 179
179 /* Could avoid this lock when 180 /*
180 num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is 181 * Could avoid this lock when
181 probably not worth checking this for a cache-hot lock. */ 182 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
183 * probably not worth checking this for a cache-hot lock.
184 */
182 spin_lock(&f->tlbstate_lock); 185 spin_lock(&f->tlbstate_lock);
183 186
184 f->flush_mm = mm; 187 f->flush_mm = mm;
@@ -202,14 +205,14 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
202int __cpuinit init_smp_flush(void) 205int __cpuinit init_smp_flush(void)
203{ 206{
204 int i; 207 int i;
208
205 for_each_cpu_mask(i, cpu_possible_map) { 209 for_each_cpu_mask(i, cpu_possible_map) {
206 spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock); 210 spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
207 } 211 }
208 return 0; 212 return 0;
209} 213}
210
211core_initcall(init_smp_flush); 214core_initcall(init_smp_flush);
212 215
213void flush_tlb_current_task(void) 216void flush_tlb_current_task(void)
214{ 217{
215 struct mm_struct *mm = current->mm; 218 struct mm_struct *mm = current->mm;
@@ -221,10 +224,9 @@ void flush_tlb_current_task(void)
221 224
222 local_flush_tlb(); 225 local_flush_tlb();
223 if (!cpus_empty(cpu_mask)) 226 if (!cpus_empty(cpu_mask))
224 flush_tlb_others(cpu_mask, mm, FLUSH_ALL); 227 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
225 preempt_enable(); 228 preempt_enable();
226} 229}
227EXPORT_SYMBOL(flush_tlb_current_task);
228 230
229void flush_tlb_mm (struct mm_struct * mm) 231void flush_tlb_mm (struct mm_struct * mm)
230{ 232{
@@ -241,11 +243,10 @@ void flush_tlb_mm (struct mm_struct * mm)
241 leave_mm(smp_processor_id()); 243 leave_mm(smp_processor_id());
242 } 244 }
243 if (!cpus_empty(cpu_mask)) 245 if (!cpus_empty(cpu_mask))
244 flush_tlb_others(cpu_mask, mm, FLUSH_ALL); 246 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
245 247
246 preempt_enable(); 248 preempt_enable();
247} 249}
248EXPORT_SYMBOL(flush_tlb_mm);
249 250
250void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) 251void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
251{ 252{
@@ -259,8 +260,8 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
259 if (current->active_mm == mm) { 260 if (current->active_mm == mm) {
260 if(current->mm) 261 if(current->mm)
261 __flush_tlb_one(va); 262 __flush_tlb_one(va);
262 else 263 else
263 leave_mm(smp_processor_id()); 264 leave_mm(smp_processor_id());
264 } 265 }
265 266
266 if (!cpus_empty(cpu_mask)) 267 if (!cpus_empty(cpu_mask))
@@ -268,7 +269,6 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
268 269
269 preempt_enable(); 270 preempt_enable();
270} 271}
271EXPORT_SYMBOL(flush_tlb_page);
272 272
273static void do_flush_tlb_all(void* info) 273static void do_flush_tlb_all(void* info)
274{ 274{
@@ -325,11 +325,9 @@ void unlock_ipi_call_lock(void)
325 * this function sends a 'generic call function' IPI to all other CPU 325 * this function sends a 'generic call function' IPI to all other CPU
326 * of the system defined in the mask. 326 * of the system defined in the mask.
327 */ 327 */
328 328static int __smp_call_function_mask(cpumask_t mask,
329static int 329 void (*func)(void *), void *info,
330__smp_call_function_mask(cpumask_t mask, 330 int wait)
331 void (*func)(void *), void *info,
332 int wait)
333{ 331{
334 struct call_data_struct data; 332 struct call_data_struct data;
335 cpumask_t allbutself; 333 cpumask_t allbutself;
@@ -417,11 +415,10 @@ EXPORT_SYMBOL(smp_call_function_mask);
417 */ 415 */
418 416
419int smp_call_function_single (int cpu, void (*func) (void *info), void *info, 417int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
420 int nonatomic, int wait) 418 int nonatomic, int wait)
421{ 419{
422 /* prevent preemption and reschedule on another processor */ 420 /* prevent preemption and reschedule on another processor */
423 int ret; 421 int ret, me = get_cpu();
424 int me = get_cpu();
425 422
426 /* Can deadlock when called with interrupts disabled */ 423 /* Can deadlock when called with interrupts disabled */
427 WARN_ON(irqs_disabled()); 424 WARN_ON(irqs_disabled());
@@ -471,9 +468,9 @@ static void stop_this_cpu(void *dummy)
471 */ 468 */
472 cpu_clear(smp_processor_id(), cpu_online_map); 469 cpu_clear(smp_processor_id(), cpu_online_map);
473 disable_local_APIC(); 470 disable_local_APIC();
474 for (;;) 471 for (;;)
475 halt(); 472 halt();
476} 473}
477 474
478void smp_send_stop(void) 475void smp_send_stop(void)
479{ 476{
diff --git a/arch/x86/kernel/smpboot_32.c b/arch/x86/kernel/smpboot_32.c
index 4ea80cbe52e5..5787a0c3e296 100644
--- a/arch/x86/kernel/smpboot_32.c
+++ b/arch/x86/kernel/smpboot_32.c
@@ -83,7 +83,6 @@ EXPORT_SYMBOL(cpu_online_map);
83 83
84cpumask_t cpu_callin_map; 84cpumask_t cpu_callin_map;
85cpumask_t cpu_callout_map; 85cpumask_t cpu_callout_map;
86EXPORT_SYMBOL(cpu_callout_map);
87cpumask_t cpu_possible_map; 86cpumask_t cpu_possible_map;
88EXPORT_SYMBOL(cpu_possible_map); 87EXPORT_SYMBOL(cpu_possible_map);
89static cpumask_t smp_commenced_mask; 88static cpumask_t smp_commenced_mask;
@@ -92,15 +91,10 @@ static cpumask_t smp_commenced_mask;
92DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); 91DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
93EXPORT_PER_CPU_SYMBOL(cpu_info); 92EXPORT_PER_CPU_SYMBOL(cpu_info);
94 93
95/* 94/* which logical CPU number maps to which CPU (physical APIC ID) */
96 * The following static array is used during kernel startup
97 * and the x86_cpu_to_apicid_ptr contains the address of the
98 * array during this time. Is it zeroed when the per_cpu
99 * data area is removed.
100 */
101u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata = 95u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata =
102 { [0 ... NR_CPUS-1] = BAD_APICID }; 96 { [0 ... NR_CPUS-1] = BAD_APICID };
103void *x86_cpu_to_apicid_ptr; 97void *x86_cpu_to_apicid_early_ptr;
104DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID; 98DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID;
105EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid); 99EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
106 100
@@ -113,7 +107,6 @@ u8 apicid_2_node[MAX_APICID];
113extern const unsigned char trampoline_data []; 107extern const unsigned char trampoline_data [];
114extern const unsigned char trampoline_end []; 108extern const unsigned char trampoline_end [];
115static unsigned char *trampoline_base; 109static unsigned char *trampoline_base;
116static int trampoline_exec;
117 110
118static void map_cpu_to_logical_apicid(void); 111static void map_cpu_to_logical_apicid(void);
119 112
@@ -138,17 +131,13 @@ static unsigned long __cpuinit setup_trampoline(void)
138 */ 131 */
139void __init smp_alloc_memory(void) 132void __init smp_alloc_memory(void)
140{ 133{
141 trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE); 134 trampoline_base = alloc_bootmem_low_pages(PAGE_SIZE);
142 /* 135 /*
143 * Has to be in very low memory so we can execute 136 * Has to be in very low memory so we can execute
144 * real-mode AP code. 137 * real-mode AP code.
145 */ 138 */
146 if (__pa(trampoline_base) >= 0x9F000) 139 if (__pa(trampoline_base) >= 0x9F000)
147 BUG(); 140 BUG();
148 /*
149 * Make the SMP trampoline executable:
150 */
151 trampoline_exec = set_kernel_exec((unsigned long)trampoline_base, 1);
152} 141}
153 142
154/* 143/*
@@ -405,7 +394,7 @@ static void __cpuinit start_secondary(void *unused)
405 setup_secondary_clock(); 394 setup_secondary_clock();
406 if (nmi_watchdog == NMI_IO_APIC) { 395 if (nmi_watchdog == NMI_IO_APIC) {
407 disable_8259A_irq(0); 396 disable_8259A_irq(0);
408 enable_NMI_through_LVT0(NULL); 397 enable_NMI_through_LVT0();
409 enable_8259A_irq(0); 398 enable_8259A_irq(0);
410 } 399 }
411 /* 400 /*
@@ -448,38 +437,38 @@ void __devinit initialize_secondary(void)
448{ 437{
449 /* 438 /*
450 * We don't actually need to load the full TSS, 439 * We don't actually need to load the full TSS,
451 * basically just the stack pointer and the eip. 440 * basically just the stack pointer and the ip.
452 */ 441 */
453 442
454 asm volatile( 443 asm volatile(
455 "movl %0,%%esp\n\t" 444 "movl %0,%%esp\n\t"
456 "jmp *%1" 445 "jmp *%1"
457 : 446 :
458 :"m" (current->thread.esp),"m" (current->thread.eip)); 447 :"m" (current->thread.sp),"m" (current->thread.ip));
459} 448}
460 449
461/* Static state in head.S used to set up a CPU */ 450/* Static state in head.S used to set up a CPU */
462extern struct { 451extern struct {
463 void * esp; 452 void * sp;
464 unsigned short ss; 453 unsigned short ss;
465} stack_start; 454} stack_start;
466 455
467#ifdef CONFIG_NUMA 456#ifdef CONFIG_NUMA
468 457
469/* which logical CPUs are on which nodes */ 458/* which logical CPUs are on which nodes */
470cpumask_t node_2_cpu_mask[MAX_NUMNODES] __read_mostly = 459cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly =
471 { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE }; 460 { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE };
472EXPORT_SYMBOL(node_2_cpu_mask); 461EXPORT_SYMBOL(node_to_cpumask_map);
473/* which node each logical CPU is on */ 462/* which node each logical CPU is on */
474int cpu_2_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 }; 463int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
475EXPORT_SYMBOL(cpu_2_node); 464EXPORT_SYMBOL(cpu_to_node_map);
476 465
477/* set up a mapping between cpu and node. */ 466/* set up a mapping between cpu and node. */
478static inline void map_cpu_to_node(int cpu, int node) 467static inline void map_cpu_to_node(int cpu, int node)
479{ 468{
480 printk("Mapping cpu %d to node %d\n", cpu, node); 469 printk("Mapping cpu %d to node %d\n", cpu, node);
481 cpu_set(cpu, node_2_cpu_mask[node]); 470 cpu_set(cpu, node_to_cpumask_map[node]);
482 cpu_2_node[cpu] = node; 471 cpu_to_node_map[cpu] = node;
483} 472}
484 473
485/* undo a mapping between cpu and node. */ 474/* undo a mapping between cpu and node. */
@@ -489,8 +478,8 @@ static inline void unmap_cpu_to_node(int cpu)
489 478
490 printk("Unmapping cpu %d from all nodes\n", cpu); 479 printk("Unmapping cpu %d from all nodes\n", cpu);
491 for (node = 0; node < MAX_NUMNODES; node ++) 480 for (node = 0; node < MAX_NUMNODES; node ++)
492 cpu_clear(cpu, node_2_cpu_mask[node]); 481 cpu_clear(cpu, node_to_cpumask_map[node]);
493 cpu_2_node[cpu] = 0; 482 cpu_to_node_map[cpu] = 0;
494} 483}
495#else /* !CONFIG_NUMA */ 484#else /* !CONFIG_NUMA */
496 485
@@ -668,7 +657,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
668 * target processor state. 657 * target processor state.
669 */ 658 */
670 startup_ipi_hook(phys_apicid, (unsigned long) start_secondary, 659 startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
671 (unsigned long) stack_start.esp); 660 (unsigned long) stack_start.sp);
672 661
673 /* 662 /*
674 * Run STARTUP IPI loop. 663 * Run STARTUP IPI loop.
@@ -754,7 +743,7 @@ static inline struct task_struct * __cpuinit alloc_idle_task(int cpu)
754 /* initialize thread_struct. we really want to avoid destroy 743 /* initialize thread_struct. we really want to avoid destroy
755 * idle tread 744 * idle tread
756 */ 745 */
757 idle->thread.esp = (unsigned long)task_pt_regs(idle); 746 idle->thread.sp = (unsigned long)task_pt_regs(idle);
758 init_idle(idle, cpu); 747 init_idle(idle, cpu);
759 return idle; 748 return idle;
760 } 749 }
@@ -799,7 +788,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
799 per_cpu(current_task, cpu) = idle; 788 per_cpu(current_task, cpu) = idle;
800 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); 789 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
801 790
802 idle->thread.eip = (unsigned long) start_secondary; 791 idle->thread.ip = (unsigned long) start_secondary;
803 /* start_eip had better be page-aligned! */ 792 /* start_eip had better be page-aligned! */
804 start_eip = setup_trampoline(); 793 start_eip = setup_trampoline();
805 794
@@ -807,9 +796,9 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
807 alternatives_smp_switch(1); 796 alternatives_smp_switch(1);
808 797
809 /* So we see what's up */ 798 /* So we see what's up */
810 printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip); 799 printk("Booting processor %d/%d ip %lx\n", cpu, apicid, start_eip);
811 /* Stack for startup_32 can be just as for start_secondary onwards */ 800 /* Stack for startup_32 can be just as for start_secondary onwards */
812 stack_start.esp = (void *) idle->thread.esp; 801 stack_start.sp = (void *) idle->thread.sp;
813 802
814 irq_ctx_init(cpu); 803 irq_ctx_init(cpu);
815 804
@@ -1091,7 +1080,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
1091 * Allow the user to impress friends. 1080 * Allow the user to impress friends.
1092 */ 1081 */
1093 Dprintk("Before bogomips.\n"); 1082 Dprintk("Before bogomips.\n");
1094 for (cpu = 0; cpu < NR_CPUS; cpu++) 1083 for_each_possible_cpu(cpu)
1095 if (cpu_isset(cpu, cpu_callout_map)) 1084 if (cpu_isset(cpu, cpu_callout_map))
1096 bogosum += cpu_data(cpu).loops_per_jiffy; 1085 bogosum += cpu_data(cpu).loops_per_jiffy;
1097 printk(KERN_INFO 1086 printk(KERN_INFO
@@ -1122,7 +1111,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
1122 * construct cpu_sibling_map, so that we can tell sibling CPUs 1111 * construct cpu_sibling_map, so that we can tell sibling CPUs
1123 * efficiently. 1112 * efficiently.
1124 */ 1113 */
1125 for (cpu = 0; cpu < NR_CPUS; cpu++) { 1114 for_each_possible_cpu(cpu) {
1126 cpus_clear(per_cpu(cpu_sibling_map, cpu)); 1115 cpus_clear(per_cpu(cpu_sibling_map, cpu));
1127 cpus_clear(per_cpu(cpu_core_map, cpu)); 1116 cpus_clear(per_cpu(cpu_core_map, cpu));
1128 } 1117 }
@@ -1296,12 +1285,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1296 setup_ioapic_dest(); 1285 setup_ioapic_dest();
1297#endif 1286#endif
1298 zap_low_mappings(); 1287 zap_low_mappings();
1299#ifndef CONFIG_HOTPLUG_CPU
1300 /*
1301 * Disable executability of the SMP trampoline:
1302 */
1303 set_kernel_exec((unsigned long)trampoline_base, trampoline_exec);
1304#endif
1305} 1288}
1306 1289
1307void __init smp_intr_init(void) 1290void __init smp_intr_init(void)
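(Aside, not part of the patch: the node_to_cpumask_map / cpu_to_node_map pair renamed above is simply a per-node CPU bitmask kept in sync with a per-CPU node index. A minimal user-space C sketch of that bookkeeping follows; the array sizes and the CPU/node numbers are made up for illustration.)

#include <stdio.h>

#define MAX_NODES 4
#define NR_CPUS   8

static unsigned long node_to_cpumask_map[MAX_NODES];	/* which CPUs are on each node */
static int cpu_to_node_map[NR_CPUS];			/* which node each CPU is on */

static void map_cpu_to_node(int cpu, int node)
{
	node_to_cpumask_map[node] |= 1UL << cpu;
	cpu_to_node_map[cpu] = node;
}

static void unmap_cpu_to_node(int cpu)
{
	for (int node = 0; node < MAX_NODES; node++)
		node_to_cpumask_map[node] &= ~(1UL << cpu);
	cpu_to_node_map[cpu] = 0;
}

int main(void)
{
	map_cpu_to_node(3, 1);
	printf("node 1 mask %#lx, cpu 3 on node %d\n",
	       node_to_cpumask_map[1], cpu_to_node_map[3]);
	unmap_cpu_to_node(3);
	printf("node 1 mask after unmap %#lx\n", node_to_cpumask_map[1]);
	return 0;
}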
diff --git a/arch/x86/kernel/smpboot_64.c b/arch/x86/kernel/smpboot_64.c
index aaf4e1291217..cc64b8085c2a 100644
--- a/arch/x86/kernel/smpboot_64.c
+++ b/arch/x86/kernel/smpboot_64.c
@@ -65,7 +65,7 @@ int smp_num_siblings = 1;
65EXPORT_SYMBOL(smp_num_siblings); 65EXPORT_SYMBOL(smp_num_siblings);
66 66
67/* Last level cache ID of each logical CPU */ 67/* Last level cache ID of each logical CPU */
68DEFINE_PER_CPU(u8, cpu_llc_id) = BAD_APICID; 68DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID;
69 69
70/* Bitmask of currently online CPUs */ 70/* Bitmask of currently online CPUs */
71cpumask_t cpu_online_map __read_mostly; 71cpumask_t cpu_online_map __read_mostly;
@@ -78,8 +78,6 @@ EXPORT_SYMBOL(cpu_online_map);
78 */ 78 */
79cpumask_t cpu_callin_map; 79cpumask_t cpu_callin_map;
80cpumask_t cpu_callout_map; 80cpumask_t cpu_callout_map;
81EXPORT_SYMBOL(cpu_callout_map);
82
83cpumask_t cpu_possible_map; 81cpumask_t cpu_possible_map;
84EXPORT_SYMBOL(cpu_possible_map); 82EXPORT_SYMBOL(cpu_possible_map);
85 83
@@ -113,10 +111,20 @@ DEFINE_PER_CPU(int, cpu_state) = { 0 };
113 * a new thread. Also avoids complicated thread destroy functionality 111 * a new thread. Also avoids complicated thread destroy functionality
114 * for idle threads. 112 * for idle threads.
115 */ 113 */
114#ifdef CONFIG_HOTPLUG_CPU
115/*
116 * Needed only for CONFIG_HOTPLUG_CPU because __cpuinitdata is
117 * removed after init for !CONFIG_HOTPLUG_CPU.
118 */
119static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
120#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x))
121#define set_idle_for_cpu(x,p) (per_cpu(idle_thread_array, x) = (p))
122#else
116struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; 123struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
117
118#define get_idle_for_cpu(x) (idle_thread_array[(x)]) 124#define get_idle_for_cpu(x) (idle_thread_array[(x)])
119#define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p)) 125#define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p))
126#endif
127
120 128
121/* 129/*
122 * Currently trivial. Write the real->protected mode 130 * Currently trivial. Write the real->protected mode
@@ -212,6 +220,7 @@ void __cpuinit smp_callin(void)
212 220
213 Dprintk("CALLIN, before setup_local_APIC().\n"); 221 Dprintk("CALLIN, before setup_local_APIC().\n");
214 setup_local_APIC(); 222 setup_local_APIC();
223 end_local_APIC_setup();
215 224
216 /* 225 /*
217 * Get our bogomips. 226 * Get our bogomips.
@@ -338,7 +347,7 @@ void __cpuinit start_secondary(void)
338 347
339 if (nmi_watchdog == NMI_IO_APIC) { 348 if (nmi_watchdog == NMI_IO_APIC) {
340 disable_8259A_irq(0); 349 disable_8259A_irq(0);
341 enable_NMI_through_LVT0(NULL); 350 enable_NMI_through_LVT0();
342 enable_8259A_irq(0); 351 enable_8259A_irq(0);
343 } 352 }
344 353
@@ -370,7 +379,7 @@ void __cpuinit start_secondary(void)
370 379
371 unlock_ipi_call_lock(); 380 unlock_ipi_call_lock();
372 381
373 setup_secondary_APIC_clock(); 382 setup_secondary_clock();
374 383
375 cpu_idle(); 384 cpu_idle();
376} 385}
@@ -384,19 +393,20 @@ static void inquire_remote_apic(int apicid)
384 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; 393 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
385 char *names[] = { "ID", "VERSION", "SPIV" }; 394 char *names[] = { "ID", "VERSION", "SPIV" };
386 int timeout; 395 int timeout;
387 unsigned int status; 396 u32 status;
388 397
389 printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid); 398 printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid);
390 399
391 for (i = 0; i < ARRAY_SIZE(regs); i++) { 400 for (i = 0; i < ARRAY_SIZE(regs); i++) {
392 printk("... APIC #%d %s: ", apicid, names[i]); 401 printk(KERN_INFO "... APIC #%d %s: ", apicid, names[i]);
393 402
394 /* 403 /*
395 * Wait for idle. 404 * Wait for idle.
396 */ 405 */
397 status = safe_apic_wait_icr_idle(); 406 status = safe_apic_wait_icr_idle();
398 if (status) 407 if (status)
399 printk("a previous APIC delivery may have failed\n"); 408 printk(KERN_CONT
409 "a previous APIC delivery may have failed\n");
400 410
401 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); 411 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
402 apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]); 412 apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]);
@@ -410,10 +420,10 @@ static void inquire_remote_apic(int apicid)
410 switch (status) { 420 switch (status) {
411 case APIC_ICR_RR_VALID: 421 case APIC_ICR_RR_VALID:
412 status = apic_read(APIC_RRR); 422 status = apic_read(APIC_RRR);
413 printk("%08x\n", status); 423 printk(KERN_CONT "%08x\n", status);
414 break; 424 break;
415 default: 425 default:
416 printk("failed\n"); 426 printk(KERN_CONT "failed\n");
417 } 427 }
418 } 428 }
419} 429}
@@ -466,7 +476,7 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta
466 */ 476 */
467 Dprintk("#startup loops: %d.\n", num_starts); 477 Dprintk("#startup loops: %d.\n", num_starts);
468 478
469 maxlvt = get_maxlvt(); 479 maxlvt = lapic_get_maxlvt();
470 480
471 for (j = 1; j <= num_starts; j++) { 481 for (j = 1; j <= num_starts; j++) {
472 Dprintk("Sending STARTUP #%d.\n",j); 482 Dprintk("Sending STARTUP #%d.\n",j);
@@ -577,7 +587,7 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)
577 c_idle.idle = get_idle_for_cpu(cpu); 587 c_idle.idle = get_idle_for_cpu(cpu);
578 588
579 if (c_idle.idle) { 589 if (c_idle.idle) {
580 c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *) 590 c_idle.idle->thread.sp = (unsigned long) (((struct pt_regs *)
581 (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1); 591 (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1);
582 init_idle(c_idle.idle, cpu); 592 init_idle(c_idle.idle, cpu);
583 goto do_rest; 593 goto do_rest;
@@ -613,8 +623,8 @@ do_rest:
613 623
614 start_rip = setup_trampoline(); 624 start_rip = setup_trampoline();
615 625
616 init_rsp = c_idle.idle->thread.rsp; 626 init_rsp = c_idle.idle->thread.sp;
617 per_cpu(init_tss,cpu).rsp0 = init_rsp; 627 load_sp0(&per_cpu(init_tss, cpu), &c_idle.idle->thread);
618 initial_code = start_secondary; 628 initial_code = start_secondary;
619 clear_tsk_thread_flag(c_idle.idle, TIF_FORK); 629 clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
620 630
@@ -691,7 +701,7 @@ do_rest:
691 } 701 }
692 if (boot_error) { 702 if (boot_error) {
693 cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ 703 cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
694 clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ 704 clear_bit(cpu, (unsigned long *)&cpu_initialized); /* was set by cpu_init() */
695 clear_node_cpumask(cpu); /* was set by numa_add_cpu */ 705 clear_node_cpumask(cpu); /* was set by numa_add_cpu */
696 cpu_clear(cpu, cpu_present_map); 706 cpu_clear(cpu, cpu_present_map);
697 cpu_clear(cpu, cpu_possible_map); 707 cpu_clear(cpu, cpu_possible_map);
@@ -841,24 +851,16 @@ static int __init smp_sanity_check(unsigned max_cpus)
841 return 0; 851 return 0;
842} 852}
843 853
844/* 854static void __init smp_cpu_index_default(void)
845 * Copy apicid's found by MP_processor_info from initial array to the per cpu
846 * data area. The x86_cpu_to_apicid_init array is then expendable and the
847 * x86_cpu_to_apicid_ptr is zeroed indicating that the static array is no
848 * longer available.
849 */
850void __init smp_set_apicids(void)
851{ 855{
852 int cpu; 856 int i;
857 struct cpuinfo_x86 *c;
853 858
854 for_each_cpu_mask(cpu, cpu_possible_map) { 859 for_each_cpu_mask(i, cpu_possible_map) {
855 if (per_cpu_offset(cpu)) 860 c = &cpu_data(i);
856 per_cpu(x86_cpu_to_apicid, cpu) = 861 /* mark all to hotplug */
857 x86_cpu_to_apicid_init[cpu]; 862 c->cpu_index = NR_CPUS;
858 } 863 }
859
860 /* indicate the static array will be going away soon */
861 x86_cpu_to_apicid_ptr = NULL;
862} 864}
863 865
864/* 866/*
@@ -868,9 +870,9 @@ void __init smp_set_apicids(void)
868void __init smp_prepare_cpus(unsigned int max_cpus) 870void __init smp_prepare_cpus(unsigned int max_cpus)
869{ 871{
870 nmi_watchdog_default(); 872 nmi_watchdog_default();
873 smp_cpu_index_default();
871 current_cpu_data = boot_cpu_data; 874 current_cpu_data = boot_cpu_data;
872 current_thread_info()->cpu = 0; /* needed? */ 875 current_thread_info()->cpu = 0; /* needed? */
873 smp_set_apicids();
874 set_cpu_sibling_map(0); 876 set_cpu_sibling_map(0);
875 877
876 if (smp_sanity_check(max_cpus) < 0) { 878 if (smp_sanity_check(max_cpus) < 0) {
@@ -885,6 +887,13 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
885 */ 887 */
886 setup_local_APIC(); 888 setup_local_APIC();
887 889
890 /*
891 * Enable IO APIC before setting up error vector
892 */
893 if (!skip_ioapic_setup && nr_ioapics)
894 enable_IO_APIC();
895 end_local_APIC_setup();
896
888 if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) { 897 if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) {
889 panic("Boot APIC ID in local APIC unexpected (%d vs %d)", 898 panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
890 GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id); 899 GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id);
@@ -903,7 +912,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
903 * Set up local APIC timer on boot CPU. 912 * Set up local APIC timer on boot CPU.
904 */ 913 */
905 914
906 setup_boot_APIC_clock(); 915 setup_boot_clock();
907} 916}
908 917
909/* 918/*
@@ -912,7 +921,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
912void __init smp_prepare_boot_cpu(void) 921void __init smp_prepare_boot_cpu(void)
913{ 922{
914 int me = smp_processor_id(); 923 int me = smp_processor_id();
915 cpu_set(me, cpu_online_map); 924 /* already set me in cpu_online_map in boot_cpu_init() */
916 cpu_set(me, cpu_callout_map); 925 cpu_set(me, cpu_callout_map);
917 per_cpu(cpu_state, me) = CPU_ONLINE; 926 per_cpu(cpu_state, me) = CPU_ONLINE;
918} 927}
@@ -1016,7 +1025,7 @@ void remove_cpu_from_maps(void)
1016 1025
1017 cpu_clear(cpu, cpu_callout_map); 1026 cpu_clear(cpu, cpu_callout_map);
1018 cpu_clear(cpu, cpu_callin_map); 1027 cpu_clear(cpu, cpu_callin_map);
1019 clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ 1028 clear_bit(cpu, (unsigned long *)&cpu_initialized); /* was set by cpu_init() */
1020 clear_node_cpumask(cpu); 1029 clear_node_cpumask(cpu);
1021} 1030}
1022 1031
diff --git a/arch/x86/kernel/smpcommon_32.c b/arch/x86/kernel/smpcommon_32.c
index bbfe85a0f699..8bc38af29aef 100644
--- a/arch/x86/kernel/smpcommon_32.c
+++ b/arch/x86/kernel/smpcommon_32.c
@@ -14,10 +14,11 @@ __cpuinit void init_gdt(int cpu)
14{ 14{
15 struct desc_struct *gdt = get_cpu_gdt_table(cpu); 15 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
16 16
17 pack_descriptor((u32 *)&gdt[GDT_ENTRY_PERCPU].a, 17 pack_descriptor(&gdt[GDT_ENTRY_PERCPU],
18 (u32 *)&gdt[GDT_ENTRY_PERCPU].b,
19 __per_cpu_offset[cpu], 0xFFFFF, 18 __per_cpu_offset[cpu], 0xFFFFF,
20 0x80 | DESCTYPE_S | 0x2, 0x8); 19 0x2 | DESCTYPE_S, 0x8);
20
21 gdt[GDT_ENTRY_PERCPU].s = 1;
21 22
22 per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu]; 23 per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
23 per_cpu(cpu_number, cpu) = cpu; 24 per_cpu(cpu_number, cpu) = cpu;
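(Aside, not part of the patch: pack_descriptor() above encodes a base, a 20-bit limit, and attribute bits into the 8-byte GDT entry used for the per-CPU segment. A stand-alone sketch of that encoding; pack_desc and the sample base address are invented here, and the attribute values are only illustrative.)

#include <stdint.h>
#include <stdio.h>

static uint64_t pack_desc(uint32_t base, uint32_t limit,
			  uint8_t access, uint8_t flags)
{
	uint32_t lo = (limit & 0xffff) | (base << 16);		/* limit 15:0, base 15:0 */
	uint32_t hi = ((base >> 16) & 0xff) |			/* base 23:16 */
		      ((uint32_t)access << 8) |			/* type/S/DPL/P byte */
		      (limit & 0xf0000) |			/* limit 19:16 */
		      ((uint32_t)(flags & 0xf) << 20) |		/* AVL/L/D/G */
		      (base & 0xff000000);			/* base 31:24 */

	return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
	/* 0x92: present, DPL 0, writable data; 0x8: 4K granularity */
	uint64_t d = pack_desc(0x12345678, 0xFFFFF, 0x92, 0x8);

	printf("descriptor: %#018llx\n", (unsigned long long)d);
	return 0;
}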
diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c
index 2a8713ec0f9a..2bf6903cb444 100644
--- a/arch/x86/kernel/srat_32.c
+++ b/arch/x86/kernel/srat_32.c
@@ -57,8 +57,6 @@ static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
57static int num_memory_chunks; /* total number of memory chunks */ 57static int num_memory_chunks; /* total number of memory chunks */
58static u8 __initdata apicid_to_pxm[MAX_APICID]; 58static u8 __initdata apicid_to_pxm[MAX_APICID];
59 59
60extern void * boot_ioremap(unsigned long, unsigned long);
61
62/* Identify CPU proximity domains */ 60/* Identify CPU proximity domains */
63static void __init parse_cpu_affinity_structure(char *p) 61static void __init parse_cpu_affinity_structure(char *p)
64{ 62{
@@ -299,7 +297,7 @@ int __init get_memcfg_from_srat(void)
299 } 297 }
300 298
301 rsdt = (struct acpi_table_rsdt *) 299 rsdt = (struct acpi_table_rsdt *)
302 boot_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt)); 300 early_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt));
303 301
304 if (!rsdt) { 302 if (!rsdt) {
305 printk(KERN_WARNING 303 printk(KERN_WARNING
@@ -339,11 +337,11 @@ int __init get_memcfg_from_srat(void)
339 for (i = 0; i < tables; i++) { 337 for (i = 0; i < tables; i++) {
340 /* Map in header, then map in full table length. */ 338 /* Map in header, then map in full table length. */
341 header = (struct acpi_table_header *) 339 header = (struct acpi_table_header *)
342 boot_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header)); 340 early_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header));
343 if (!header) 341 if (!header)
344 break; 342 break;
345 header = (struct acpi_table_header *) 343 header = (struct acpi_table_header *)
346 boot_ioremap(saved_rsdt.table.table_offset_entry[i], header->length); 344 early_ioremap(saved_rsdt.table.table_offset_entry[i], header->length);
347 if (!header) 345 if (!header)
348 break; 346 break;
349 347
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 55771fd7e545..02f0f61f5b11 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -22,9 +22,9 @@ static int save_stack_stack(void *data, char *name)
22 return -1; 22 return -1;
23} 23}
24 24
25static void save_stack_address(void *data, unsigned long addr) 25static void save_stack_address(void *data, unsigned long addr, int reliable)
26{ 26{
27 struct stack_trace *trace = (struct stack_trace *)data; 27 struct stack_trace *trace = data;
28 if (trace->skip > 0) { 28 if (trace->skip > 0) {
29 trace->skip--; 29 trace->skip--;
30 return; 30 return;
@@ -33,7 +33,8 @@ static void save_stack_address(void *data, unsigned long addr)
33 trace->entries[trace->nr_entries++] = addr; 33 trace->entries[trace->nr_entries++] = addr;
34} 34}
35 35
36static void save_stack_address_nosched(void *data, unsigned long addr) 36static void
37save_stack_address_nosched(void *data, unsigned long addr, int reliable)
37{ 38{
38 struct stack_trace *trace = (struct stack_trace *)data; 39 struct stack_trace *trace = (struct stack_trace *)data;
39 if (in_sched_functions(addr)) 40 if (in_sched_functions(addr))
@@ -65,15 +66,14 @@ static const struct stacktrace_ops save_stack_ops_nosched = {
65 */ 66 */
66void save_stack_trace(struct stack_trace *trace) 67void save_stack_trace(struct stack_trace *trace)
67{ 68{
68 dump_trace(current, NULL, NULL, &save_stack_ops, trace); 69 dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace);
69 if (trace->nr_entries < trace->max_entries) 70 if (trace->nr_entries < trace->max_entries)
70 trace->entries[trace->nr_entries++] = ULONG_MAX; 71 trace->entries[trace->nr_entries++] = ULONG_MAX;
71} 72}
72EXPORT_SYMBOL(save_stack_trace);
73 73
74void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) 74void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
75{ 75{
76 dump_trace(tsk, NULL, NULL, &save_stack_ops_nosched, trace); 76 dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace);
77 if (trace->nr_entries < trace->max_entries) 77 if (trace->nr_entries < trace->max_entries)
78 trace->entries[trace->nr_entries++] = ULONG_MAX; 78 trace->entries[trace->nr_entries++] = ULONG_MAX;
79} 79}
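(Aside, not part of the patch: the save_stack_address() callbacks above follow a simple visitor pattern, skipping the first `skip` frames and filling a bounded array that is terminated with ULONG_MAX. A self-contained sketch of that pattern; the addresses fed in below are made up.)

#include <limits.h>
#include <stdio.h>

struct stack_trace {
	unsigned int nr_entries, max_entries;
	unsigned long *entries;
	int skip;
};

static void save_stack_address(void *data, unsigned long addr, int reliable)
{
	struct stack_trace *trace = data;

	if (trace->skip > 0) {
		trace->skip--;
		return;
	}
	if (trace->nr_entries < trace->max_entries)
		trace->entries[trace->nr_entries++] = addr;
}

int main(void)
{
	unsigned long buf[4];
	struct stack_trace trace = { 0, 4, buf, 1 };
	unsigned long fake[] = { 0x1000, 0x2000, 0x3000 };	/* pretend return addresses */

	for (int i = 0; i < 3; i++)
		save_stack_address(&trace, fake[i], 1);
	if (trace.nr_entries < trace.max_entries)
		buf[trace.nr_entries++] = ULONG_MAX;		/* terminator, as above */

	for (unsigned int i = 0; i < trace.nr_entries; i++)
		printf("%#lx\n", buf[i]);
	return 0;
}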
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
new file mode 100644
index 000000000000..2ef1a5f8d675
--- /dev/null
+++ b/arch/x86/kernel/step.c
@@ -0,0 +1,203 @@
1/*
2 * x86 single-step support code, common to 32-bit and 64-bit.
3 */
4#include <linux/sched.h>
5#include <linux/mm.h>
6#include <linux/ptrace.h>
7
8unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs)
9{
10 unsigned long addr, seg;
11
12 addr = regs->ip;
13 seg = regs->cs & 0xffff;
14 if (v8086_mode(regs)) {
15 addr = (addr & 0xffff) + (seg << 4);
16 return addr;
17 }
18
19 /*
20 * We'll assume that the code segments in the GDT
21 * are all zero-based. That is largely true: the
22 * TLS segments are used for data, and the PNPBIOS
23 * and APM bios ones we just ignore here.
24 */
25 if ((seg & SEGMENT_TI_MASK) == SEGMENT_LDT) {
26 u32 *desc;
27 unsigned long base;
28
29 seg &= ~7UL;
30
31 mutex_lock(&child->mm->context.lock);
32 if (unlikely((seg >> 3) >= child->mm->context.size))
33 addr = -1L; /* bogus selector, access would fault */
34 else {
35 desc = child->mm->context.ldt + seg;
36 base = ((desc[0] >> 16) |
37 ((desc[1] & 0xff) << 16) |
38 (desc[1] & 0xff000000));
39
40 /* 16-bit code segment? */
41 if (!((desc[1] >> 22) & 1))
42 addr &= 0xffff;
43 addr += base;
44 }
45 mutex_unlock(&child->mm->context.lock);
46 }
47
48 return addr;
49}
50
51static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
52{
53 int i, copied;
54 unsigned char opcode[15];
55 unsigned long addr = convert_ip_to_linear(child, regs);
56
57 copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
58 for (i = 0; i < copied; i++) {
59 switch (opcode[i]) {
60 /* popf and iret */
61 case 0x9d: case 0xcf:
62 return 1;
63
64 /* CHECKME: 64 65 */
65
66 /* opcode and address size prefixes */
67 case 0x66: case 0x67:
68 continue;
69 /* irrelevant prefixes (segment overrides and repeats) */
70 case 0x26: case 0x2e:
71 case 0x36: case 0x3e:
72 case 0x64: case 0x65:
73 case 0xf0: case 0xf2: case 0xf3:
74 continue;
75
76#ifdef CONFIG_X86_64
77 case 0x40 ... 0x4f:
78 if (regs->cs != __USER_CS)
79 /* 32-bit mode: register increment */
80 return 0;
81 /* 64-bit mode: REX prefix */
82 continue;
83#endif
84
85 /* CHECKME: f2, f3 */
86
87 /*
88 * pushf: NOTE! We should probably not let
89 * the user see the TF bit being set. But
90 * it's more pain than it's worth to avoid
91 * it, and a debugger could emulate this
92 * all in user space if it _really_ cares.
93 */
94 case 0x9c:
95 default:
96 return 0;
97 }
98 }
99 return 0;
100}
101
102/*
103 * Enable single-stepping. Return nonzero if user mode is not using TF itself.
104 */
105static int enable_single_step(struct task_struct *child)
106{
107 struct pt_regs *regs = task_pt_regs(child);
108
109 /*
110 * Always set TIF_SINGLESTEP - this guarantees that
111 * we single-step system calls etc.. This will also
112 * cause us to set TF when returning to user mode.
113 */
114 set_tsk_thread_flag(child, TIF_SINGLESTEP);
115
116 /*
117 * If TF was already set, don't do anything else
118 */
119 if (regs->flags & X86_EFLAGS_TF)
120 return 0;
121
122 /* Set TF on the kernel stack.. */
123 regs->flags |= X86_EFLAGS_TF;
124
125 /*
126 * ..but if TF is changed by the instruction we will trace,
127 * don't mark it as being "us" that set it, so that we
128 * won't clear it by hand later.
129 */
130 if (is_setting_trap_flag(child, regs))
131 return 0;
132
133 set_tsk_thread_flag(child, TIF_FORCED_TF);
134
135 return 1;
136}
137
138/*
139 * Install this value in MSR_IA32_DEBUGCTLMSR whenever child is running.
140 */
141static void write_debugctlmsr(struct task_struct *child, unsigned long val)
142{
143 child->thread.debugctlmsr = val;
144
145 if (child != current)
146 return;
147
148 wrmsrl(MSR_IA32_DEBUGCTLMSR, val);
149}
150
151/*
152 * Enable single or block step.
153 */
154static void enable_step(struct task_struct *child, bool block)
155{
156 /*
157 * Make sure block stepping (BTF) is not enabled unless it should be.
158 * Note that we don't try to worry about any is_setting_trap_flag()
159 * instructions after the first when using block stepping.
 160 * So no one should try to use debugger block stepping in a program
161 * that uses user-mode single stepping itself.
162 */
163 if (enable_single_step(child) && block) {
164 set_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
165 write_debugctlmsr(child,
166 child->thread.debugctlmsr | DEBUGCTLMSR_BTF);
167 } else {
168 write_debugctlmsr(child,
169 child->thread.debugctlmsr & ~TIF_DEBUGCTLMSR);
170
171 if (!child->thread.debugctlmsr)
172 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
173 }
174}
175
176void user_enable_single_step(struct task_struct *child)
177{
178 enable_step(child, 0);
179}
180
181void user_enable_block_step(struct task_struct *child)
182{
183 enable_step(child, 1);
184}
185
186void user_disable_single_step(struct task_struct *child)
187{
188 /*
189 * Make sure block stepping (BTF) is disabled.
190 */
191 write_debugctlmsr(child,
192 child->thread.debugctlmsr & ~TIF_DEBUGCTLMSR);
193
194 if (!child->thread.debugctlmsr)
195 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
196
197 /* Always clear TIF_SINGLESTEP... */
198 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
199
200 /* But touch TF only if it was set by us.. */
201 if (test_and_clear_tsk_thread_flag(child, TIF_FORCED_TF))
202 task_pt_regs(child)->flags &= ~X86_EFLAGS_TF;
203}
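(Aside, not part of the patch: the TF/TIF_SINGLESTEP handling added here is what backs PTRACE_SINGLESTEP for debuggers. A minimal user-space tracer that exercises it on Linux/x86; the 100-step cap and the busy loop in the child are arbitrary.)

#include <signal.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t child = fork();

	if (child == 0) {
		/* tracee: ask to be traced, stop, then run a few instructions */
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		raise(SIGSTOP);
		for (volatile int i = 0; i < 5; i++)
			;
		_exit(0);
	}

	int status, steps = 0;

	waitpid(child, &status, 0);		/* wait for the initial SIGSTOP */

	/* each PTRACE_SINGLESTEP has the kernel set TF for one instruction */
	while (steps < 100) {
		if (ptrace(PTRACE_SINGLESTEP, child, NULL, NULL) == -1)
			break;
		waitpid(child, &status, 0);
		if (WIFEXITED(status))
			break;
		steps++;
	}
	printf("single-stepped %d instructions\n", steps);
	return 0;
}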
diff --git a/arch/x86/kernel/suspend_64.c b/arch/x86/kernel/suspend_64.c
index 2e5efaaf8800..09199511c256 100644
--- a/arch/x86/kernel/suspend_64.c
+++ b/arch/x86/kernel/suspend_64.c
@@ -17,9 +17,26 @@
17/* References to section boundaries */ 17/* References to section boundaries */
18extern const void __nosave_begin, __nosave_end; 18extern const void __nosave_begin, __nosave_end;
19 19
20static void fix_processor_context(void);
21
20struct saved_context saved_context; 22struct saved_context saved_context;
21 23
22void __save_processor_state(struct saved_context *ctxt) 24/**
25 * __save_processor_state - save CPU registers before creating a
26 * hibernation image and before restoring the memory state from it
27 * @ctxt - structure to store the registers contents in
28 *
29 * NOTE: If there is a CPU register the modification of which by the
30 * boot kernel (ie. the kernel used for loading the hibernation image)
31 * might affect the operations of the restored target kernel (ie. the one
32 * saved in the hibernation image), then its contents must be saved by this
33 * function. In other words, if kernel A is hibernated and different
34 * kernel B is used for loading the hibernation image into memory, the
35 * kernel A's __save_processor_state() function must save all registers
36 * needed by kernel A, so that it can operate correctly after the resume
37 * regardless of what kernel B does in the meantime.
38 */
39static void __save_processor_state(struct saved_context *ctxt)
23{ 40{
24 kernel_fpu_begin(); 41 kernel_fpu_begin();
25 42
@@ -69,7 +86,12 @@ static void do_fpu_end(void)
69 kernel_fpu_end(); 86 kernel_fpu_end();
70} 87}
71 88
72void __restore_processor_state(struct saved_context *ctxt) 89/**
90 * __restore_processor_state - restore the contents of CPU registers saved
91 * by __save_processor_state()
92 * @ctxt - structure to load the registers contents from
93 */
94static void __restore_processor_state(struct saved_context *ctxt)
73{ 95{
74 /* 96 /*
75 * control registers 97 * control registers
@@ -113,14 +135,14 @@ void restore_processor_state(void)
113 __restore_processor_state(&saved_context); 135 __restore_processor_state(&saved_context);
114} 136}
115 137
116void fix_processor_context(void) 138static void fix_processor_context(void)
117{ 139{
118 int cpu = smp_processor_id(); 140 int cpu = smp_processor_id();
119 struct tss_struct *t = &per_cpu(init_tss, cpu); 141 struct tss_struct *t = &per_cpu(init_tss, cpu);
120 142
121 set_tss_desc(cpu,t); /* This just modifies memory; should not be necessary. But... This is necessary, because 386 hardware has concept of busy TSS or some similar stupidity. */ 143 set_tss_desc(cpu,t); /* This just modifies memory; should not be necessary. But... This is necessary, because 386 hardware has concept of busy TSS or some similar stupidity. */
122 144
123 cpu_gdt(cpu)[GDT_ENTRY_TSS].type = 9; 145 get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9;
124 146
125 syscall_init(); /* This sets MSR_*STAR and related */ 147 syscall_init(); /* This sets MSR_*STAR and related */
126 load_TR_desc(); /* This does ltr */ 148 load_TR_desc(); /* This does ltr */
diff --git a/arch/x86/kernel/suspend_asm_64.S b/arch/x86/kernel/suspend_asm_64.S
index 72f952103e50..aeb9a4d7681e 100644
--- a/arch/x86/kernel/suspend_asm_64.S
+++ b/arch/x86/kernel/suspend_asm_64.S
@@ -18,13 +18,13 @@
18 18
19ENTRY(swsusp_arch_suspend) 19ENTRY(swsusp_arch_suspend)
20 movq $saved_context, %rax 20 movq $saved_context, %rax
21 movq %rsp, pt_regs_rsp(%rax) 21 movq %rsp, pt_regs_sp(%rax)
22 movq %rbp, pt_regs_rbp(%rax) 22 movq %rbp, pt_regs_bp(%rax)
23 movq %rsi, pt_regs_rsi(%rax) 23 movq %rsi, pt_regs_si(%rax)
24 movq %rdi, pt_regs_rdi(%rax) 24 movq %rdi, pt_regs_di(%rax)
25 movq %rbx, pt_regs_rbx(%rax) 25 movq %rbx, pt_regs_bx(%rax)
26 movq %rcx, pt_regs_rcx(%rax) 26 movq %rcx, pt_regs_cx(%rax)
27 movq %rdx, pt_regs_rdx(%rax) 27 movq %rdx, pt_regs_dx(%rax)
28 movq %r8, pt_regs_r8(%rax) 28 movq %r8, pt_regs_r8(%rax)
29 movq %r9, pt_regs_r9(%rax) 29 movq %r9, pt_regs_r9(%rax)
30 movq %r10, pt_regs_r10(%rax) 30 movq %r10, pt_regs_r10(%rax)
@@ -34,7 +34,7 @@ ENTRY(swsusp_arch_suspend)
34 movq %r14, pt_regs_r14(%rax) 34 movq %r14, pt_regs_r14(%rax)
35 movq %r15, pt_regs_r15(%rax) 35 movq %r15, pt_regs_r15(%rax)
36 pushfq 36 pushfq
37 popq pt_regs_eflags(%rax) 37 popq pt_regs_flags(%rax)
38 38
39 /* save the address of restore_registers */ 39 /* save the address of restore_registers */
40 movq $restore_registers, %rax 40 movq $restore_registers, %rax
@@ -115,13 +115,13 @@ ENTRY(restore_registers)
115 115
116 /* We don't restore %rax, it must be 0 anyway */ 116 /* We don't restore %rax, it must be 0 anyway */
117 movq $saved_context, %rax 117 movq $saved_context, %rax
118 movq pt_regs_rsp(%rax), %rsp 118 movq pt_regs_sp(%rax), %rsp
119 movq pt_regs_rbp(%rax), %rbp 119 movq pt_regs_bp(%rax), %rbp
120 movq pt_regs_rsi(%rax), %rsi 120 movq pt_regs_si(%rax), %rsi
121 movq pt_regs_rdi(%rax), %rdi 121 movq pt_regs_di(%rax), %rdi
122 movq pt_regs_rbx(%rax), %rbx 122 movq pt_regs_bx(%rax), %rbx
123 movq pt_regs_rcx(%rax), %rcx 123 movq pt_regs_cx(%rax), %rcx
124 movq pt_regs_rdx(%rax), %rdx 124 movq pt_regs_dx(%rax), %rdx
125 movq pt_regs_r8(%rax), %r8 125 movq pt_regs_r8(%rax), %r8
126 movq pt_regs_r9(%rax), %r9 126 movq pt_regs_r9(%rax), %r9
127 movq pt_regs_r10(%rax), %r10 127 movq pt_regs_r10(%rax), %r10
@@ -130,7 +130,7 @@ ENTRY(restore_registers)
130 movq pt_regs_r13(%rax), %r13 130 movq pt_regs_r13(%rax), %r13
131 movq pt_regs_r14(%rax), %r14 131 movq pt_regs_r14(%rax), %r14
132 movq pt_regs_r15(%rax), %r15 132 movq pt_regs_r15(%rax), %r15
133 pushq pt_regs_eflags(%rax) 133 pushq pt_regs_flags(%rax)
134 popfq 134 popfq
135 135
136 xorq %rax, %rax 136 xorq %rax, %rax
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 907942ee6e76..bd802a5e1aa3 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -12,6 +12,7 @@
12#include <linux/file.h> 12#include <linux/file.h>
13#include <linux/utsname.h> 13#include <linux/utsname.h>
14#include <linux/personality.h> 14#include <linux/personality.h>
15#include <linux/random.h>
15 16
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
17#include <asm/ia32.h> 18#include <asm/ia32.h>
@@ -65,6 +66,7 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
65 unsigned long *end) 66 unsigned long *end)
66{ 67{
67 if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) { 68 if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) {
69 unsigned long new_begin;
 68 /* This is usually needed to map code in small 70 /* This is usually needed to map code in small
69 model, so it needs to be in the first 31bit. Limit 71 model, so it needs to be in the first 31bit. Limit
70 it to that. This means we need to move the 72 it to that. This means we need to move the
@@ -74,6 +76,11 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
74 of playground for now. -AK */ 76 of playground for now. -AK */
75 *begin = 0x40000000; 77 *begin = 0x40000000;
76 *end = 0x80000000; 78 *end = 0x80000000;
79 if (current->flags & PF_RANDOMIZE) {
80 new_begin = randomize_range(*begin, *begin + 0x02000000, 0);
81 if (new_begin)
82 *begin = new_begin;
83 }
77 } else { 84 } else {
78 *begin = TASK_UNMAPPED_BASE; 85 *begin = TASK_UNMAPPED_BASE;
79 *end = TASK_SIZE; 86 *end = TASK_SIZE;
@@ -143,6 +150,97 @@ full_search:
143 } 150 }
144} 151}
145 152
153
154unsigned long
155arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
156 const unsigned long len, const unsigned long pgoff,
157 const unsigned long flags)
158{
159 struct vm_area_struct *vma;
160 struct mm_struct *mm = current->mm;
161 unsigned long addr = addr0;
162
163 /* requested length too big for entire address space */
164 if (len > TASK_SIZE)
165 return -ENOMEM;
166
167 if (flags & MAP_FIXED)
168 return addr;
169
 170 /* for MAP_32BIT mappings we force the legacy mmap base */
171 if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT))
172 goto bottomup;
173
174 /* requesting a specific address */
175 if (addr) {
176 addr = PAGE_ALIGN(addr);
177 vma = find_vma(mm, addr);
178 if (TASK_SIZE - len >= addr &&
179 (!vma || addr + len <= vma->vm_start))
180 return addr;
181 }
182
183 /* check if free_area_cache is useful for us */
184 if (len <= mm->cached_hole_size) {
185 mm->cached_hole_size = 0;
186 mm->free_area_cache = mm->mmap_base;
187 }
188
189 /* either no address requested or can't fit in requested address hole */
190 addr = mm->free_area_cache;
191
192 /* make sure it can fit in the remaining address space */
193 if (addr > len) {
194 vma = find_vma(mm, addr-len);
195 if (!vma || addr <= vma->vm_start)
196 /* remember the address as a hint for next time */
197 return (mm->free_area_cache = addr-len);
198 }
199
200 if (mm->mmap_base < len)
201 goto bottomup;
202
203 addr = mm->mmap_base-len;
204
205 do {
206 /*
207 * Lookup failure means no vma is above this address,
208 * else if new region fits below vma->vm_start,
209 * return with success:
210 */
211 vma = find_vma(mm, addr);
212 if (!vma || addr+len <= vma->vm_start)
213 /* remember the address as a hint for next time */
214 return (mm->free_area_cache = addr);
215
216 /* remember the largest hole we saw so far */
217 if (addr + mm->cached_hole_size < vma->vm_start)
218 mm->cached_hole_size = vma->vm_start - addr;
219
220 /* try just below the current vma->vm_start */
221 addr = vma->vm_start-len;
222 } while (len < vma->vm_start);
223
224bottomup:
225 /*
226 * A failed mmap() very likely causes application failure,
227 * so fall back to the bottom-up function here. This scenario
228 * can happen with large stack limits and large mmap()
229 * allocations.
230 */
231 mm->cached_hole_size = ~0UL;
232 mm->free_area_cache = TASK_UNMAPPED_BASE;
233 addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
234 /*
235 * Restore the topdown base:
236 */
237 mm->free_area_cache = mm->mmap_base;
238 mm->cached_hole_size = ~0UL;
239
240 return addr;
241}
242
243
146asmlinkage long sys_uname(struct new_utsname __user * name) 244asmlinkage long sys_uname(struct new_utsname __user * name)
147{ 245{
148 int err; 246 int err;
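(Aside, not part of the patch: arch_get_unmapped_area_topdown() above scans from the top of the mmap area downward for the first gap that fits. A simplified sketch of that scan; it ignores the hint address, free_area_cache, and the bottom-up fallback, and the mappings and sizes below are made up.)

#include <stdio.h>

struct region { unsigned long start, end; };	/* existing mapping [start, end) */

/* existing mappings below the mmap top, sorted highest first */
static const struct region used[] = {
	{ 0x7f0000000000UL, 0x7f0000200000UL },
	{ 0x7effff000000UL, 0x7effff100000UL },
};

static unsigned long topdown_find(unsigned long top, unsigned long len)
{
	unsigned long addr = top - len;

	for (unsigned int i = 0; i < sizeof(used) / sizeof(used[0]); i++) {
		if (addr >= used[i].end)
			return addr;		/* gap above this mapping fits */
		if (used[i].start < len)
			return 0;		/* no room left below it */
		addr = used[i].start - len;	/* retry just below it */
	}
	return addr;				/* below the lowest mapping */
}

int main(void)
{
	printf("top-down candidate: %#lx\n",
	       topdown_find(0x7f0000400000UL, 0x100000UL));
	return 0;
}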
diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c
new file mode 100644
index 000000000000..6d7ef11e7975
--- /dev/null
+++ b/arch/x86/kernel/test_nx.c
@@ -0,0 +1,176 @@
1/*
2 * test_nx.c: functional test for NX functionality
3 *
4 * (C) Copyright 2008 Intel Corporation
5 * Author: Arjan van de Ven <arjan@linux.intel.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 */
12#include <linux/module.h>
13#include <linux/sort.h>
14#include <asm/uaccess.h>
15
16extern int rodata_test_data;
17
18/*
19 * This file checks 4 things:
20 * 1) Check if the stack is not executable
21 * 2) Check if kmalloc memory is not executable
22 * 3) Check if the .rodata section is not executable
23 * 4) Check if the .data section of a module is not executable
24 *
25 * To do this, the test code tries to execute memory in stack/kmalloc/etc,
26 * and then checks if the expected trap happens.
27 *
28 * Sadly, this implies having a dynamic exception handling table entry.
29 * ... which can be done (and will make Rusty cry)... but it can only
30 * be done in a stand-alone module with only 1 entry total.
31 * (otherwise we'd have to sort and that's just too messy)
32 */
33
34
35
36/*
37 * We want to set up an exception handling point on our stack,
38 * which means a variable value. This function is rather dirty
39 * and walks the exception table of the module, looking for a magic
40 * marker and replaces it with a specific function.
41 */
42static void fudze_exception_table(void *marker, void *new)
43{
44 struct module *mod = THIS_MODULE;
45 struct exception_table_entry *extable;
46
47 /*
48 * Note: This module has only 1 exception table entry,
49 * so searching and sorting is not needed. If that changes,
50 * this would be the place to search and re-sort the exception
51 * table.
52 */
53 if (mod->num_exentries > 1) {
54 printk(KERN_ERR "test_nx: too many exception table entries!\n");
55 printk(KERN_ERR "test_nx: test results are not reliable.\n");
56 return;
57 }
58 extable = (struct exception_table_entry *)mod->extable;
59 extable[0].insn = (unsigned long)new;
60}
61
62
63/*
64 * exception tables get their symbols translated so we need
65 * to use a fake function to put in there, which we can then
66 * replace at runtime.
67 */
68void foo_label(void);
69
70/*
71 * returns 0 for not-executable, negative for executable
72 *
73 * Note: we cannot allow this function to be inlined, because
74 * that would give us more than 1 exception table entry.
75 * This in turn would break the assumptions above.
76 */
77static noinline int test_address(void *address)
78{
79 unsigned long result;
80
81 /* Set up an exception table entry for our address */
82 fudze_exception_table(&foo_label, address);
83 result = 1;
84 asm volatile(
85 "foo_label:\n"
86 "0: call *%[fake_code]\n"
87 "1:\n"
88 ".section .fixup,\"ax\"\n"
89 "2: mov %[zero], %[rslt]\n"
90 " ret\n"
91 ".previous\n"
92 ".section __ex_table,\"a\"\n"
93 " .align 8\n"
94 " .quad 0b\n"
95 " .quad 2b\n"
96 ".previous\n"
97 : [rslt] "=r" (result)
98 : [fake_code] "r" (address), [zero] "r" (0UL), "0" (result)
99 );
100 /* change the exception table back for the next round */
101 fudze_exception_table(address, &foo_label);
102
103 if (result)
104 return -ENODEV;
105 return 0;
106}
107
108static unsigned char test_data = 0xC3; /* 0xC3 is the opcode for "ret" */
109
110static int test_NX(void)
111{
112 int ret = 0;
113 /* 0xC3 is the opcode for "ret" */
114 char stackcode[] = {0xC3, 0x90, 0 };
115 char *heap;
116
117 test_data = 0xC3;
118
119 printk(KERN_INFO "Testing NX protection\n");
120
121 /* Test 1: check if the stack is not executable */
122 if (test_address(&stackcode)) {
123 printk(KERN_ERR "test_nx: stack was executable\n");
124 ret = -ENODEV;
125 }
126
127
128 /* Test 2: Check if the heap is executable */
129 heap = kmalloc(64, GFP_KERNEL);
130 if (!heap)
131 return -ENOMEM;
132 heap[0] = 0xC3; /* opcode for "ret" */
133
134 if (test_address(heap)) {
135 printk(KERN_ERR "test_nx: heap was executable\n");
136 ret = -ENODEV;
137 }
138 kfree(heap);
139
140 /*
141 * The following 2 tests currently fail, this needs to get fixed
142 * Until then, don't run them to avoid too many people getting scared
143 * by the error message
144 */
145#if 0
146
147#ifdef CONFIG_DEBUG_RODATA
148 /* Test 3: Check if the .rodata section is executable */
149 if (rodata_test_data != 0xC3) {
150 printk(KERN_ERR "test_nx: .rodata marker has invalid value\n");
151 ret = -ENODEV;
152 } else if (test_address(&rodata_test_data)) {
153 printk(KERN_ERR "test_nx: .rodata section is executable\n");
154 ret = -ENODEV;
155 }
156#endif
157
158 /* Test 4: Check if the .data section of a module is executable */
159 if (test_address(&test_data)) {
160 printk(KERN_ERR "test_nx: .data section is executable\n");
161 ret = -ENODEV;
162 }
163
164#endif
165 return 0;
166}
167
168static void test_exit(void)
169{
170}
171
172module_init(test_NX);
173module_exit(test_exit);
174MODULE_LICENSE("GPL");
175MODULE_DESCRIPTION("Testcase for the NX infrastructure");
176MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
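(Aside, not part of the patch: a user-space analogue of the test_nx idea, writing a single "ret" opcode into a buffer and seeing whether calling it traps. The function names here are invented; on hardware or kernels without NX the read-write page may actually execute.)

#include <setjmp.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

static sigjmp_buf env;

static void on_fault(int sig)
{
	(void)sig;
	siglongjmp(env, 1);
}

static int executes(void *buf)
{
	if (sigsetjmp(env, 1))
		return 0;			/* faulted: not executable */
	((void (*)(void))buf)();		/* try to run the 0xC3 "ret" */
	return 1;				/* returned normally: it executed */
}

int main(void)
{
	void *page = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (page == MAP_FAILED)
		return 1;
	signal(SIGSEGV, on_fault);
	memset(page, 0xC3, 1);			/* opcode for "ret" */

	printf("RW page executable: %s\n", executes(page) ? "yes" : "no");

	mprotect(page, 4096, PROT_READ | PROT_WRITE | PROT_EXEC);
	printf("RWX page executable: %s\n", executes(page) ? "yes" : "no");
	return 0;
}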
diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c
new file mode 100644
index 000000000000..4c163772000e
--- /dev/null
+++ b/arch/x86/kernel/test_rodata.c
@@ -0,0 +1,86 @@
1/*
2 * test_rodata.c: functional test for mark_rodata_ro function
3 *
4 * (C) Copyright 2008 Intel Corporation
5 * Author: Arjan van de Ven <arjan@linux.intel.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 */
12#include <linux/module.h>
13#include <asm/sections.h>
14extern int rodata_test_data;
15
16int rodata_test(void)
17{
18 unsigned long result;
19 unsigned long start, end;
20
21 /* test 1: read the value */
22 /* If this test fails, some previous testrun has clobbered the state */
23 if (!rodata_test_data) {
24 printk(KERN_ERR "rodata_test: test 1 fails (start data)\n");
25 return -ENODEV;
26 }
27
28 /* test 2: write to the variable; this should fault */
29 /*
30 * If this test fails, we managed to overwrite the data
31 *
32 * This is written in assembly to be able to catch the
33 * exception that is supposed to happen in the correct
34 * case
35 */
36
37 result = 1;
38 asm volatile(
39 "0: mov %[zero],(%[rodata_test])\n"
40 " mov %[zero], %[rslt]\n"
41 "1:\n"
42 ".section .fixup,\"ax\"\n"
43 "2: jmp 1b\n"
44 ".previous\n"
45 ".section __ex_table,\"a\"\n"
46 " .align 16\n"
47#ifdef CONFIG_X86_32
48 " .long 0b,2b\n"
49#else
50 " .quad 0b,2b\n"
51#endif
52 ".previous"
53 : [rslt] "=r" (result)
54 : [rodata_test] "r" (&rodata_test_data), [zero] "r" (0UL)
55 );
56
57
58 if (!result) {
59 printk(KERN_ERR "rodata_test: test data was not read only\n");
60 return -ENODEV;
61 }
62
63 /* test 3: check the value hasn't changed */
64 /* If this test fails, we managed to overwrite the data */
65 if (!rodata_test_data) {
 66 printk(KERN_ERR "rodata_test: Test 3 fails (end data)\n");
67 return -ENODEV;
68 }
69 /* test 4: check if the rodata section is 4Kb aligned */
70 start = (unsigned long)__start_rodata;
71 end = (unsigned long)__end_rodata;
72 if (start & (PAGE_SIZE - 1)) {
73 printk(KERN_ERR "rodata_test: .rodata is not 4k aligned\n");
74 return -ENODEV;
75 }
76 if (end & (PAGE_SIZE - 1)) {
77 printk(KERN_ERR "rodata_test: .rodata end is not 4k aligned\n");
78 return -ENODEV;
79 }
80
81 return 0;
82}
83
84MODULE_LICENSE("GPL");
85MODULE_DESCRIPTION("Testcase for the DEBUG_RODATA infrastructure");
86MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index 8a322c96bc23..1a89e93f3f1c 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -28,98 +28,20 @@
28 * serialize accesses to xtime/lost_ticks). 28 * serialize accesses to xtime/lost_ticks).
29 */ 29 */
30 30
31#include <linux/errno.h> 31#include <linux/init.h>
32#include <linux/sched.h>
33#include <linux/kernel.h>
34#include <linux/param.h>
35#include <linux/string.h>
36#include <linux/mm.h>
37#include <linux/interrupt.h> 32#include <linux/interrupt.h>
38#include <linux/time.h> 33#include <linux/time.h>
39#include <linux/delay.h>
40#include <linux/init.h>
41#include <linux/smp.h>
42#include <linux/module.h>
43#include <linux/sysdev.h>
44#include <linux/bcd.h>
45#include <linux/efi.h>
46#include <linux/mca.h> 34#include <linux/mca.h>
47 35
48#include <asm/io.h>
49#include <asm/smp.h>
50#include <asm/irq.h>
51#include <asm/msr.h>
52#include <asm/delay.h>
53#include <asm/mpspec.h>
54#include <asm/uaccess.h>
55#include <asm/processor.h>
56#include <asm/timer.h>
57#include <asm/time.h>
58
59#include "mach_time.h"
60
61#include <linux/timex.h>
62
63#include <asm/hpet.h>
64
65#include <asm/arch_hooks.h> 36#include <asm/arch_hooks.h>
66 37#include <asm/hpet.h>
67#include "io_ports.h" 38#include <asm/time.h>
68
69#include <asm/i8259.h>
70 39
71#include "do_timer.h" 40#include "do_timer.h"
72 41
73unsigned int cpu_khz; /* Detected as we calibrate the TSC */ 42unsigned int cpu_khz; /* Detected as we calibrate the TSC */
74EXPORT_SYMBOL(cpu_khz); 43EXPORT_SYMBOL(cpu_khz);
75 44
76DEFINE_SPINLOCK(rtc_lock);
77EXPORT_SYMBOL(rtc_lock);
78
79/*
80 * This is a special lock that is owned by the CPU and holds the index
81 * register we are working with. It is required for NMI access to the
82 * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
83 */
84volatile unsigned long cmos_lock = 0;
85EXPORT_SYMBOL(cmos_lock);
86
87/* Routines for accessing the CMOS RAM/RTC. */
88unsigned char rtc_cmos_read(unsigned char addr)
89{
90 unsigned char val;
91 lock_cmos_prefix(addr);
92 outb_p(addr, RTC_PORT(0));
93 val = inb_p(RTC_PORT(1));
94 lock_cmos_suffix(addr);
95 return val;
96}
97EXPORT_SYMBOL(rtc_cmos_read);
98
99void rtc_cmos_write(unsigned char val, unsigned char addr)
100{
101 lock_cmos_prefix(addr);
102 outb_p(addr, RTC_PORT(0));
103 outb_p(val, RTC_PORT(1));
104 lock_cmos_suffix(addr);
105}
106EXPORT_SYMBOL(rtc_cmos_write);
107
108static int set_rtc_mmss(unsigned long nowtime)
109{
110 int retval;
111 unsigned long flags;
112
113 /* gets recalled with irq locally disabled */
114 /* XXX - does irqsave resolve this? -johnstul */
115 spin_lock_irqsave(&rtc_lock, flags);
116 retval = set_wallclock(nowtime);
117 spin_unlock_irqrestore(&rtc_lock, flags);
118
119 return retval;
120}
121
122
123int timer_ack; 45int timer_ack;
124 46
125unsigned long profile_pc(struct pt_regs *regs) 47unsigned long profile_pc(struct pt_regs *regs)
@@ -127,17 +49,17 @@ unsigned long profile_pc(struct pt_regs *regs)
127 unsigned long pc = instruction_pointer(regs); 49 unsigned long pc = instruction_pointer(regs);
128 50
129#ifdef CONFIG_SMP 51#ifdef CONFIG_SMP
130 if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs) && 52 if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs) &&
131 in_lock_functions(pc)) { 53 in_lock_functions(pc)) {
132#ifdef CONFIG_FRAME_POINTER 54#ifdef CONFIG_FRAME_POINTER
133 return *(unsigned long *)(regs->ebp + 4); 55 return *(unsigned long *)(regs->bp + 4);
134#else 56#else
135 unsigned long *sp = (unsigned long *)&regs->esp; 57 unsigned long *sp = (unsigned long *)&regs->sp;
136 58
137 /* Return address is either directly at stack pointer 59 /* Return address is either directly at stack pointer
138 or above a saved eflags. Eflags has bits 22-31 zero, 60 or above a saved flags. Eflags has bits 22-31 zero,
139 kernel addresses don't. */ 61 kernel addresses don't. */
140 if (sp[0] >> 22) 62 if (sp[0] >> 22)
141 return sp[0]; 63 return sp[0];
142 if (sp[1] >> 22) 64 if (sp[1] >> 22)
143 return sp[1]; 65 return sp[1];
@@ -193,26 +115,6 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
193 return IRQ_HANDLED; 115 return IRQ_HANDLED;
194} 116}
195 117
196/* not static: needed by APM */
197unsigned long read_persistent_clock(void)
198{
199 unsigned long retval;
200 unsigned long flags;
201
202 spin_lock_irqsave(&rtc_lock, flags);
203
204 retval = get_wallclock();
205
206 spin_unlock_irqrestore(&rtc_lock, flags);
207
208 return retval;
209}
210
211int update_persistent_clock(struct timespec now)
212{
213 return set_rtc_mmss(now.tv_sec);
214}
215
216extern void (*late_time_init)(void); 118extern void (*late_time_init)(void);
217/* Duplicate of time_init() below, with hpet_enable part added */ 119/* Duplicate of time_init() below, with hpet_enable part added */
218void __init hpet_time_init(void) 120void __init hpet_time_init(void)
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
index 368b1942b39a..0380795121a6 100644
--- a/arch/x86/kernel/time_64.c
+++ b/arch/x86/kernel/time_64.c
@@ -11,43 +11,18 @@
11 * RTC support code taken from arch/i386/kernel/timers/time_hpet.c 11 * RTC support code taken from arch/i386/kernel/timers/time_hpet.c
12 */ 12 */
13 13
14#include <linux/kernel.h> 14#include <linux/clockchips.h>
15#include <linux/sched.h>
16#include <linux/interrupt.h>
17#include <linux/init.h> 15#include <linux/init.h>
18#include <linux/mc146818rtc.h> 16#include <linux/interrupt.h>
19#include <linux/time.h>
20#include <linux/ioport.h>
21#include <linux/module.h> 17#include <linux/module.h>
22#include <linux/device.h> 18#include <linux/time.h>
23#include <linux/sysdev.h>
24#include <linux/bcd.h>
25#include <linux/notifier.h>
26#include <linux/cpu.h>
27#include <linux/kallsyms.h>
28#include <linux/acpi.h>
29#include <linux/clockchips.h>
30 19
31#ifdef CONFIG_ACPI
32#include <acpi/achware.h> /* for PM timer frequency */
33#include <acpi/acpi_bus.h>
34#endif
35#include <asm/i8253.h> 20#include <asm/i8253.h>
36#include <asm/pgtable.h>
37#include <asm/vsyscall.h>
38#include <asm/timex.h>
39#include <asm/proto.h>
40#include <asm/hpet.h>
41#include <asm/sections.h>
42#include <linux/hpet.h>
43#include <asm/apic.h>
44#include <asm/hpet.h> 21#include <asm/hpet.h>
45#include <asm/mpspec.h>
46#include <asm/nmi.h> 22#include <asm/nmi.h>
47#include <asm/vgtod.h> 23#include <asm/vgtod.h>
48 24#include <asm/time.h>
49DEFINE_SPINLOCK(rtc_lock); 25#include <asm/timer.h>
50EXPORT_SYMBOL(rtc_lock);
51 26
52volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; 27volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
53 28
@@ -56,10 +31,10 @@ unsigned long profile_pc(struct pt_regs *regs)
56 unsigned long pc = instruction_pointer(regs); 31 unsigned long pc = instruction_pointer(regs);
57 32
58 /* Assume the lock function has either no stack frame or a copy 33 /* Assume the lock function has either no stack frame or a copy
59 of eflags from PUSHF 34 of flags from PUSHF
60 Eflags always has bits 22 and up cleared unlike kernel addresses. */ 35 Eflags always has bits 22 and up cleared unlike kernel addresses. */
61 if (!user_mode(regs) && in_lock_functions(pc)) { 36 if (!user_mode(regs) && in_lock_functions(pc)) {
62 unsigned long *sp = (unsigned long *)regs->rsp; 37 unsigned long *sp = (unsigned long *)regs->sp;
63 if (sp[0] >> 22) 38 if (sp[0] >> 22)
64 return sp[0]; 39 return sp[0];
65 if (sp[1] >> 22) 40 if (sp[1] >> 22)
@@ -69,82 +44,6 @@ unsigned long profile_pc(struct pt_regs *regs)
69} 44}
70EXPORT_SYMBOL(profile_pc); 45EXPORT_SYMBOL(profile_pc);
71 46
72/*
73 * In order to set the CMOS clock precisely, set_rtc_mmss has to be called 500
74 * ms after the second nowtime has started, because when nowtime is written
75 * into the registers of the CMOS clock, it will jump to the next second
76 * precisely 500 ms later. Check the Motorola MC146818A or Dallas DS12887 data
77 * sheet for details.
78 */
79
80static int set_rtc_mmss(unsigned long nowtime)
81{
82 int retval = 0;
83 int real_seconds, real_minutes, cmos_minutes;
84 unsigned char control, freq_select;
85 unsigned long flags;
86
87/*
88 * set_rtc_mmss is called when irqs are enabled, so disable irqs here
89 */
90 spin_lock_irqsave(&rtc_lock, flags);
91/*
92 * Tell the clock it's being set and stop it.
93 */
94 control = CMOS_READ(RTC_CONTROL);
95 CMOS_WRITE(control | RTC_SET, RTC_CONTROL);
96
97 freq_select = CMOS_READ(RTC_FREQ_SELECT);
98 CMOS_WRITE(freq_select | RTC_DIV_RESET2, RTC_FREQ_SELECT);
99
100 cmos_minutes = CMOS_READ(RTC_MINUTES);
101 BCD_TO_BIN(cmos_minutes);
102
103/*
104 * since we're only adjusting minutes and seconds, don't interfere with hour
105 * overflow. This avoids messing with unknown time zones but requires your RTC
106 * not to be off by more than 15 minutes. Since we're calling it only when
107 * our clock is externally synchronized using NTP, this shouldn't be a problem.
108 */
109
110 real_seconds = nowtime % 60;
111 real_minutes = nowtime / 60;
112 if (((abs(real_minutes - cmos_minutes) + 15) / 30) & 1)
113 real_minutes += 30; /* correct for half hour time zone */
114 real_minutes %= 60;
115
116 if (abs(real_minutes - cmos_minutes) >= 30) {
117 printk(KERN_WARNING "time.c: can't update CMOS clock "
118 "from %d to %d\n", cmos_minutes, real_minutes);
119 retval = -1;
120 } else {
121 BIN_TO_BCD(real_seconds);
122 BIN_TO_BCD(real_minutes);
123 CMOS_WRITE(real_seconds, RTC_SECONDS);
124 CMOS_WRITE(real_minutes, RTC_MINUTES);
125 }
126
127/*
128 * The following flags have to be released exactly in this order, otherwise the
129 * DS12887 (popular MC146818A clone with integrated battery and quartz) will
130 * not reset the oscillator and will not update precisely 500 ms later. You
131 * won't find this mentioned in the Dallas Semiconductor data sheets, but who
132 * believes data sheets anyway ... -- Markus Kuhn
133 */
134
135 CMOS_WRITE(control, RTC_CONTROL);
136 CMOS_WRITE(freq_select, RTC_FREQ_SELECT);
137
138 spin_unlock_irqrestore(&rtc_lock, flags);
139
140 return retval;
141}
142
143int update_persistent_clock(struct timespec now)
144{
145 return set_rtc_mmss(now.tv_sec);
146}
147
148static irqreturn_t timer_event_interrupt(int irq, void *dev_id) 47static irqreturn_t timer_event_interrupt(int irq, void *dev_id)
149{ 48{
150 add_pda(irq0_irqs, 1); 49 add_pda(irq0_irqs, 1);
@@ -154,67 +53,10 @@ static irqreturn_t timer_event_interrupt(int irq, void *dev_id)
154 return IRQ_HANDLED; 53 return IRQ_HANDLED;
155} 54}
156 55
157unsigned long read_persistent_clock(void)
158{
159 unsigned int year, mon, day, hour, min, sec;
160 unsigned long flags;
161 unsigned century = 0;
162
163 spin_lock_irqsave(&rtc_lock, flags);
164 /*
165 * if UIP is clear, then we have >= 244 microseconds before RTC
166 * registers will be updated. Spec sheet says that this is the
167 * reliable way to read RTC - registers invalid (off bus) during update
168 */
169 while ((CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP))
170 cpu_relax();
171
172
173 /* now read all RTC registers while stable with interrupts disabled */
174 sec = CMOS_READ(RTC_SECONDS);
175 min = CMOS_READ(RTC_MINUTES);
176 hour = CMOS_READ(RTC_HOURS);
177 day = CMOS_READ(RTC_DAY_OF_MONTH);
178 mon = CMOS_READ(RTC_MONTH);
179 year = CMOS_READ(RTC_YEAR);
180#ifdef CONFIG_ACPI
181 if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
182 acpi_gbl_FADT.century)
183 century = CMOS_READ(acpi_gbl_FADT.century);
184#endif
185 spin_unlock_irqrestore(&rtc_lock, flags);
186
187 /*
188 * We know that x86-64 always uses BCD format, no need to check the
189 * config register.
190 */
191
192 BCD_TO_BIN(sec);
193 BCD_TO_BIN(min);
194 BCD_TO_BIN(hour);
195 BCD_TO_BIN(day);
196 BCD_TO_BIN(mon);
197 BCD_TO_BIN(year);
198
199 if (century) {
200 BCD_TO_BIN(century);
201 year += century * 100;
202 printk(KERN_INFO "Extended CMOS year: %d\n", century * 100);
203 } else {
204 /*
205 * x86-64 systems only exists since 2002.
206 * This will work up to Dec 31, 2100
207 */
208 year += 2000;
209 }
210
211 return mktime(year, mon, day, hour, min, sec);
212}
213
214/* calibrate_cpu is used on systems with fixed rate TSCs to determine 56/* calibrate_cpu is used on systems with fixed rate TSCs to determine
215 * processor frequency */ 57 * processor frequency */
216#define TICK_COUNT 100000000 58#define TICK_COUNT 100000000
217static unsigned int __init tsc_calibrate_cpu_khz(void) 59unsigned long __init native_calculate_cpu_khz(void)
218{ 60{
219 int tsc_start, tsc_now; 61 int tsc_start, tsc_now;
220 int i, no_ctr_free; 62 int i, no_ctr_free;
@@ -241,7 +83,7 @@ static unsigned int __init tsc_calibrate_cpu_khz(void)
241 rdtscl(tsc_start); 83 rdtscl(tsc_start);
242 do { 84 do {
243 rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); 85 rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
244 tsc_now = get_cycles_sync(); 86 tsc_now = get_cycles();
245 } while ((tsc_now - tsc_start) < TICK_COUNT); 87 } while ((tsc_now - tsc_start) < TICK_COUNT);
246 88
247 local_irq_restore(flags); 89 local_irq_restore(flags);
@@ -264,20 +106,22 @@ static struct irqaction irq0 = {
264 .name = "timer" 106 .name = "timer"
265}; 107};
266 108
267void __init time_init(void) 109void __init hpet_time_init(void)
268{ 110{
269 if (!hpet_enable()) 111 if (!hpet_enable())
270 setup_pit_timer(); 112 setup_pit_timer();
271 113
272 setup_irq(0, &irq0); 114 setup_irq(0, &irq0);
115}
273 116
117void __init time_init(void)
118{
274 tsc_calibrate(); 119 tsc_calibrate();
275 120
276 cpu_khz = tsc_khz; 121 cpu_khz = tsc_khz;
277 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && 122 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
278 boot_cpu_data.x86_vendor == X86_VENDOR_AMD && 123 (boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
279 boot_cpu_data.x86 == 16) 124 cpu_khz = calculate_cpu_khz();
280 cpu_khz = tsc_calibrate_cpu_khz();
281 125
282 if (unsynchronized_tsc()) 126 if (unsynchronized_tsc())
283 mark_tsc_unstable("TSCs unsynchronized"); 127 mark_tsc_unstable("TSCs unsynchronized");
@@ -290,4 +134,5 @@ void __init time_init(void)
290 printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", 134 printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
291 cpu_khz / 1000, cpu_khz % 1000); 135 cpu_khz / 1000, cpu_khz % 1000);
292 init_tsc_clocksource(); 136 init_tsc_clocksource();
137 late_time_init = choose_time_init();
293} 138}
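(Aside, not part of the patch: the calibration above counts TSC cycles against a known reference. A rough user-space estimate of cpu_khz in the same spirit, using the monotonic clock as the reference; it assumes a constant-rate TSC and an x86 compiler that provides __rdtsc(), and the 100 ms window is arbitrary.)

#include <stdio.h>
#include <time.h>
#include <x86intrin.h>

int main(void)
{
	struct timespec t0, t1;
	unsigned long long c0, c1, ns;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	c0 = __rdtsc();

	do {					/* spin for roughly 100 ms */
		clock_gettime(CLOCK_MONOTONIC, &t1);
		ns = (t1.tv_sec - t0.tv_sec) * 1000000000ULL +
		     (t1.tv_nsec - t0.tv_nsec);
	} while (ns < 100000000ULL);

	c1 = __rdtsc();
	printf("estimated cpu_khz: %llu\n", (c1 - c0) * 1000000ULL / ns);
	return 0;
}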
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
new file mode 100644
index 000000000000..6dfd4e76661a
--- /dev/null
+++ b/arch/x86/kernel/tls.c
@@ -0,0 +1,213 @@
1#include <linux/kernel.h>
2#include <linux/errno.h>
3#include <linux/sched.h>
4#include <linux/user.h>
5#include <linux/regset.h>
6
7#include <asm/uaccess.h>
8#include <asm/desc.h>
9#include <asm/system.h>
10#include <asm/ldt.h>
11#include <asm/processor.h>
12#include <asm/proto.h>
13
14#include "tls.h"
15
16/*
17 * sys_alloc_thread_area: get a yet unused TLS descriptor index.
18 */
19static int get_free_idx(void)
20{
21 struct thread_struct *t = &current->thread;
22 int idx;
23
24 for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
25 if (desc_empty(&t->tls_array[idx]))
26 return idx + GDT_ENTRY_TLS_MIN;
27 return -ESRCH;
28}
29
30static void set_tls_desc(struct task_struct *p, int idx,
31 const struct user_desc *info, int n)
32{
33 struct thread_struct *t = &p->thread;
34 struct desc_struct *desc = &t->tls_array[idx - GDT_ENTRY_TLS_MIN];
35 int cpu;
36
37 /*
38 * We must not get preempted while modifying the TLS.
39 */
40 cpu = get_cpu();
41
42 while (n-- > 0) {
43 if (LDT_empty(info))
44 desc->a = desc->b = 0;
45 else
46 fill_ldt(desc, info);
47 ++info;
48 ++desc;
49 }
50
51 if (t == &current->thread)
52 load_TLS(t, cpu);
53
54 put_cpu();
55}
56
57/*
58 * Set a given TLS descriptor:
59 */
60int do_set_thread_area(struct task_struct *p, int idx,
61 struct user_desc __user *u_info,
62 int can_allocate)
63{
64 struct user_desc info;
65
66 if (copy_from_user(&info, u_info, sizeof(info)))
67 return -EFAULT;
68
69 if (idx == -1)
70 idx = info.entry_number;
71
72 /*
73 * index -1 means the kernel should try to find and
74 * allocate an empty descriptor:
75 */
76 if (idx == -1 && can_allocate) {
77 idx = get_free_idx();
78 if (idx < 0)
79 return idx;
80 if (put_user(idx, &u_info->entry_number))
81 return -EFAULT;
82 }
83
84 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
85 return -EINVAL;
86
87 set_tls_desc(p, idx, &info, 1);
88
89 return 0;
90}
91
92asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
93{
94 return do_set_thread_area(current, -1, u_info, 1);
95}
96
97
98/*
99 * Get the current Thread-Local Storage area:
100 */
101
102static void fill_user_desc(struct user_desc *info, int idx,
103 const struct desc_struct *desc)
104
105{
106 memset(info, 0, sizeof(*info));
107 info->entry_number = idx;
108 info->base_addr = get_desc_base(desc);
109 info->limit = get_desc_limit(desc);
110 info->seg_32bit = desc->d;
111 info->contents = desc->type >> 2;
112 info->read_exec_only = !(desc->type & 2);
113 info->limit_in_pages = desc->g;
114 info->seg_not_present = !desc->p;
115 info->useable = desc->avl;
116#ifdef CONFIG_X86_64
117 info->lm = desc->l;
118#endif
119}
120
121int do_get_thread_area(struct task_struct *p, int idx,
122 struct user_desc __user *u_info)
123{
124 struct user_desc info;
125
126 if (idx == -1 && get_user(idx, &u_info->entry_number))
127 return -EFAULT;
128
129 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
130 return -EINVAL;
131
132 fill_user_desc(&info, idx,
133 &p->thread.tls_array[idx - GDT_ENTRY_TLS_MIN]);
134
135 if (copy_to_user(u_info, &info, sizeof(info)))
136 return -EFAULT;
137 return 0;
138}
139
140asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
141{
142 return do_get_thread_area(current, -1, u_info);
143}
144
145int regset_tls_active(struct task_struct *target,
146 const struct user_regset *regset)
147{
148 struct thread_struct *t = &target->thread;
149 int n = GDT_ENTRY_TLS_ENTRIES;
150 while (n > 0 && desc_empty(&t->tls_array[n - 1]))
151 --n;
152 return n;
153}
154
155int regset_tls_get(struct task_struct *target, const struct user_regset *regset,
156 unsigned int pos, unsigned int count,
157 void *kbuf, void __user *ubuf)
158{
159 const struct desc_struct *tls;
160
161 if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
162 (pos % sizeof(struct user_desc)) != 0 ||
163 (count % sizeof(struct user_desc)) != 0)
164 return -EINVAL;
165
166 pos /= sizeof(struct user_desc);
167 count /= sizeof(struct user_desc);
168
169 tls = &target->thread.tls_array[pos];
170
171 if (kbuf) {
172 struct user_desc *info = kbuf;
173 while (count-- > 0)
174 fill_user_desc(info++, GDT_ENTRY_TLS_MIN + pos++,
175 tls++);
176 } else {
177 struct user_desc __user *u_info = ubuf;
178 while (count-- > 0) {
179 struct user_desc info;
180 fill_user_desc(&info, GDT_ENTRY_TLS_MIN + pos++, tls++);
181 if (__copy_to_user(u_info++, &info, sizeof(info)))
182 return -EFAULT;
183 }
184 }
185
186 return 0;
187}
188
189int regset_tls_set(struct task_struct *target, const struct user_regset *regset,
190 unsigned int pos, unsigned int count,
191 const void *kbuf, const void __user *ubuf)
192{
193 struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES];
194 const struct user_desc *info;
195
196 if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
197 (pos % sizeof(struct user_desc)) != 0 ||
198 (count % sizeof(struct user_desc)) != 0)
199 return -EINVAL;
200
201 if (kbuf)
202 info = kbuf;
203 else if (__copy_from_user(infobuf, ubuf, count))
204 return -EFAULT;
205 else
206 info = infobuf;
207
208 set_tls_desc(target,
209 GDT_ENTRY_TLS_MIN + (pos / sizeof(struct user_desc)),
210 info, count / sizeof(struct user_desc));
211
212 return 0;
213}
diff --git a/arch/x86/kernel/tls.h b/arch/x86/kernel/tls.h
new file mode 100644
index 000000000000..2f083a2fe216
--- /dev/null
+++ b/arch/x86/kernel/tls.h
@@ -0,0 +1,21 @@
1/*
2 * Internal declarations for x86 TLS implementation functions.
3 *
4 * Copyright (C) 2007 Red Hat, Inc. All rights reserved.
5 *
6 * This copyrighted material is made available to anyone wishing to use,
7 * modify, copy, or redistribute it subject to the terms and conditions
8 * of the GNU General Public License v.2.
9 *
10 * Red Hat Author: Roland McGrath.
11 */
12
13#ifndef _ARCH_X86_KERNEL_TLS_H
14
15#include <linux/regset.h>
16
17extern user_regset_active_fn regset_tls_active;
18extern user_regset_get_fn regset_tls_get;
19extern user_regset_set_fn regset_tls_set;
20
21#endif /* _ARCH_X86_KERNEL_TLS_H */
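
tls.h only exposes the regset callbacks; the table that references them lives in the ptrace code. Purely as a hedged illustration of how such an entry is typically declared, a sketch follows -- the field values and the table name are assumptions, not copied from this patch.

/*
 * Illustrative only: a user_regset entry wiring up the TLS callbacks.
 * The real table and its note type live in the arch ptrace code.
 */
#include <linux/elf.h>		/* NT_386_TLS */
#include <linux/regset.h>
#include <asm/ldt.h>		/* struct user_desc */
#include <asm/segment.h>	/* GDT_ENTRY_TLS_ENTRIES */

#include "tls.h"

static const struct user_regset example_tls_regset = {
	.core_note_type	= NT_386_TLS,
	.n		= GDT_ENTRY_TLS_ENTRIES,
	.size		= sizeof(struct user_desc),
	.align		= sizeof(struct user_desc),
	.active		= regset_tls_active,
	.get		= regset_tls_get,
	.set		= regset_tls_set,
};
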
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 7e16d675eb85..78cbb655aa79 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -31,9 +31,10 @@
31#include <linux/mmzone.h> 31#include <linux/mmzone.h>
32#include <asm/cpu.h> 32#include <asm/cpu.h>
33 33
34static struct i386_cpu cpu_devices[NR_CPUS]; 34static DEFINE_PER_CPU(struct x86_cpu, cpu_devices);
35 35
36int __cpuinit arch_register_cpu(int num) 36#ifdef CONFIG_HOTPLUG_CPU
37int arch_register_cpu(int num)
37{ 38{
38 /* 39 /*
39 * CPU0 cannot be offlined due to several 40 * CPU0 cannot be offlined due to several
@@ -44,21 +45,23 @@ int __cpuinit arch_register_cpu(int num)
44 * Also certain PCI quirks require not to enable hotplug control 45 * Also certain PCI quirks require not to enable hotplug control
45 * for all CPU's. 46 * for all CPU's.
46 */ 47 */
47#ifdef CONFIG_HOTPLUG_CPU
48 if (num) 48 if (num)
49 cpu_devices[num].cpu.hotpluggable = 1; 49 per_cpu(cpu_devices, num).cpu.hotpluggable = 1;
50#endif 50 return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
51
52 return register_cpu(&cpu_devices[num].cpu, num);
53} 51}
52EXPORT_SYMBOL(arch_register_cpu);
54 53
55#ifdef CONFIG_HOTPLUG_CPU
56void arch_unregister_cpu(int num) 54void arch_unregister_cpu(int num)
57{ 55{
58 return unregister_cpu(&cpu_devices[num].cpu); 56 return unregister_cpu(&per_cpu(cpu_devices, num).cpu);
59} 57}
60EXPORT_SYMBOL(arch_register_cpu);
61EXPORT_SYMBOL(arch_unregister_cpu); 58EXPORT_SYMBOL(arch_unregister_cpu);
59#else
60int arch_register_cpu(int num)
61{
62 return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
63}
64EXPORT_SYMBOL(arch_register_cpu);
62#endif /*CONFIG_HOTPLUG_CPU*/ 65#endif /*CONFIG_HOTPLUG_CPU*/
63 66
64static int __init topology_init(void) 67static int __init topology_init(void)
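
The topology.c hunk converts a static NR_CPUS-sized array into a per-CPU variable and reshuffles the hotplug #ifdefs. The general array-to-per-CPU pattern, as a small hedged sketch with made-up names:

/*
 * Illustrative only: replace "static struct example_state state[NR_CPUS];"
 * with a per-CPU variable and access it via per_cpu(var, cpu).
 */
#include <linux/percpu.h>

struct example_state {
	int online_count;
};

static DEFINE_PER_CPU(struct example_state, example_state);

static void example_mark_online(int cpu)
{
	per_cpu(example_state, cpu).online_count++;
}
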
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
index 02d1e1e58e81..3cf72977d012 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps_32.c
@@ -76,7 +76,8 @@ char ignore_fpu_irq = 0;
76 * F0 0F bug workaround.. We have a special link segment 76 * F0 0F bug workaround.. We have a special link segment
77 * for this. 77 * for this.
78 */ 78 */
79struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, }; 79gate_desc idt_table[256]
80 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
80 81
81asmlinkage void divide_error(void); 82asmlinkage void divide_error(void);
82asmlinkage void debug(void); 83asmlinkage void debug(void);
@@ -101,6 +102,34 @@ asmlinkage void machine_check(void);
101int kstack_depth_to_print = 24; 102int kstack_depth_to_print = 24;
102static unsigned int code_bytes = 64; 103static unsigned int code_bytes = 64;
103 104
105void printk_address(unsigned long address, int reliable)
106{
107#ifdef CONFIG_KALLSYMS
108 unsigned long offset = 0, symsize;
109 const char *symname;
110 char *modname;
111 char *delim = ":";
112 char namebuf[128];
113 char reliab[4] = "";
114
115 symname = kallsyms_lookup(address, &symsize, &offset,
116 &modname, namebuf);
117 if (!symname) {
118 printk(" [<%08lx>]\n", address);
119 return;
120 }
121 if (!reliable)
122 strcpy(reliab, "? ");
123
124 if (!modname)
125 modname = delim = "";
126 printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
127 address, reliab, delim, modname, delim, symname, offset, symsize);
128#else
129 printk(" [<%08lx>]\n", address);
130#endif
131}
132
104static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size) 133static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size)
105{ 134{
106 return p > (void *)tinfo && 135 return p > (void *)tinfo &&
@@ -114,48 +143,35 @@ struct stack_frame {
114}; 143};
115 144
116static inline unsigned long print_context_stack(struct thread_info *tinfo, 145static inline unsigned long print_context_stack(struct thread_info *tinfo,
117 unsigned long *stack, unsigned long ebp, 146 unsigned long *stack, unsigned long bp,
118 const struct stacktrace_ops *ops, void *data) 147 const struct stacktrace_ops *ops, void *data)
119{ 148{
120#ifdef CONFIG_FRAME_POINTER 149 struct stack_frame *frame = (struct stack_frame *)bp;
121 struct stack_frame *frame = (struct stack_frame *)ebp;
122 while (valid_stack_ptr(tinfo, frame, sizeof(*frame))) {
123 struct stack_frame *next;
124 unsigned long addr;
125 150
126 addr = frame->return_address;
127 ops->address(data, addr);
128 /*
129 * break out of recursive entries (such as
130 * end_of_stack_stop_unwind_function). Also,
131 * we can never allow a frame pointer to
132 * move downwards!
133 */
134 next = frame->next_frame;
135 if (next <= frame)
136 break;
137 frame = next;
138 }
139#else
140 while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) { 151 while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) {
141 unsigned long addr; 152 unsigned long addr;
142 153
143 addr = *stack++; 154 addr = *stack;
144 if (__kernel_text_address(addr)) 155 if (__kernel_text_address(addr)) {
145 ops->address(data, addr); 156 if ((unsigned long) stack == bp + 4) {
157 ops->address(data, addr, 1);
158 frame = frame->next_frame;
159 bp = (unsigned long) frame;
160 } else {
161 ops->address(data, addr, bp == 0);
162 }
163 }
164 stack++;
146 } 165 }
147#endif 166 return bp;
148 return ebp;
149} 167}
150 168
151#define MSG(msg) ops->warning(data, msg) 169#define MSG(msg) ops->warning(data, msg)
152 170
153void dump_trace(struct task_struct *task, struct pt_regs *regs, 171void dump_trace(struct task_struct *task, struct pt_regs *regs,
154 unsigned long *stack, 172 unsigned long *stack, unsigned long bp,
155 const struct stacktrace_ops *ops, void *data) 173 const struct stacktrace_ops *ops, void *data)
156{ 174{
157 unsigned long ebp = 0;
158
159 if (!task) 175 if (!task)
160 task = current; 176 task = current;
161 177
@@ -163,17 +179,17 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
163 unsigned long dummy; 179 unsigned long dummy;
164 stack = &dummy; 180 stack = &dummy;
165 if (task != current) 181 if (task != current)
166 stack = (unsigned long *)task->thread.esp; 182 stack = (unsigned long *)task->thread.sp;
167 } 183 }
168 184
169#ifdef CONFIG_FRAME_POINTER 185#ifdef CONFIG_FRAME_POINTER
170 if (!ebp) { 186 if (!bp) {
171 if (task == current) { 187 if (task == current) {
172 /* Grab ebp right from our regs */ 188 /* Grab bp right from our regs */
173 asm ("movl %%ebp, %0" : "=r" (ebp) : ); 189 asm ("movl %%ebp, %0" : "=r" (bp) : );
174 } else { 190 } else {
175 /* ebp is the last reg pushed by switch_to */ 191 /* bp is the last reg pushed by switch_to */
176 ebp = *(unsigned long *) task->thread.esp; 192 bp = *(unsigned long *) task->thread.sp;
177 } 193 }
178 } 194 }
179#endif 195#endif
@@ -182,7 +198,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
182 struct thread_info *context; 198 struct thread_info *context;
183 context = (struct thread_info *) 199 context = (struct thread_info *)
184 ((unsigned long)stack & (~(THREAD_SIZE - 1))); 200 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
185 ebp = print_context_stack(context, stack, ebp, ops, data); 201 bp = print_context_stack(context, stack, bp, ops, data);
186 /* Should be after the line below, but somewhere 202 /* Should be after the line below, but somewhere
187 in early boot context comes out corrupted and we 203 in early boot context comes out corrupted and we
188 can't reference it -AK */ 204 can't reference it -AK */
@@ -217,9 +233,11 @@ static int print_trace_stack(void *data, char *name)
217/* 233/*
218 * Print one address/symbol entries per line. 234 * Print one address/symbol entries per line.
219 */ 235 */
220static void print_trace_address(void *data, unsigned long addr) 236static void print_trace_address(void *data, unsigned long addr, int reliable)
221{ 237{
222 printk("%s [<%08lx>] ", (char *)data, addr); 238 printk("%s [<%08lx>] ", (char *)data, addr);
239 if (!reliable)
240 printk("? ");
223 print_symbol("%s\n", addr); 241 print_symbol("%s\n", addr);
224 touch_nmi_watchdog(); 242 touch_nmi_watchdog();
225} 243}
@@ -233,32 +251,32 @@ static const struct stacktrace_ops print_trace_ops = {
233 251
234static void 252static void
235show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, 253show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
236 unsigned long * stack, char *log_lvl) 254 unsigned long *stack, unsigned long bp, char *log_lvl)
237{ 255{
238 dump_trace(task, regs, stack, &print_trace_ops, log_lvl); 256 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
239 printk("%s =======================\n", log_lvl); 257 printk("%s =======================\n", log_lvl);
240} 258}
241 259
242void show_trace(struct task_struct *task, struct pt_regs *regs, 260void show_trace(struct task_struct *task, struct pt_regs *regs,
243 unsigned long * stack) 261 unsigned long *stack, unsigned long bp)
244{ 262{
245 show_trace_log_lvl(task, regs, stack, ""); 263 show_trace_log_lvl(task, regs, stack, bp, "");
246} 264}
247 265
248static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 266static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
249 unsigned long *esp, char *log_lvl) 267 unsigned long *sp, unsigned long bp, char *log_lvl)
250{ 268{
251 unsigned long *stack; 269 unsigned long *stack;
252 int i; 270 int i;
253 271
254 if (esp == NULL) { 272 if (sp == NULL) {
255 if (task) 273 if (task)
256 esp = (unsigned long*)task->thread.esp; 274 sp = (unsigned long*)task->thread.sp;
257 else 275 else
258 esp = (unsigned long *)&esp; 276 sp = (unsigned long *)&sp;
259 } 277 }
260 278
261 stack = esp; 279 stack = sp;
262 for(i = 0; i < kstack_depth_to_print; i++) { 280 for(i = 0; i < kstack_depth_to_print; i++) {
263 if (kstack_end(stack)) 281 if (kstack_end(stack))
264 break; 282 break;
@@ -267,13 +285,13 @@ static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
267 printk("%08lx ", *stack++); 285 printk("%08lx ", *stack++);
268 } 286 }
269 printk("\n%sCall Trace:\n", log_lvl); 287 printk("\n%sCall Trace:\n", log_lvl);
270 show_trace_log_lvl(task, regs, esp, log_lvl); 288 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
271} 289}
272 290
273void show_stack(struct task_struct *task, unsigned long *esp) 291void show_stack(struct task_struct *task, unsigned long *sp)
274{ 292{
275 printk(" "); 293 printk(" ");
276 show_stack_log_lvl(task, NULL, esp, ""); 294 show_stack_log_lvl(task, NULL, sp, 0, "");
277} 295}
278 296
279/* 297/*
@@ -282,13 +300,19 @@ void show_stack(struct task_struct *task, unsigned long *esp)
282void dump_stack(void) 300void dump_stack(void)
283{ 301{
284 unsigned long stack; 302 unsigned long stack;
303 unsigned long bp = 0;
304
305#ifdef CONFIG_FRAME_POINTER
306 if (!bp)
307 asm("movl %%ebp, %0" : "=r" (bp):);
308#endif
285 309
286 printk("Pid: %d, comm: %.20s %s %s %.*s\n", 310 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
287 current->pid, current->comm, print_tainted(), 311 current->pid, current->comm, print_tainted(),
288 init_utsname()->release, 312 init_utsname()->release,
289 (int)strcspn(init_utsname()->version, " "), 313 (int)strcspn(init_utsname()->version, " "),
290 init_utsname()->version); 314 init_utsname()->version);
291 show_trace(current, NULL, &stack); 315 show_trace(current, NULL, &stack, bp);
292} 316}
293 317
294EXPORT_SYMBOL(dump_stack); 318EXPORT_SYMBOL(dump_stack);
@@ -307,30 +331,30 @@ void show_registers(struct pt_regs *regs)
307 * time of the fault.. 331 * time of the fault..
308 */ 332 */
309 if (!user_mode_vm(regs)) { 333 if (!user_mode_vm(regs)) {
310 u8 *eip; 334 u8 *ip;
311 unsigned int code_prologue = code_bytes * 43 / 64; 335 unsigned int code_prologue = code_bytes * 43 / 64;
312 unsigned int code_len = code_bytes; 336 unsigned int code_len = code_bytes;
313 unsigned char c; 337 unsigned char c;
314 338
315 printk("\n" KERN_EMERG "Stack: "); 339 printk("\n" KERN_EMERG "Stack: ");
316 show_stack_log_lvl(NULL, regs, &regs->esp, KERN_EMERG); 340 show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
317 341
318 printk(KERN_EMERG "Code: "); 342 printk(KERN_EMERG "Code: ");
319 343
320 eip = (u8 *)regs->eip - code_prologue; 344 ip = (u8 *)regs->ip - code_prologue;
321 if (eip < (u8 *)PAGE_OFFSET || 345 if (ip < (u8 *)PAGE_OFFSET ||
322 probe_kernel_address(eip, c)) { 346 probe_kernel_address(ip, c)) {
323 /* try starting at EIP */ 347 /* try starting at EIP */
324 eip = (u8 *)regs->eip; 348 ip = (u8 *)regs->ip;
325 code_len = code_len - code_prologue + 1; 349 code_len = code_len - code_prologue + 1;
326 } 350 }
327 for (i = 0; i < code_len; i++, eip++) { 351 for (i = 0; i < code_len; i++, ip++) {
328 if (eip < (u8 *)PAGE_OFFSET || 352 if (ip < (u8 *)PAGE_OFFSET ||
329 probe_kernel_address(eip, c)) { 353 probe_kernel_address(ip, c)) {
330 printk(" Bad EIP value."); 354 printk(" Bad EIP value.");
331 break; 355 break;
332 } 356 }
333 if (eip == (u8 *)regs->eip) 357 if (ip == (u8 *)regs->ip)
334 printk("<%02x> ", c); 358 printk("<%02x> ", c);
335 else 359 else
336 printk("%02x ", c); 360 printk("%02x ", c);
@@ -339,18 +363,57 @@ void show_registers(struct pt_regs *regs)
339 printk("\n"); 363 printk("\n");
340} 364}
341 365
342int is_valid_bugaddr(unsigned long eip) 366int is_valid_bugaddr(unsigned long ip)
343{ 367{
344 unsigned short ud2; 368 unsigned short ud2;
345 369
346 if (eip < PAGE_OFFSET) 370 if (ip < PAGE_OFFSET)
347 return 0; 371 return 0;
348 if (probe_kernel_address((unsigned short *)eip, ud2)) 372 if (probe_kernel_address((unsigned short *)ip, ud2))
349 return 0; 373 return 0;
350 374
351 return ud2 == 0x0b0f; 375 return ud2 == 0x0b0f;
352} 376}
353 377
378static int die_counter;
379
380int __kprobes __die(const char * str, struct pt_regs * regs, long err)
381{
382 unsigned long sp;
383 unsigned short ss;
384
385 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
386#ifdef CONFIG_PREEMPT
387 printk("PREEMPT ");
388#endif
389#ifdef CONFIG_SMP
390 printk("SMP ");
391#endif
392#ifdef CONFIG_DEBUG_PAGEALLOC
393 printk("DEBUG_PAGEALLOC");
394#endif
395 printk("\n");
396
397 if (notify_die(DIE_OOPS, str, regs, err,
398 current->thread.trap_no, SIGSEGV) !=
399 NOTIFY_STOP) {
400 show_registers(regs);
401 /* Executive summary in case the oops scrolled away */
402 sp = (unsigned long) (&regs->sp);
403 savesegment(ss, ss);
404 if (user_mode(regs)) {
405 sp = regs->sp;
406 ss = regs->ss & 0xffff;
407 }
408 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
409 print_symbol("%s", regs->ip);
410 printk(" SS:ESP %04x:%08lx\n", ss, sp);
411 return 0;
412 } else {
413 return 1;
414 }
415}
416
354/* 417/*
355 * This is gone through when something in the kernel has done something bad and 418 * This is gone through when something in the kernel has done something bad and
356 * is about to be terminated. 419 * is about to be terminated.
@@ -366,7 +429,6 @@ void die(const char * str, struct pt_regs * regs, long err)
366 .lock_owner = -1, 429 .lock_owner = -1,
367 .lock_owner_depth = 0 430 .lock_owner_depth = 0
368 }; 431 };
369 static int die_counter;
370 unsigned long flags; 432 unsigned long flags;
371 433
372 oops_enter(); 434 oops_enter();
@@ -382,43 +444,13 @@ void die(const char * str, struct pt_regs * regs, long err)
382 raw_local_irq_save(flags); 444 raw_local_irq_save(flags);
383 445
384 if (++die.lock_owner_depth < 3) { 446 if (++die.lock_owner_depth < 3) {
385 unsigned long esp; 447 report_bug(regs->ip, regs);
386 unsigned short ss;
387 448
388 report_bug(regs->eip, regs); 449 if (__die(str, regs, err))
389
390 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff,
391 ++die_counter);
392#ifdef CONFIG_PREEMPT
393 printk("PREEMPT ");
394#endif
395#ifdef CONFIG_SMP
396 printk("SMP ");
397#endif
398#ifdef CONFIG_DEBUG_PAGEALLOC
399 printk("DEBUG_PAGEALLOC");
400#endif
401 printk("\n");
402
403 if (notify_die(DIE_OOPS, str, regs, err,
404 current->thread.trap_no, SIGSEGV) !=
405 NOTIFY_STOP) {
406 show_registers(regs);
407 /* Executive summary in case the oops scrolled away */
408 esp = (unsigned long) (&regs->esp);
409 savesegment(ss, ss);
410 if (user_mode(regs)) {
411 esp = regs->esp;
412 ss = regs->xss & 0xffff;
413 }
414 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip);
415 print_symbol("%s", regs->eip);
416 printk(" SS:ESP %04x:%08lx\n", ss, esp);
417 }
418 else
419 regs = NULL; 450 regs = NULL;
420 } else 451 } else {
421 printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); 452 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
453 }
422 454
423 bust_spinlocks(0); 455 bust_spinlocks(0);
424 die.lock_owner = -1; 456 die.lock_owner = -1;
@@ -454,7 +486,7 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
454{ 486{
455 struct task_struct *tsk = current; 487 struct task_struct *tsk = current;
456 488
457 if (regs->eflags & VM_MASK) { 489 if (regs->flags & VM_MASK) {
458 if (vm86) 490 if (vm86)
459 goto vm86_trap; 491 goto vm86_trap;
460 goto trap_signal; 492 goto trap_signal;
@@ -500,7 +532,7 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
500} 532}
501 533
502#define DO_ERROR(trapnr, signr, str, name) \ 534#define DO_ERROR(trapnr, signr, str, name) \
503fastcall void do_##name(struct pt_regs * regs, long error_code) \ 535void do_##name(struct pt_regs * regs, long error_code) \
504{ \ 536{ \
505 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 537 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
506 == NOTIFY_STOP) \ 538 == NOTIFY_STOP) \
@@ -509,7 +541,7 @@ fastcall void do_##name(struct pt_regs * regs, long error_code) \
509} 541}
510 542
511#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \ 543#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
512fastcall void do_##name(struct pt_regs * regs, long error_code) \ 544void do_##name(struct pt_regs * regs, long error_code) \
513{ \ 545{ \
514 siginfo_t info; \ 546 siginfo_t info; \
515 if (irq) \ 547 if (irq) \
@@ -525,7 +557,7 @@ fastcall void do_##name(struct pt_regs * regs, long error_code) \
525} 557}
526 558
527#define DO_VM86_ERROR(trapnr, signr, str, name) \ 559#define DO_VM86_ERROR(trapnr, signr, str, name) \
528fastcall void do_##name(struct pt_regs * regs, long error_code) \ 560void do_##name(struct pt_regs * regs, long error_code) \
529{ \ 561{ \
530 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 562 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
531 == NOTIFY_STOP) \ 563 == NOTIFY_STOP) \
@@ -534,7 +566,7 @@ fastcall void do_##name(struct pt_regs * regs, long error_code) \
534} 566}
535 567
536#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ 568#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
537fastcall void do_##name(struct pt_regs * regs, long error_code) \ 569void do_##name(struct pt_regs * regs, long error_code) \
538{ \ 570{ \
539 siginfo_t info; \ 571 siginfo_t info; \
540 info.si_signo = signr; \ 572 info.si_signo = signr; \
@@ -548,13 +580,13 @@ fastcall void do_##name(struct pt_regs * regs, long error_code) \
548 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ 580 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
549} 581}
550 582
551DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip) 583DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
552#ifndef CONFIG_KPROBES 584#ifndef CONFIG_KPROBES
553DO_VM86_ERROR( 3, SIGTRAP, "int3", int3) 585DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
554#endif 586#endif
555DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow) 587DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
556DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds) 588DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
557DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip, 0) 589DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
558DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) 590DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
559DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) 591DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
560DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) 592DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
@@ -562,7 +594,7 @@ DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
562DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) 594DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
563DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1) 595DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1)
564 596
565fastcall void __kprobes do_general_protection(struct pt_regs * regs, 597void __kprobes do_general_protection(struct pt_regs * regs,
566 long error_code) 598 long error_code)
567{ 599{
568 int cpu = get_cpu(); 600 int cpu = get_cpu();
@@ -596,7 +628,7 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs,
596 } 628 }
597 put_cpu(); 629 put_cpu();
598 630
599 if (regs->eflags & VM_MASK) 631 if (regs->flags & VM_MASK)
600 goto gp_in_vm86; 632 goto gp_in_vm86;
601 633
602 if (!user_mode(regs)) 634 if (!user_mode(regs))
@@ -605,11 +637,14 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs,
605 current->thread.error_code = error_code; 637 current->thread.error_code = error_code;
606 current->thread.trap_no = 13; 638 current->thread.trap_no = 13;
607 if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) && 639 if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
608 printk_ratelimit()) 640 printk_ratelimit()) {
609 printk(KERN_INFO 641 printk(KERN_INFO
610 "%s[%d] general protection eip:%lx esp:%lx error:%lx\n", 642 "%s[%d] general protection ip:%lx sp:%lx error:%lx",
611 current->comm, task_pid_nr(current), 643 current->comm, task_pid_nr(current),
612 regs->eip, regs->esp, error_code); 644 regs->ip, regs->sp, error_code);
645 print_vma_addr(" in ", regs->ip);
646 printk("\n");
647 }
613 648
614 force_sig(SIGSEGV, current); 649 force_sig(SIGSEGV, current);
615 return; 650 return;
@@ -705,8 +740,8 @@ void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
705 */ 740 */
706 bust_spinlocks(1); 741 bust_spinlocks(1);
707 printk(KERN_EMERG "%s", msg); 742 printk(KERN_EMERG "%s", msg);
708 printk(" on CPU%d, eip %08lx, registers:\n", 743 printk(" on CPU%d, ip %08lx, registers:\n",
709 smp_processor_id(), regs->eip); 744 smp_processor_id(), regs->ip);
710 show_registers(regs); 745 show_registers(regs);
711 console_silent(); 746 console_silent();
712 spin_unlock(&nmi_print_lock); 747 spin_unlock(&nmi_print_lock);
@@ -763,7 +798,7 @@ static __kprobes void default_do_nmi(struct pt_regs * regs)
763 798
764static int ignore_nmis; 799static int ignore_nmis;
765 800
766fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code) 801__kprobes void do_nmi(struct pt_regs * regs, long error_code)
767{ 802{
768 int cpu; 803 int cpu;
769 804
@@ -792,7 +827,7 @@ void restart_nmi(void)
792} 827}
793 828
794#ifdef CONFIG_KPROBES 829#ifdef CONFIG_KPROBES
795fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code) 830void __kprobes do_int3(struct pt_regs *regs, long error_code)
796{ 831{
797 trace_hardirqs_fixup(); 832 trace_hardirqs_fixup();
798 833
@@ -828,7 +863,7 @@ fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
828 * find every occurrence of the TF bit that could be saved away even 863 * find every occurrence of the TF bit that could be saved away even
829 * by user code) 864 * by user code)
830 */ 865 */
831fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code) 866void __kprobes do_debug(struct pt_regs * regs, long error_code)
832{ 867{
833 unsigned int condition; 868 unsigned int condition;
834 struct task_struct *tsk = current; 869 struct task_struct *tsk = current;
@@ -837,24 +872,30 @@ fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code)
837 872
838 get_debugreg(condition, 6); 873 get_debugreg(condition, 6);
839 874
875 /*
876 * The processor cleared BTF, so don't mark that we need it set.
877 */
878 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
879 tsk->thread.debugctlmsr = 0;
880
840 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, 881 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
841 SIGTRAP) == NOTIFY_STOP) 882 SIGTRAP) == NOTIFY_STOP)
842 return; 883 return;
843 /* It's safe to allow irq's after DR6 has been saved */ 884 /* It's safe to allow irq's after DR6 has been saved */
844 if (regs->eflags & X86_EFLAGS_IF) 885 if (regs->flags & X86_EFLAGS_IF)
845 local_irq_enable(); 886 local_irq_enable();
846 887
847 /* Mask out spurious debug traps due to lazy DR7 setting */ 888 /* Mask out spurious debug traps due to lazy DR7 setting */
848 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { 889 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
849 if (!tsk->thread.debugreg[7]) 890 if (!tsk->thread.debugreg7)
850 goto clear_dr7; 891 goto clear_dr7;
851 } 892 }
852 893
853 if (regs->eflags & VM_MASK) 894 if (regs->flags & VM_MASK)
854 goto debug_vm86; 895 goto debug_vm86;
855 896
856 /* Save debug status register where ptrace can see it */ 897 /* Save debug status register where ptrace can see it */
857 tsk->thread.debugreg[6] = condition; 898 tsk->thread.debugreg6 = condition;
858 899
859 /* 900 /*
860 * Single-stepping through TF: make sure we ignore any events in 901 * Single-stepping through TF: make sure we ignore any events in
@@ -886,7 +927,7 @@ debug_vm86:
886 927
887clear_TF_reenable: 928clear_TF_reenable:
888 set_tsk_thread_flag(tsk, TIF_SINGLESTEP); 929 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
889 regs->eflags &= ~TF_MASK; 930 regs->flags &= ~TF_MASK;
890 return; 931 return;
891} 932}
892 933
@@ -895,7 +936,7 @@ clear_TF_reenable:
895 * the correct behaviour even in the presence of the asynchronous 936 * the correct behaviour even in the presence of the asynchronous
896 * IRQ13 behaviour 937 * IRQ13 behaviour
897 */ 938 */
898void math_error(void __user *eip) 939void math_error(void __user *ip)
899{ 940{
900 struct task_struct * task; 941 struct task_struct * task;
901 siginfo_t info; 942 siginfo_t info;
@@ -911,7 +952,7 @@ void math_error(void __user *eip)
911 info.si_signo = SIGFPE; 952 info.si_signo = SIGFPE;
912 info.si_errno = 0; 953 info.si_errno = 0;
913 info.si_code = __SI_FAULT; 954 info.si_code = __SI_FAULT;
914 info.si_addr = eip; 955 info.si_addr = ip;
915 /* 956 /*
916 * (~cwd & swd) will mask out exceptions that are not set to unmasked 957 * (~cwd & swd) will mask out exceptions that are not set to unmasked
917 * status. 0x3f is the exception bits in these regs, 0x200 is the 958 * status. 0x3f is the exception bits in these regs, 0x200 is the
@@ -954,13 +995,13 @@ void math_error(void __user *eip)
954 force_sig_info(SIGFPE, &info, task); 995 force_sig_info(SIGFPE, &info, task);
955} 996}
956 997
957fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code) 998void do_coprocessor_error(struct pt_regs * regs, long error_code)
958{ 999{
959 ignore_fpu_irq = 1; 1000 ignore_fpu_irq = 1;
960 math_error((void __user *)regs->eip); 1001 math_error((void __user *)regs->ip);
961} 1002}
962 1003
963static void simd_math_error(void __user *eip) 1004static void simd_math_error(void __user *ip)
964{ 1005{
965 struct task_struct * task; 1006 struct task_struct * task;
966 siginfo_t info; 1007 siginfo_t info;
@@ -976,7 +1017,7 @@ static void simd_math_error(void __user *eip)
976 info.si_signo = SIGFPE; 1017 info.si_signo = SIGFPE;
977 info.si_errno = 0; 1018 info.si_errno = 0;
978 info.si_code = __SI_FAULT; 1019 info.si_code = __SI_FAULT;
979 info.si_addr = eip; 1020 info.si_addr = ip;
980 /* 1021 /*
981 * The SIMD FPU exceptions are handled a little differently, as there 1022 * The SIMD FPU exceptions are handled a little differently, as there
982 * is only a single status/control register. Thus, to determine which 1023 * is only a single status/control register. Thus, to determine which
@@ -1008,19 +1049,19 @@ static void simd_math_error(void __user *eip)
1008 force_sig_info(SIGFPE, &info, task); 1049 force_sig_info(SIGFPE, &info, task);
1009} 1050}
1010 1051
1011fastcall void do_simd_coprocessor_error(struct pt_regs * regs, 1052void do_simd_coprocessor_error(struct pt_regs * regs,
1012 long error_code) 1053 long error_code)
1013{ 1054{
1014 if (cpu_has_xmm) { 1055 if (cpu_has_xmm) {
1015 /* Handle SIMD FPU exceptions on PIII+ processors. */ 1056 /* Handle SIMD FPU exceptions on PIII+ processors. */
1016 ignore_fpu_irq = 1; 1057 ignore_fpu_irq = 1;
1017 simd_math_error((void __user *)regs->eip); 1058 simd_math_error((void __user *)regs->ip);
1018 } else { 1059 } else {
1019 /* 1060 /*
1020 * Handle strange cache flush from user space exception 1061 * Handle strange cache flush from user space exception
1021 * in all other cases. This is undocumented behaviour. 1062 * in all other cases. This is undocumented behaviour.
1022 */ 1063 */
1023 if (regs->eflags & VM_MASK) { 1064 if (regs->flags & VM_MASK) {
1024 handle_vm86_fault((struct kernel_vm86_regs *)regs, 1065 handle_vm86_fault((struct kernel_vm86_regs *)regs,
1025 error_code); 1066 error_code);
1026 return; 1067 return;
@@ -1032,7 +1073,7 @@ fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
1032 } 1073 }
1033} 1074}
1034 1075
1035fastcall void do_spurious_interrupt_bug(struct pt_regs * regs, 1076void do_spurious_interrupt_bug(struct pt_regs * regs,
1036 long error_code) 1077 long error_code)
1037{ 1078{
1038#if 0 1079#if 0
@@ -1041,7 +1082,7 @@ fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
1041#endif 1082#endif
1042} 1083}
1043 1084
1044fastcall unsigned long patch_espfix_desc(unsigned long uesp, 1085unsigned long patch_espfix_desc(unsigned long uesp,
1045 unsigned long kesp) 1086 unsigned long kesp)
1046{ 1087{
1047 struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt; 1088 struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
@@ -1095,51 +1136,17 @@ asmlinkage void math_emulate(long arg)
1095 1136
1096#endif /* CONFIG_MATH_EMULATION */ 1137#endif /* CONFIG_MATH_EMULATION */
1097 1138
1098/*
1099 * This needs to use 'idt_table' rather than 'idt', and
1100 * thus use the _nonmapped_ version of the IDT, as the
1101 * Pentium F0 0F bugfix can have resulted in the mapped
1102 * IDT being write-protected.
1103 */
1104void set_intr_gate(unsigned int n, void *addr)
1105{
1106 _set_gate(n, DESCTYPE_INT, addr, __KERNEL_CS);
1107}
1108
1109/*
1110 * This routine sets up an interrupt gate at directory privilege level 3.
1111 */
1112static inline void set_system_intr_gate(unsigned int n, void *addr)
1113{
1114 _set_gate(n, DESCTYPE_INT | DESCTYPE_DPL3, addr, __KERNEL_CS);
1115}
1116
1117static void __init set_trap_gate(unsigned int n, void *addr)
1118{
1119 _set_gate(n, DESCTYPE_TRAP, addr, __KERNEL_CS);
1120}
1121
1122static void __init set_system_gate(unsigned int n, void *addr)
1123{
1124 _set_gate(n, DESCTYPE_TRAP | DESCTYPE_DPL3, addr, __KERNEL_CS);
1125}
1126
1127static void __init set_task_gate(unsigned int n, unsigned int gdt_entry)
1128{
1129 _set_gate(n, DESCTYPE_TASK, (void *)0, (gdt_entry<<3));
1130}
1131
1132 1139
1133void __init trap_init(void) 1140void __init trap_init(void)
1134{ 1141{
1135 int i; 1142 int i;
1136 1143
1137#ifdef CONFIG_EISA 1144#ifdef CONFIG_EISA
1138 void __iomem *p = ioremap(0x0FFFD9, 4); 1145 void __iomem *p = early_ioremap(0x0FFFD9, 4);
1139 if (readl(p) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) { 1146 if (readl(p) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) {
1140 EISA_bus = 1; 1147 EISA_bus = 1;
1141 } 1148 }
1142 iounmap(p); 1149 early_iounmap(p, 4);
1143#endif 1150#endif
1144 1151
1145#ifdef CONFIG_X86_LOCAL_APIC 1152#ifdef CONFIG_X86_LOCAL_APIC
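
The reworked print_context_stack() in traps_32.c (and its 64-bit twin in the next file) scans every stack word but also follows saved frame pointers, marking an address "reliable" only when it sits in the return slot of the current frame (bp + 4 on 32-bit, bp + 8 on 64-bit). A standalone hedged sketch of just the frame-pointer part of that walk, with the thread-stack range checks omitted:

/*
 * Illustrative only: follow saved frame pointers, reporting each return
 * address as reliable.  Frame pointers must move up the stack, so stop
 * as soon as the chain goes backwards or repeats.
 */
struct frame {
	struct frame *next_frame;
	unsigned long return_address;
};

static void walk_frames(unsigned long bp,
			void (*report)(unsigned long addr, int reliable))
{
	struct frame *frame = (struct frame *)bp;

	while (frame) {
		struct frame *next = frame->next_frame;

		report(frame->return_address, 1);	/* known return slot */
		if (next <= frame)			/* stacks grow down */
			break;
		frame = next;
	}
}
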
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index cc68b92316cd..efc66df728b6 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -74,22 +74,24 @@ asmlinkage void alignment_check(void);
74asmlinkage void machine_check(void); 74asmlinkage void machine_check(void);
75asmlinkage void spurious_interrupt_bug(void); 75asmlinkage void spurious_interrupt_bug(void);
76 76
77static unsigned int code_bytes = 64;
78
77static inline void conditional_sti(struct pt_regs *regs) 79static inline void conditional_sti(struct pt_regs *regs)
78{ 80{
79 if (regs->eflags & X86_EFLAGS_IF) 81 if (regs->flags & X86_EFLAGS_IF)
80 local_irq_enable(); 82 local_irq_enable();
81} 83}
82 84
83static inline void preempt_conditional_sti(struct pt_regs *regs) 85static inline void preempt_conditional_sti(struct pt_regs *regs)
84{ 86{
85 preempt_disable(); 87 preempt_disable();
86 if (regs->eflags & X86_EFLAGS_IF) 88 if (regs->flags & X86_EFLAGS_IF)
87 local_irq_enable(); 89 local_irq_enable();
88} 90}
89 91
90static inline void preempt_conditional_cli(struct pt_regs *regs) 92static inline void preempt_conditional_cli(struct pt_regs *regs)
91{ 93{
92 if (regs->eflags & X86_EFLAGS_IF) 94 if (regs->flags & X86_EFLAGS_IF)
93 local_irq_disable(); 95 local_irq_disable();
94 /* Make sure to not schedule here because we could be running 96 /* Make sure to not schedule here because we could be running
95 on an exception stack. */ 97 on an exception stack. */
@@ -98,14 +100,15 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
98 100
99int kstack_depth_to_print = 12; 101int kstack_depth_to_print = 12;
100 102
101#ifdef CONFIG_KALLSYMS 103void printk_address(unsigned long address, int reliable)
102void printk_address(unsigned long address)
103{ 104{
105#ifdef CONFIG_KALLSYMS
104 unsigned long offset = 0, symsize; 106 unsigned long offset = 0, symsize;
105 const char *symname; 107 const char *symname;
106 char *modname; 108 char *modname;
107 char *delim = ":"; 109 char *delim = ":";
108 char namebuf[128]; 110 char namebuf[KSYM_NAME_LEN];
111 char reliab[4] = "";
109 112
110 symname = kallsyms_lookup(address, &symsize, &offset, 113 symname = kallsyms_lookup(address, &symsize, &offset,
111 &modname, namebuf); 114 &modname, namebuf);
@@ -113,17 +116,17 @@ void printk_address(unsigned long address)
113 printk(" [<%016lx>]\n", address); 116 printk(" [<%016lx>]\n", address);
114 return; 117 return;
115 } 118 }
119 if (!reliable)
120 strcpy(reliab, "? ");
121
116 if (!modname) 122 if (!modname)
117 modname = delim = ""; 123 modname = delim = "";
118 printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n", 124 printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
119 address, delim, modname, delim, symname, offset, symsize); 125 address, reliab, delim, modname, delim, symname, offset, symsize);
120}
121#else 126#else
122void printk_address(unsigned long address)
123{
124 printk(" [<%016lx>]\n", address); 127 printk(" [<%016lx>]\n", address);
125}
126#endif 128#endif
129}
127 130
128static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 131static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
129 unsigned *usedp, char **idp) 132 unsigned *usedp, char **idp)
@@ -208,14 +211,53 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
208 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack 211 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
209 */ 212 */
210 213
211static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) 214static inline int valid_stack_ptr(struct thread_info *tinfo,
215 void *p, unsigned int size, void *end)
216{
217 void *t = tinfo;
218 if (end) {
219 if (p < end && p >= (end-THREAD_SIZE))
220 return 1;
221 else
222 return 0;
223 }
224 return p > t && p < t + THREAD_SIZE - size;
225}
226
227/* The form of the top of the frame on the stack */
228struct stack_frame {
229 struct stack_frame *next_frame;
230 unsigned long return_address;
231};
232
233
234static inline unsigned long print_context_stack(struct thread_info *tinfo,
235 unsigned long *stack, unsigned long bp,
236 const struct stacktrace_ops *ops, void *data,
237 unsigned long *end)
212{ 238{
213 void *t = (void *)tinfo; 239 struct stack_frame *frame = (struct stack_frame *)bp;
214 return p > t && p < t + THREAD_SIZE - 3; 240
241 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
242 unsigned long addr;
243
244 addr = *stack;
245 if (__kernel_text_address(addr)) {
246 if ((unsigned long) stack == bp + 8) {
247 ops->address(data, addr, 1);
248 frame = frame->next_frame;
249 bp = (unsigned long) frame;
250 } else {
251 ops->address(data, addr, bp == 0);
252 }
253 }
254 stack++;
255 }
256 return bp;
215} 257}
216 258
217void dump_trace(struct task_struct *tsk, struct pt_regs *regs, 259void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
218 unsigned long *stack, 260 unsigned long *stack, unsigned long bp,
219 const struct stacktrace_ops *ops, void *data) 261 const struct stacktrace_ops *ops, void *data)
220{ 262{
221 const unsigned cpu = get_cpu(); 263 const unsigned cpu = get_cpu();
@@ -225,36 +267,28 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
225 267
226 if (!tsk) 268 if (!tsk)
227 tsk = current; 269 tsk = current;
270 tinfo = task_thread_info(tsk);
228 271
229 if (!stack) { 272 if (!stack) {
230 unsigned long dummy; 273 unsigned long dummy;
231 stack = &dummy; 274 stack = &dummy;
232 if (tsk && tsk != current) 275 if (tsk && tsk != current)
233 stack = (unsigned long *)tsk->thread.rsp; 276 stack = (unsigned long *)tsk->thread.sp;
234 } 277 }
235 278
236 /* 279#ifdef CONFIG_FRAME_POINTER
237 * Print function call entries within a stack. 'cond' is the 280 if (!bp) {
238 * "end of stackframe" condition, that the 'stack++' 281 if (tsk == current) {
239 * iteration will eventually trigger. 282 /* Grab bp right from our regs */
240 */ 283 asm("movq %%rbp, %0" : "=r" (bp):);
241#define HANDLE_STACK(cond) \ 284 } else {
242 do while (cond) { \ 285 /* bp is the last reg pushed by switch_to */
243 unsigned long addr = *stack++; \ 286 bp = *(unsigned long *) tsk->thread.sp;
244 /* Use unlocked access here because except for NMIs \ 287 }
245 we should be already protected against module unloads */ \ 288 }
246 if (__kernel_text_address(addr)) { \ 289#endif
247 /* \ 290
248 * If the address is either in the text segment of the \ 291
249 * kernel, or in the region which contains vmalloc'ed \
250 * memory, it *may* be the address of a calling \
251 * routine; if so, print it so that someone tracing \
252 * down the cause of the crash will be able to figure \
253 * out the call path that was taken. \
254 */ \
255 ops->address(data, addr); \
256 } \
257 } while (0)
258 292
259 /* 293 /*
260 * Print function call entries in all stacks, starting at the 294 * Print function call entries in all stacks, starting at the
@@ -270,7 +304,9 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
270 if (estack_end) { 304 if (estack_end) {
271 if (ops->stack(data, id) < 0) 305 if (ops->stack(data, id) < 0)
272 break; 306 break;
273 HANDLE_STACK (stack < estack_end); 307
308 bp = print_context_stack(tinfo, stack, bp, ops,
309 data, estack_end);
274 ops->stack(data, "<EOE>"); 310 ops->stack(data, "<EOE>");
275 /* 311 /*
276 * We link to the next stack via the 312 * We link to the next stack via the
@@ -288,7 +324,8 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
288 if (stack >= irqstack && stack < irqstack_end) { 324 if (stack >= irqstack && stack < irqstack_end) {
289 if (ops->stack(data, "IRQ") < 0) 325 if (ops->stack(data, "IRQ") < 0)
290 break; 326 break;
291 HANDLE_STACK (stack < irqstack_end); 327 bp = print_context_stack(tinfo, stack, bp,
328 ops, data, irqstack_end);
292 /* 329 /*
293 * We link to the next stack (which would be 330 * We link to the next stack (which would be
294 * the process stack normally) the last 331 * the process stack normally) the last
@@ -306,9 +343,7 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
306 /* 343 /*
307 * This handles the process stack: 344 * This handles the process stack:
308 */ 345 */
309 tinfo = task_thread_info(tsk); 346 bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
310 HANDLE_STACK (valid_stack_ptr(tinfo, stack));
311#undef HANDLE_STACK
312 put_cpu(); 347 put_cpu();
313} 348}
314EXPORT_SYMBOL(dump_trace); 349EXPORT_SYMBOL(dump_trace);
@@ -331,10 +366,10 @@ static int print_trace_stack(void *data, char *name)
331 return 0; 366 return 0;
332} 367}
333 368
334static void print_trace_address(void *data, unsigned long addr) 369static void print_trace_address(void *data, unsigned long addr, int reliable)
335{ 370{
336 touch_nmi_watchdog(); 371 touch_nmi_watchdog();
337 printk_address(addr); 372 printk_address(addr, reliable);
338} 373}
339 374
340static const struct stacktrace_ops print_trace_ops = { 375static const struct stacktrace_ops print_trace_ops = {
@@ -345,15 +380,17 @@ static const struct stacktrace_ops print_trace_ops = {
345}; 380};
346 381
347void 382void
348show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack) 383show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack,
384 unsigned long bp)
349{ 385{
350 printk("\nCall Trace:\n"); 386 printk("\nCall Trace:\n");
351 dump_trace(tsk, regs, stack, &print_trace_ops, NULL); 387 dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL);
352 printk("\n"); 388 printk("\n");
353} 389}
354 390
355static void 391static void
356_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp) 392_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
393 unsigned long bp)
357{ 394{
358 unsigned long *stack; 395 unsigned long *stack;
359 int i; 396 int i;
@@ -364,14 +401,14 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp)
364 // debugging aid: "show_stack(NULL, NULL);" prints the 401 // debugging aid: "show_stack(NULL, NULL);" prints the
365 // back trace for this cpu. 402 // back trace for this cpu.
366 403
367 if (rsp == NULL) { 404 if (sp == NULL) {
368 if (tsk) 405 if (tsk)
369 rsp = (unsigned long *)tsk->thread.rsp; 406 sp = (unsigned long *)tsk->thread.sp;
370 else 407 else
371 rsp = (unsigned long *)&rsp; 408 sp = (unsigned long *)&sp;
372 } 409 }
373 410
374 stack = rsp; 411 stack = sp;
375 for(i=0; i < kstack_depth_to_print; i++) { 412 for(i=0; i < kstack_depth_to_print; i++) {
376 if (stack >= irqstack && stack <= irqstack_end) { 413 if (stack >= irqstack && stack <= irqstack_end) {
377 if (stack == irqstack_end) { 414 if (stack == irqstack_end) {
@@ -387,12 +424,12 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp)
387 printk(" %016lx", *stack++); 424 printk(" %016lx", *stack++);
388 touch_nmi_watchdog(); 425 touch_nmi_watchdog();
389 } 426 }
390 show_trace(tsk, regs, rsp); 427 show_trace(tsk, regs, sp, bp);
391} 428}
392 429
393void show_stack(struct task_struct *tsk, unsigned long * rsp) 430void show_stack(struct task_struct *tsk, unsigned long * sp)
394{ 431{
395 _show_stack(tsk, NULL, rsp); 432 _show_stack(tsk, NULL, sp, 0);
396} 433}
397 434
398/* 435/*
@@ -401,13 +438,19 @@ void show_stack(struct task_struct *tsk, unsigned long * rsp)
401void dump_stack(void) 438void dump_stack(void)
402{ 439{
403 unsigned long dummy; 440 unsigned long dummy;
441 unsigned long bp = 0;
442
443#ifdef CONFIG_FRAME_POINTER
444 if (!bp)
445 asm("movq %%rbp, %0" : "=r" (bp):);
446#endif
404 447
405 printk("Pid: %d, comm: %.20s %s %s %.*s\n", 448 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
406 current->pid, current->comm, print_tainted(), 449 current->pid, current->comm, print_tainted(),
407 init_utsname()->release, 450 init_utsname()->release,
408 (int)strcspn(init_utsname()->version, " "), 451 (int)strcspn(init_utsname()->version, " "),
409 init_utsname()->version); 452 init_utsname()->version);
410 show_trace(NULL, NULL, &dummy); 453 show_trace(NULL, NULL, &dummy, bp);
411} 454}
412 455
413EXPORT_SYMBOL(dump_stack); 456EXPORT_SYMBOL(dump_stack);
@@ -415,12 +458,15 @@ EXPORT_SYMBOL(dump_stack);
415void show_registers(struct pt_regs *regs) 458void show_registers(struct pt_regs *regs)
416{ 459{
417 int i; 460 int i;
418 int in_kernel = !user_mode(regs); 461 unsigned long sp;
419 unsigned long rsp;
420 const int cpu = smp_processor_id(); 462 const int cpu = smp_processor_id();
421 struct task_struct *cur = cpu_pda(cpu)->pcurrent; 463 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
464 u8 *ip;
465 unsigned int code_prologue = code_bytes * 43 / 64;
466 unsigned int code_len = code_bytes;
422 467
423 rsp = regs->rsp; 468 sp = regs->sp;
469 ip = (u8 *) regs->ip - code_prologue;
424 printk("CPU %d ", cpu); 470 printk("CPU %d ", cpu);
425 __show_regs(regs); 471 __show_regs(regs);
426 printk("Process %s (pid: %d, threadinfo %p, task %p)\n", 472 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
@@ -430,45 +476,43 @@ void show_registers(struct pt_regs *regs)
430 * When in-kernel, we also print out the stack and code at the 476 * When in-kernel, we also print out the stack and code at the
431 * time of the fault.. 477 * time of the fault..
432 */ 478 */
433 if (in_kernel) { 479 if (!user_mode(regs)) {
480 unsigned char c;
434 printk("Stack: "); 481 printk("Stack: ");
435 _show_stack(NULL, regs, (unsigned long*)rsp); 482 _show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
436 483 printk("\n");
437 printk("\nCode: "); 484
438 if (regs->rip < PAGE_OFFSET) 485 printk(KERN_EMERG "Code: ");
439 goto bad; 486 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
440 487 /* try starting at RIP */
441 for (i=0; i<20; i++) { 488 ip = (u8 *) regs->ip;
442 unsigned char c; 489 code_len = code_len - code_prologue + 1;
443 if (__get_user(c, &((unsigned char*)regs->rip)[i])) { 490 }
444bad: 491 for (i = 0; i < code_len; i++, ip++) {
492 if (ip < (u8 *)PAGE_OFFSET ||
493 probe_kernel_address(ip, c)) {
445 printk(" Bad RIP value."); 494 printk(" Bad RIP value.");
446 break; 495 break;
447 } 496 }
448 printk("%02x ", c); 497 if (ip == (u8 *)regs->ip)
498 printk("<%02x> ", c);
499 else
500 printk("%02x ", c);
449 } 501 }
450 } 502 }
451 printk("\n"); 503 printk("\n");
452} 504}
453 505
454int is_valid_bugaddr(unsigned long rip) 506int is_valid_bugaddr(unsigned long ip)
455{ 507{
456 unsigned short ud2; 508 unsigned short ud2;
457 509
458 if (__copy_from_user(&ud2, (const void __user *) rip, sizeof(ud2))) 510 if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2)))
459 return 0; 511 return 0;
460 512
461 return ud2 == 0x0b0f; 513 return ud2 == 0x0b0f;
462} 514}
463 515
464#ifdef CONFIG_BUG
465void out_of_line_bug(void)
466{
467 BUG();
468}
469EXPORT_SYMBOL(out_of_line_bug);
470#endif
471
472static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; 516static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
473static int die_owner = -1; 517static int die_owner = -1;
474static unsigned int die_nest_count; 518static unsigned int die_nest_count;
@@ -496,7 +540,7 @@ unsigned __kprobes long oops_begin(void)
496 return flags; 540 return flags;
497} 541}
498 542
499void __kprobes oops_end(unsigned long flags) 543void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
500{ 544{
501 die_owner = -1; 545 die_owner = -1;
502 bust_spinlocks(0); 546 bust_spinlocks(0);
@@ -505,12 +549,17 @@ void __kprobes oops_end(unsigned long flags)
505 /* Nest count reaches zero, release the lock. */ 549 /* Nest count reaches zero, release the lock. */
506 __raw_spin_unlock(&die_lock); 550 __raw_spin_unlock(&die_lock);
507 raw_local_irq_restore(flags); 551 raw_local_irq_restore(flags);
552 if (!regs) {
553 oops_exit();
554 return;
555 }
508 if (panic_on_oops) 556 if (panic_on_oops)
509 panic("Fatal exception"); 557 panic("Fatal exception");
510 oops_exit(); 558 oops_exit();
559 do_exit(signr);
511} 560}
512 561
513void __kprobes __die(const char * str, struct pt_regs * regs, long err) 562int __kprobes __die(const char * str, struct pt_regs * regs, long err)
514{ 563{
515 static int die_counter; 564 static int die_counter;
516 printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter); 565 printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
@@ -524,15 +573,17 @@ void __kprobes __die(const char * str, struct pt_regs * regs, long err)
524 printk("DEBUG_PAGEALLOC"); 573 printk("DEBUG_PAGEALLOC");
525#endif 574#endif
526 printk("\n"); 575 printk("\n");
527 notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV); 576 if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
577 return 1;
528 show_registers(regs); 578 show_registers(regs);
529 add_taint(TAINT_DIE); 579 add_taint(TAINT_DIE);
530 /* Executive summary in case the oops scrolled away */ 580 /* Executive summary in case the oops scrolled away */
531 printk(KERN_ALERT "RIP "); 581 printk(KERN_ALERT "RIP ");
532 printk_address(regs->rip); 582 printk_address(regs->ip, 1);
533 printk(" RSP <%016lx>\n", regs->rsp); 583 printk(" RSP <%016lx>\n", regs->sp);
534 if (kexec_should_crash(current)) 584 if (kexec_should_crash(current))
535 crash_kexec(regs); 585 crash_kexec(regs);
586 return 0;
536} 587}
537 588
538void die(const char * str, struct pt_regs * regs, long err) 589void die(const char * str, struct pt_regs * regs, long err)
@@ -540,11 +591,11 @@ void die(const char * str, struct pt_regs * regs, long err)
540 unsigned long flags = oops_begin(); 591 unsigned long flags = oops_begin();
541 592
542 if (!user_mode(regs)) 593 if (!user_mode(regs))
543 report_bug(regs->rip, regs); 594 report_bug(regs->ip, regs);
544 595
545 __die(str, regs, err); 596 if (__die(str, regs, err))
546 oops_end(flags); 597 regs = NULL;
547 do_exit(SIGSEGV); 598 oops_end(flags, regs, SIGSEGV);
548} 599}
549 600
550void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) 601void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
@@ -561,10 +612,10 @@ void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
561 crash_kexec(regs); 612 crash_kexec(regs);
562 if (do_panic || panic_on_oops) 613 if (do_panic || panic_on_oops)
563 panic("Non maskable interrupt"); 614 panic("Non maskable interrupt");
564 oops_end(flags); 615 oops_end(flags, NULL, SIGBUS);
565 nmi_exit(); 616 nmi_exit();
566 local_irq_enable(); 617 local_irq_enable();
567 do_exit(SIGSEGV); 618 do_exit(SIGBUS);
568} 619}
569 620
570static void __kprobes do_trap(int trapnr, int signr, char *str, 621static void __kprobes do_trap(int trapnr, int signr, char *str,
@@ -588,11 +639,14 @@ static void __kprobes do_trap(int trapnr, int signr, char *str,
588 tsk->thread.trap_no = trapnr; 639 tsk->thread.trap_no = trapnr;
589 640
590 if (show_unhandled_signals && unhandled_signal(tsk, signr) && 641 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
591 printk_ratelimit()) 642 printk_ratelimit()) {
592 printk(KERN_INFO 643 printk(KERN_INFO
593 "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n", 644 "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
594 tsk->comm, tsk->pid, str, 645 tsk->comm, tsk->pid, str,
595 regs->rip, regs->rsp, error_code); 646 regs->ip, regs->sp, error_code);
647 print_vma_addr(" in ", regs->ip);
648 printk("\n");
649 }
596 650
597 if (info) 651 if (info)
598 force_sig_info(signr, info, tsk); 652 force_sig_info(signr, info, tsk);
@@ -602,19 +656,12 @@ static void __kprobes do_trap(int trapnr, int signr, char *str,
602 } 656 }
603 657
604 658
605 /* kernel trap */ 659 if (!fixup_exception(regs)) {
606 { 660 tsk->thread.error_code = error_code;
607 const struct exception_table_entry *fixup; 661 tsk->thread.trap_no = trapnr;
608 fixup = search_exception_tables(regs->rip); 662 die(str, regs, error_code);
609 if (fixup)
610 regs->rip = fixup->fixup;
611 else {
612 tsk->thread.error_code = error_code;
613 tsk->thread.trap_no = trapnr;
614 die(str, regs, error_code);
615 }
616 return;
617 } 663 }
664 return;
618} 665}
619 666
620#define DO_ERROR(trapnr, signr, str, name) \ 667#define DO_ERROR(trapnr, signr, str, name) \
@@ -643,10 +690,10 @@ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
643 do_trap(trapnr, signr, str, regs, error_code, &info); \ 690 do_trap(trapnr, signr, str, regs, error_code, &info); \
644} 691}
645 692
646DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip) 693DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
647DO_ERROR( 4, SIGSEGV, "overflow", overflow) 694DO_ERROR( 4, SIGSEGV, "overflow", overflow)
648DO_ERROR( 5, SIGSEGV, "bounds", bounds) 695DO_ERROR( 5, SIGSEGV, "bounds", bounds)
649DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip) 696DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
650DO_ERROR( 7, SIGSEGV, "device not available", device_not_available) 697DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
651DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) 698DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
652DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) 699DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
@@ -694,32 +741,28 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
694 tsk->thread.trap_no = 13; 741 tsk->thread.trap_no = 13;
695 742
696 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && 743 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
697 printk_ratelimit()) 744 printk_ratelimit()) {
698 printk(KERN_INFO 745 printk(KERN_INFO
699 "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n", 746 "%s[%d] general protection ip:%lx sp:%lx error:%lx",
700 tsk->comm, tsk->pid, 747 tsk->comm, tsk->pid,
701 regs->rip, regs->rsp, error_code); 748 regs->ip, regs->sp, error_code);
749 print_vma_addr(" in ", regs->ip);
750 printk("\n");
751 }
702 752
703 force_sig(SIGSEGV, tsk); 753 force_sig(SIGSEGV, tsk);
704 return; 754 return;
705 } 755 }
706 756
707 /* kernel gp */ 757 if (fixup_exception(regs))
708 { 758 return;
709 const struct exception_table_entry *fixup;
710 fixup = search_exception_tables(regs->rip);
711 if (fixup) {
712 regs->rip = fixup->fixup;
713 return;
714 }
715 759
716 tsk->thread.error_code = error_code; 760 tsk->thread.error_code = error_code;
717 tsk->thread.trap_no = 13; 761 tsk->thread.trap_no = 13;
718 if (notify_die(DIE_GPF, "general protection fault", regs, 762 if (notify_die(DIE_GPF, "general protection fault", regs,
719 error_code, 13, SIGSEGV) == NOTIFY_STOP) 763 error_code, 13, SIGSEGV) == NOTIFY_STOP)
720 return; 764 return;
721 die("general protection fault", regs, error_code); 765 die("general protection fault", regs, error_code);
722 }
723} 766}
724 767
725static __kprobes void 768static __kprobes void
@@ -832,15 +875,15 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
832{ 875{
833 struct pt_regs *regs = eregs; 876 struct pt_regs *regs = eregs;
834 /* Did already sync */ 877 /* Did already sync */
835 if (eregs == (struct pt_regs *)eregs->rsp) 878 if (eregs == (struct pt_regs *)eregs->sp)
836 ; 879 ;
837 /* Exception from user space */ 880 /* Exception from user space */
838 else if (user_mode(eregs)) 881 else if (user_mode(eregs))
839 regs = task_pt_regs(current); 882 regs = task_pt_regs(current);
840 /* Exception from kernel and interrupts are enabled. Move to 883 /* Exception from kernel and interrupts are enabled. Move to
841 kernel process stack. */ 884 kernel process stack. */
842 else if (eregs->eflags & X86_EFLAGS_IF) 885 else if (eregs->flags & X86_EFLAGS_IF)
843 regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs)); 886 regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
844 if (eregs != regs) 887 if (eregs != regs)
845 *regs = *eregs; 888 *regs = *eregs;
846 return regs; 889 return regs;
@@ -858,6 +901,12 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs,
858 901
859 get_debugreg(condition, 6); 902 get_debugreg(condition, 6);
860 903
904 /*
905 * The processor cleared BTF, so don't mark that we need it set.
906 */
907 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
908 tsk->thread.debugctlmsr = 0;
909
861 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, 910 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
862 SIGTRAP) == NOTIFY_STOP) 911 SIGTRAP) == NOTIFY_STOP)
863 return; 912 return;
@@ -873,27 +922,14 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs,
873 922
874 tsk->thread.debugreg6 = condition; 923 tsk->thread.debugreg6 = condition;
875 924
876 /* Mask out spurious TF errors due to lazy TF clearing */ 925
926 /*
927 * Single-stepping through TF: make sure we ignore any events in
928 * kernel space (but re-enable TF when returning to user mode).
929 */
877 if (condition & DR_STEP) { 930 if (condition & DR_STEP) {
878 /*
879 * The TF error should be masked out only if the current
880 * process is not traced and if the TRAP flag has been set
881 * previously by a tracing process (condition detected by
882 * the PT_DTRACE flag); remember that the i386 TRAP flag
883 * can be modified by the process itself in user mode,
884 * allowing programs to debug themselves without the ptrace()
885 * interface.
886 */
887 if (!user_mode(regs)) 931 if (!user_mode(regs))
888 goto clear_TF_reenable; 932 goto clear_TF_reenable;
889 /*
890 * Was the TF flag set by a debugger? If so, clear it now,
891 * so that register information is correct.
892 */
893 if (tsk->ptrace & PT_DTRACE) {
894 regs->eflags &= ~TF_MASK;
895 tsk->ptrace &= ~PT_DTRACE;
896 }
897 } 933 }
898 934
899 /* Ok, finally something we can handle */ 935 /* Ok, finally something we can handle */
@@ -902,7 +938,7 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs,
902 info.si_signo = SIGTRAP; 938 info.si_signo = SIGTRAP;
903 info.si_errno = 0; 939 info.si_errno = 0;
904 info.si_code = TRAP_BRKPT; 940 info.si_code = TRAP_BRKPT;
905 info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL; 941 info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
906 force_sig_info(SIGTRAP, &info, tsk); 942 force_sig_info(SIGTRAP, &info, tsk);
907 943
908clear_dr7: 944clear_dr7:
@@ -912,18 +948,15 @@ clear_dr7:
912 948
913clear_TF_reenable: 949clear_TF_reenable:
914 set_tsk_thread_flag(tsk, TIF_SINGLESTEP); 950 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
915 regs->eflags &= ~TF_MASK; 951 regs->flags &= ~X86_EFLAGS_TF;
916 preempt_conditional_cli(regs); 952 preempt_conditional_cli(regs);
917} 953}
918 954
919static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) 955static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
920{ 956{
921 const struct exception_table_entry *fixup; 957 if (fixup_exception(regs))
922 fixup = search_exception_tables(regs->rip);
923 if (fixup) {
924 regs->rip = fixup->fixup;
925 return 1; 958 return 1;
926 } 959
927 notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); 960 notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
928 /* Illegal floating point operation in the kernel */ 961 /* Illegal floating point operation in the kernel */
929 current->thread.trap_no = trapnr; 962 current->thread.trap_no = trapnr;
@@ -938,7 +971,7 @@ static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
938 */ 971 */
939asmlinkage void do_coprocessor_error(struct pt_regs *regs) 972asmlinkage void do_coprocessor_error(struct pt_regs *regs)
940{ 973{
941 void __user *rip = (void __user *)(regs->rip); 974 void __user *ip = (void __user *)(regs->ip);
942 struct task_struct * task; 975 struct task_struct * task;
943 siginfo_t info; 976 siginfo_t info;
944 unsigned short cwd, swd; 977 unsigned short cwd, swd;
@@ -958,7 +991,7 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs)
958 info.si_signo = SIGFPE; 991 info.si_signo = SIGFPE;
959 info.si_errno = 0; 992 info.si_errno = 0;
960 info.si_code = __SI_FAULT; 993 info.si_code = __SI_FAULT;
961 info.si_addr = rip; 994 info.si_addr = ip;
962 /* 995 /*
963 * (~cwd & swd) will mask out exceptions that are not set to unmasked 996 * (~cwd & swd) will mask out exceptions that are not set to unmasked
964 * status. 0x3f is the exception bits in these regs, 0x200 is the 997 * status. 0x3f is the exception bits in these regs, 0x200 is the
@@ -1007,7 +1040,7 @@ asmlinkage void bad_intr(void)
1007 1040
1008asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) 1041asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
1009{ 1042{
1010 void __user *rip = (void __user *)(regs->rip); 1043 void __user *ip = (void __user *)(regs->ip);
1011 struct task_struct * task; 1044 struct task_struct * task;
1012 siginfo_t info; 1045 siginfo_t info;
1013 unsigned short mxcsr; 1046 unsigned short mxcsr;
@@ -1027,7 +1060,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
1027 info.si_signo = SIGFPE; 1060 info.si_signo = SIGFPE;
1028 info.si_errno = 0; 1061 info.si_errno = 0;
1029 info.si_code = __SI_FAULT; 1062 info.si_code = __SI_FAULT;
1030 info.si_addr = rip; 1063 info.si_addr = ip;
1031 /* 1064 /*
1032 * The SIMD FPU exceptions are handled a little differently, as there 1065 * The SIMD FPU exceptions are handled a little differently, as there
1033 * is only a single status/control register. Thus, to determine which 1066 * is only a single status/control register. Thus, to determine which
@@ -1089,6 +1122,7 @@ asmlinkage void math_state_restore(void)
1089 task_thread_info(me)->status |= TS_USEDFPU; 1122 task_thread_info(me)->status |= TS_USEDFPU;
1090 me->fpu_counter++; 1123 me->fpu_counter++;
1091} 1124}
1125EXPORT_SYMBOL_GPL(math_state_restore);
1092 1126
1093void __init trap_init(void) 1127void __init trap_init(void)
1094{ 1128{
@@ -1144,3 +1178,14 @@ static int __init kstack_setup(char *s)
1144 return 0; 1178 return 0;
1145} 1179}
1146early_param("kstack", kstack_setup); 1180early_param("kstack", kstack_setup);
1181
1182
1183static int __init code_bytes_setup(char *s)
1184{
1185 code_bytes = simple_strtoul(s, NULL, 0);
1186 if (code_bytes > 8192)
1187 code_bytes = 8192;
1188
1189 return 1;
1190}
1191__setup("code_bytes=", code_bytes_setup);
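The hunks above replace the open-coded exception-table walks in do_trap(), do_general_protection() and kernel_math_error() with a single fixup_exception() call, and make __die() report whether a notifier already handled the oops. A minimal kernel-context sketch of what such a fixup helper looks like, reconstructed from the removed lines (the real helper lives in the fault-handling code and may differ in detail):

int fixup_exception(struct pt_regs *regs)
{
	const struct exception_table_entry *fixup;

	/* look up the faulting instruction in the exception table */
	fixup = search_exception_tables(regs->ip);
	if (fixup) {
		/* resume at the recorded fixup address instead of dying */
		regs->ip = fixup->fixup;
		return 1;
	}
	return 0;
}

Callers then collapse to "if (!fixup_exception(regs)) die(...)", which is exactly the shape the new do_trap() body takes.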
diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c
index 9ebc0dab66b4..43517e324be8 100644
--- a/arch/x86/kernel/tsc_32.c
+++ b/arch/x86/kernel/tsc_32.c
@@ -5,6 +5,7 @@
5#include <linux/jiffies.h> 5#include <linux/jiffies.h>
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/dmi.h> 7#include <linux/dmi.h>
8#include <linux/percpu.h>
8 9
9#include <asm/delay.h> 10#include <asm/delay.h>
10#include <asm/tsc.h> 11#include <asm/tsc.h>
@@ -23,8 +24,6 @@ static int tsc_enabled;
23unsigned int tsc_khz; 24unsigned int tsc_khz;
24EXPORT_SYMBOL_GPL(tsc_khz); 25EXPORT_SYMBOL_GPL(tsc_khz);
25 26
26int tsc_disable;
27
28#ifdef CONFIG_X86_TSC 27#ifdef CONFIG_X86_TSC
29static int __init tsc_setup(char *str) 28static int __init tsc_setup(char *str)
30{ 29{
@@ -39,8 +38,7 @@ static int __init tsc_setup(char *str)
39 */ 38 */
40static int __init tsc_setup(char *str) 39static int __init tsc_setup(char *str)
41{ 40{
42 tsc_disable = 1; 41 setup_clear_cpu_cap(X86_FEATURE_TSC);
43
44 return 1; 42 return 1;
45} 43}
46#endif 44#endif
@@ -80,13 +78,31 @@ EXPORT_SYMBOL_GPL(check_tsc_unstable);
80 * 78 *
81 * -johnstul@us.ibm.com "math is hard, lets go shopping!" 79 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
82 */ 80 */
83unsigned long cyc2ns_scale __read_mostly;
84 81
85#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ 82DEFINE_PER_CPU(unsigned long, cyc2ns);
86 83
87static inline void set_cyc2ns_scale(unsigned long cpu_khz) 84static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
88{ 85{
89 cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; 86 unsigned long flags, prev_scale, *scale;
87 unsigned long long tsc_now, ns_now;
88
89 local_irq_save(flags);
90 sched_clock_idle_sleep_event();
91
92 scale = &per_cpu(cyc2ns, cpu);
93
94 rdtscll(tsc_now);
95 ns_now = __cycles_2_ns(tsc_now);
96
97 prev_scale = *scale;
98 if (cpu_khz)
99 *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
100
101 /*
102 * Start smoothly with the new frequency:
103 */
104 sched_clock_idle_wakeup_event(0);
105 local_irq_restore(flags);
90} 106}
91 107
92/* 108/*
@@ -239,7 +255,9 @@ time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data)
239 ref_freq, freq->new); 255 ref_freq, freq->new);
240 if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { 256 if (!(freq->flags & CPUFREQ_CONST_LOOPS)) {
241 tsc_khz = cpu_khz; 257 tsc_khz = cpu_khz;
242 set_cyc2ns_scale(cpu_khz); 258 preempt_disable();
259 set_cyc2ns_scale(cpu_khz, smp_processor_id());
260 preempt_enable();
243 /* 261 /*
244 * TSC based sched_clock turns 262 * TSC based sched_clock turns
245 * to junk w/ cpufreq 263 * to junk w/ cpufreq
@@ -333,6 +351,11 @@ __cpuinit int unsynchronized_tsc(void)
333{ 351{
334 if (!cpu_has_tsc || tsc_unstable) 352 if (!cpu_has_tsc || tsc_unstable)
335 return 1; 353 return 1;
354
355 /* Anything with constant TSC should be synchronized */
356 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
357 return 0;
358
336 /* 359 /*
337 * Intel systems are normally all synchronized. 360 * Intel systems are normally all synchronized.
338 * Exceptions must mark TSC as unstable: 361 * Exceptions must mark TSC as unstable:
@@ -367,7 +390,9 @@ static inline void check_geode_tsc_reliable(void) { }
367 390
368void __init tsc_init(void) 391void __init tsc_init(void)
369{ 392{
370 if (!cpu_has_tsc || tsc_disable) 393 int cpu;
394
395 if (!cpu_has_tsc)
371 goto out_no_tsc; 396 goto out_no_tsc;
372 397
373 cpu_khz = calculate_cpu_khz(); 398 cpu_khz = calculate_cpu_khz();
@@ -380,7 +405,15 @@ void __init tsc_init(void)
380 (unsigned long)cpu_khz / 1000, 405 (unsigned long)cpu_khz / 1000,
381 (unsigned long)cpu_khz % 1000); 406 (unsigned long)cpu_khz % 1000);
382 407
383 set_cyc2ns_scale(cpu_khz); 408 /*
409 * Secondary CPUs do not run through tsc_init(), so set up
410 * all the scale factors for all CPUs, assuming the same
411 * speed as the bootup CPU. (cpufreq notifiers will fix this
412 * up if their speed diverges)
413 */
414 for_each_possible_cpu(cpu)
415 set_cyc2ns_scale(cpu_khz, cpu);
416
384 use_tsc_delay(); 417 use_tsc_delay();
385 418
386 /* Check and install the TSC clocksource */ 419 /* Check and install the TSC clocksource */
@@ -403,10 +436,5 @@ void __init tsc_init(void)
403 return; 436 return;
404 437
405out_no_tsc: 438out_no_tsc:
406 /* 439 setup_clear_cpu_cap(X86_FEATURE_TSC);
407 * Set the tsc_disable flag if there's no TSC support, this
408 * makes it a fast flag for the kernel to see whether it
409 * should be using the TSC.
410 */
411 tsc_disable = 1;
412} 440}
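The per-CPU cyc2ns value introduced above feeds the cycles-to-nanoseconds conversion behind sched_clock(). A stand-alone illustration of the scaling math (helper names are ours, not the kernel's; CYC2NS_SCALE_FACTOR of 10 matches the "2^10, carefully chosen" comment in the removed code):

#include <stdio.h>

#define CYC2NS_SCALE_FACTOR 10	/* 2^10, as in the old comment */

/* scale = 10^6 * 2^10 / cpu_khz, so that ns = cycles * scale >> 10 */
static unsigned long cyc2ns_scale(unsigned long cpu_khz)
{
	return (1000000UL << CYC2NS_SCALE_FACTOR) / cpu_khz;
}

static unsigned long long cycles_2_ns(unsigned long long cyc, unsigned long scale)
{
	return (cyc * scale) >> CYC2NS_SCALE_FACTOR;
}

int main(void)
{
	unsigned long scale = cyc2ns_scale(2000000);	/* 2 GHz CPU -> scale 512 */

	/* 2e9 cycles at 2 GHz is one second: prints 1000000000 */
	printf("%llu ns\n", cycles_2_ns(2000000000ULL, scale));
	return 0;
}

Making the scale per-CPU is what lets the cpufreq notifier adjust only the CPU whose frequency changed, which is why set_cyc2ns_scale() now takes a cpu argument and its call site is wrapped in preempt_disable()/preempt_enable().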
diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c
index 9c70af45b42b..947554ddabb6 100644
--- a/arch/x86/kernel/tsc_64.c
+++ b/arch/x86/kernel/tsc_64.c
@@ -10,6 +10,7 @@
10 10
11#include <asm/hpet.h> 11#include <asm/hpet.h>
12#include <asm/timex.h> 12#include <asm/timex.h>
13#include <asm/timer.h>
13 14
14static int notsc __initdata = 0; 15static int notsc __initdata = 0;
15 16
@@ -18,19 +19,51 @@ EXPORT_SYMBOL(cpu_khz);
18unsigned int tsc_khz; 19unsigned int tsc_khz;
19EXPORT_SYMBOL(tsc_khz); 20EXPORT_SYMBOL(tsc_khz);
20 21
21static unsigned int cyc2ns_scale __read_mostly; 22/* Accelerators for sched_clock()
23 * convert from cycles(64bits) => nanoseconds (64bits)
24 * basic equation:
25 * ns = cycles / (freq / ns_per_sec)
26 * ns = cycles * (ns_per_sec / freq)
27 * ns = cycles * (10^9 / (cpu_khz * 10^3))
28 * ns = cycles * (10^6 / cpu_khz)
29 *
30 * Then we use scaling math (suggested by george@mvista.com) to get:
31 * ns = cycles * (10^6 * SC / cpu_khz) / SC
32 * ns = cycles * cyc2ns_scale / SC
33 *
34 * And since SC is a constant power of two, we can convert the div
35 * into a shift.
36 *
37 * We can use khz divisor instead of mhz to keep a better precision, since
38 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
39 * (mathieu.desnoyers@polymtl.ca)
40 *
41 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
42 */
43DEFINE_PER_CPU(unsigned long, cyc2ns);
22 44
23static inline void set_cyc2ns_scale(unsigned long khz) 45static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
24{ 46{
25 cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / khz; 47 unsigned long flags, prev_scale, *scale;
26} 48 unsigned long long tsc_now, ns_now;
27 49
28static unsigned long long cycles_2_ns(unsigned long long cyc) 50 local_irq_save(flags);
29{ 51 sched_clock_idle_sleep_event();
30 return (cyc * cyc2ns_scale) >> NS_SCALE; 52
53 scale = &per_cpu(cyc2ns, cpu);
54
55 rdtscll(tsc_now);
56 ns_now = __cycles_2_ns(tsc_now);
57
58 prev_scale = *scale;
59 if (cpu_khz)
60 *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
61
62 sched_clock_idle_wakeup_event(0);
63 local_irq_restore(flags);
31} 64}
32 65
33unsigned long long sched_clock(void) 66unsigned long long native_sched_clock(void)
34{ 67{
35 unsigned long a = 0; 68 unsigned long a = 0;
36 69
@@ -44,12 +77,27 @@ unsigned long long sched_clock(void)
44 return cycles_2_ns(a); 77 return cycles_2_ns(a);
45} 78}
46 79
80/* We need to define a real function for sched_clock, to override the
81 weak default version */
82#ifdef CONFIG_PARAVIRT
83unsigned long long sched_clock(void)
84{
85 return paravirt_sched_clock();
86}
87#else
88unsigned long long
89sched_clock(void) __attribute__((alias("native_sched_clock")));
90#endif
91
92
47static int tsc_unstable; 93static int tsc_unstable;
48 94
49inline int check_tsc_unstable(void) 95int check_tsc_unstable(void)
50{ 96{
51 return tsc_unstable; 97 return tsc_unstable;
52} 98}
99EXPORT_SYMBOL_GPL(check_tsc_unstable);
100
53#ifdef CONFIG_CPU_FREQ 101#ifdef CONFIG_CPU_FREQ
54 102
55/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency 103/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
@@ -100,7 +148,9 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
100 mark_tsc_unstable("cpufreq changes"); 148 mark_tsc_unstable("cpufreq changes");
101 } 149 }
102 150
103 set_cyc2ns_scale(tsc_khz_ref); 151 preempt_disable();
152 set_cyc2ns_scale(tsc_khz_ref, smp_processor_id());
153 preempt_enable();
104 154
105 return 0; 155 return 0;
106} 156}
@@ -133,12 +183,12 @@ static unsigned long __init tsc_read_refs(unsigned long *pm,
133 int i; 183 int i;
134 184
135 for (i = 0; i < MAX_RETRIES; i++) { 185 for (i = 0; i < MAX_RETRIES; i++) {
136 t1 = get_cycles_sync(); 186 t1 = get_cycles();
137 if (hpet) 187 if (hpet)
138 *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; 188 *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
139 else 189 else
140 *pm = acpi_pm_read_early(); 190 *pm = acpi_pm_read_early();
141 t2 = get_cycles_sync(); 191 t2 = get_cycles();
142 if ((t2 - t1) < SMI_TRESHOLD) 192 if ((t2 - t1) < SMI_TRESHOLD)
143 return t2; 193 return t2;
144 } 194 }
@@ -151,7 +201,7 @@ static unsigned long __init tsc_read_refs(unsigned long *pm,
151void __init tsc_calibrate(void) 201void __init tsc_calibrate(void)
152{ 202{
153 unsigned long flags, tsc1, tsc2, tr1, tr2, pm1, pm2, hpet1, hpet2; 203 unsigned long flags, tsc1, tsc2, tr1, tr2, pm1, pm2, hpet1, hpet2;
154 int hpet = is_hpet_enabled(); 204 int hpet = is_hpet_enabled(), cpu;
155 205
156 local_irq_save(flags); 206 local_irq_save(flags);
157 207
@@ -162,9 +212,9 @@ void __init tsc_calibrate(void)
162 outb(0xb0, 0x43); 212 outb(0xb0, 0x43);
163 outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); 213 outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
164 outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); 214 outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42);
165 tr1 = get_cycles_sync(); 215 tr1 = get_cycles();
166 while ((inb(0x61) & 0x20) == 0); 216 while ((inb(0x61) & 0x20) == 0);
167 tr2 = get_cycles_sync(); 217 tr2 = get_cycles();
168 218
169 tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL); 219 tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL);
170 220
@@ -206,7 +256,9 @@ void __init tsc_calibrate(void)
206 } 256 }
207 257
208 tsc_khz = tsc2 / tsc1; 258 tsc_khz = tsc2 / tsc1;
209 set_cyc2ns_scale(tsc_khz); 259
260 for_each_possible_cpu(cpu)
261 set_cyc2ns_scale(tsc_khz, cpu);
210} 262}
211 263
212/* 264/*
@@ -222,17 +274,9 @@ __cpuinit int unsynchronized_tsc(void)
222 if (apic_is_clustered_box()) 274 if (apic_is_clustered_box())
223 return 1; 275 return 1;
224#endif 276#endif
225 /* Most intel systems have synchronized TSCs except for 277
226 multi node systems */ 278 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
227 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
228#ifdef CONFIG_ACPI
229 /* But TSC doesn't tick in C3 so don't use it there */
230 if (acpi_gbl_FADT.header.length > 0 &&
231 acpi_gbl_FADT.C3latency < 1000)
232 return 1;
233#endif
234 return 0; 279 return 0;
235 }
236 280
237 /* Assume multi socket systems are not synchronized */ 281 /* Assume multi socket systems are not synchronized */
238 return num_present_cpus() > 1; 282 return num_present_cpus() > 1;
@@ -250,13 +294,13 @@ __setup("notsc", notsc_setup);
250/* clock source code: */ 294/* clock source code: */
251static cycle_t read_tsc(void) 295static cycle_t read_tsc(void)
252{ 296{
253 cycle_t ret = (cycle_t)get_cycles_sync(); 297 cycle_t ret = (cycle_t)get_cycles();
254 return ret; 298 return ret;
255} 299}
256 300
257static cycle_t __vsyscall_fn vread_tsc(void) 301static cycle_t __vsyscall_fn vread_tsc(void)
258{ 302{
259 cycle_t ret = (cycle_t)get_cycles_sync(); 303 cycle_t ret = (cycle_t)vget_cycles();
260 return ret; 304 return ret;
261} 305}
262 306
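The 64-bit file gains the same per-CPU cyc2ns machinery and splits the clock entry point: native_sched_clock() holds the TSC-based implementation, and sched_clock() either indirects through paravirt or is aliased straight to it. A compressed sketch of that override pattern (the body of native_sched_clock() is elided; paravirt_sched_clock() only exists inside the kernel):

unsigned long long native_sched_clock(void)
{
	/* rdtscll + per-CPU cycles_2_ns, as in the hunk above */
	return 0;
}

#ifdef CONFIG_PARAVIRT
/* the hypervisor may provide its own notion of scheduler time */
unsigned long long sched_clock(void)
{
	return paravirt_sched_clock();
}
#else
/* no paravirt: override the weak generic sched_clock() at zero cost */
unsigned long long sched_clock(void)
	__attribute__((alias("native_sched_clock")));
#endif

The alias avoids an extra call in the common case while still replacing the weak default definition mentioned in the comment above.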
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 9125efe66a06..0577825cf89b 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -46,7 +46,7 @@ static __cpuinit void check_tsc_warp(void)
46 cycles_t start, now, prev, end; 46 cycles_t start, now, prev, end;
47 int i; 47 int i;
48 48
49 start = get_cycles_sync(); 49 start = get_cycles();
50 /* 50 /*
51 * The measurement runs for 20 msecs: 51 * The measurement runs for 20 msecs:
52 */ 52 */
@@ -61,18 +61,18 @@ static __cpuinit void check_tsc_warp(void)
61 */ 61 */
62 __raw_spin_lock(&sync_lock); 62 __raw_spin_lock(&sync_lock);
63 prev = last_tsc; 63 prev = last_tsc;
64 now = get_cycles_sync(); 64 now = get_cycles();
65 last_tsc = now; 65 last_tsc = now;
66 __raw_spin_unlock(&sync_lock); 66 __raw_spin_unlock(&sync_lock);
67 67
68 /* 68 /*
69 * Be nice every now and then (and also check whether 69 * Be nice every now and then (and also check whether
70 * measurement is done [we also insert a 100 million 70 * measurement is done [we also insert a 10 million
71 * loops safety exit, so we dont lock up in case the 71 * loops safety exit, so we dont lock up in case the
72 * TSC readout is totally broken]): 72 * TSC readout is totally broken]):
73 */ 73 */
74 if (unlikely(!(i & 7))) { 74 if (unlikely(!(i & 7))) {
75 if (now > end || i > 100000000) 75 if (now > end || i > 10000000)
76 break; 76 break;
77 cpu_relax(); 77 cpu_relax();
78 touch_nmi_watchdog(); 78 touch_nmi_watchdog();
@@ -87,7 +87,11 @@ static __cpuinit void check_tsc_warp(void)
87 nr_warps++; 87 nr_warps++;
88 __raw_spin_unlock(&sync_lock); 88 __raw_spin_unlock(&sync_lock);
89 } 89 }
90 90 }
91 if (!(now-start)) {
92 printk("Warning: zero tsc calibration delta: %Ld [max: %Ld]\n",
93 now-start, end-start);
94 WARN_ON(1);
91 } 95 }
92} 96}
93 97
@@ -129,24 +133,24 @@ void __cpuinit check_tsc_sync_source(int cpu)
129 while (atomic_read(&stop_count) != cpus-1) 133 while (atomic_read(&stop_count) != cpus-1)
130 cpu_relax(); 134 cpu_relax();
131 135
132 /*
133 * Reset it - just in case we boot another CPU later:
134 */
135 atomic_set(&start_count, 0);
136
137 if (nr_warps) { 136 if (nr_warps) {
138 printk("\n"); 137 printk("\n");
139 printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs," 138 printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs,"
140 " turning off TSC clock.\n", max_warp); 139 " turning off TSC clock.\n", max_warp);
141 mark_tsc_unstable("check_tsc_sync_source failed"); 140 mark_tsc_unstable("check_tsc_sync_source failed");
142 nr_warps = 0;
143 max_warp = 0;
144 last_tsc = 0;
145 } else { 141 } else {
146 printk(" passed.\n"); 142 printk(" passed.\n");
147 } 143 }
148 144
149 /* 145 /*
146 * Reset it - just in case we boot another CPU later:
147 */
148 atomic_set(&start_count, 0);
149 nr_warps = 0;
150 max_warp = 0;
151 last_tsc = 0;
152
153 /*
150 * Let the target continue with the bootup: 154 * Let the target continue with the bootup:
151 */ 155 */
152 atomic_inc(&stop_count); 156 atomic_inc(&stop_count);
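The warp check touched above has each CPU read the TSC while holding a shared raw spinlock and records whether a later read ever falls behind an earlier one; the hunk also trims the safety exit to 10 million iterations, warns on a zero calibration delta, and resets the bookkeeping after reporting so a later CPU hotplug starts clean. A rough user-space analogue of the same idea, using pthreads instead of the kernel's raw spinlock and CPU-bringup hooks (x86-only, purely illustrative):

#include <pthread.h>
#include <stdio.h>
#include <x86intrin.h>

#define LOOPS 1000000

static pthread_mutex_t sync_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long long last_tsc;
static unsigned long long max_warp;

static void *check_warp(void *arg)
{
	(void)arg;
	for (int i = 0; i < LOOPS; i++) {
		pthread_mutex_lock(&sync_lock);
		unsigned long long prev = last_tsc;
		unsigned long long now = __rdtsc();
		last_tsc = now;
		if (prev > now && prev - now > max_warp)
			max_warp = prev - now;	/* TSC went backwards between threads */
		pthread_mutex_unlock(&sync_lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, check_warp, NULL);
	pthread_create(&b, NULL, check_warp, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	printf("max observed TSC warp: %llu cycles\n", max_warp);
	return 0;
}

Build with gcc -O2 -pthread; a nonzero result is the situation the kernel reacts to by marking the TSC unstable and falling back to another clocksource.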
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 157e4bedd3c5..738c2104df30 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -70,10 +70,10 @@
70/* 70/*
71 * 8- and 16-bit register defines.. 71 * 8- and 16-bit register defines..
72 */ 72 */
73#define AL(regs) (((unsigned char *)&((regs)->pt.eax))[0]) 73#define AL(regs) (((unsigned char *)&((regs)->pt.ax))[0])
74#define AH(regs) (((unsigned char *)&((regs)->pt.eax))[1]) 74#define AH(regs) (((unsigned char *)&((regs)->pt.ax))[1])
75#define IP(regs) (*(unsigned short *)&((regs)->pt.eip)) 75#define IP(regs) (*(unsigned short *)&((regs)->pt.ip))
76#define SP(regs) (*(unsigned short *)&((regs)->pt.esp)) 76#define SP(regs) (*(unsigned short *)&((regs)->pt.sp))
77 77
78/* 78/*
79 * virtual flags (16 and 32-bit versions) 79 * virtual flags (16 and 32-bit versions)
@@ -93,12 +93,12 @@ static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
93{ 93{
94 int ret = 0; 94 int ret = 0;
95 95
96 /* kernel_vm86_regs is missing xgs, so copy everything up to 96 /* kernel_vm86_regs is missing gs, so copy everything up to
97 (but not including) orig_eax, and then rest including orig_eax. */ 97 (but not including) orig_eax, and then rest including orig_eax. */
98 ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_eax)); 98 ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_ax));
99 ret += copy_to_user(&user->orig_eax, &regs->pt.orig_eax, 99 ret += copy_to_user(&user->orig_eax, &regs->pt.orig_ax,
100 sizeof(struct kernel_vm86_regs) - 100 sizeof(struct kernel_vm86_regs) -
101 offsetof(struct kernel_vm86_regs, pt.orig_eax)); 101 offsetof(struct kernel_vm86_regs, pt.orig_ax));
102 102
103 return ret; 103 return ret;
104} 104}
@@ -110,18 +110,17 @@ static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
110{ 110{
111 int ret = 0; 111 int ret = 0;
112 112
113 /* copy eax-xfs inclusive */ 113 /* copy ax-fs inclusive */
114 ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_eax)); 114 ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_ax));
115 /* copy orig_eax-__gsh+extra */ 115 /* copy orig_ax-__gsh+extra */
116 ret += copy_from_user(&regs->pt.orig_eax, &user->orig_eax, 116 ret += copy_from_user(&regs->pt.orig_ax, &user->orig_eax,
117 sizeof(struct kernel_vm86_regs) - 117 sizeof(struct kernel_vm86_regs) -
118 offsetof(struct kernel_vm86_regs, pt.orig_eax) + 118 offsetof(struct kernel_vm86_regs, pt.orig_ax) +
119 extra); 119 extra);
120 return ret; 120 return ret;
121} 121}
122 122
123struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs)); 123struct pt_regs * save_v86_state(struct kernel_vm86_regs * regs)
124struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
125{ 124{
126 struct tss_struct *tss; 125 struct tss_struct *tss;
127 struct pt_regs *ret; 126 struct pt_regs *ret;
@@ -138,7 +137,7 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
138 printk("no vm86_info: BAD\n"); 137 printk("no vm86_info: BAD\n");
139 do_exit(SIGSEGV); 138 do_exit(SIGSEGV);
140 } 139 }
141 set_flags(regs->pt.eflags, VEFLAGS, VIF_MASK | current->thread.v86mask); 140 set_flags(regs->pt.flags, VEFLAGS, VIF_MASK | current->thread.v86mask);
142 tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs,regs); 141 tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs,regs);
143 tmp += put_user(current->thread.screen_bitmap,&current->thread.vm86_info->screen_bitmap); 142 tmp += put_user(current->thread.screen_bitmap,&current->thread.vm86_info->screen_bitmap);
144 if (tmp) { 143 if (tmp) {
@@ -147,15 +146,15 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
147 } 146 }
148 147
149 tss = &per_cpu(init_tss, get_cpu()); 148 tss = &per_cpu(init_tss, get_cpu());
150 current->thread.esp0 = current->thread.saved_esp0; 149 current->thread.sp0 = current->thread.saved_sp0;
151 current->thread.sysenter_cs = __KERNEL_CS; 150 current->thread.sysenter_cs = __KERNEL_CS;
152 load_esp0(tss, &current->thread); 151 load_sp0(tss, &current->thread);
153 current->thread.saved_esp0 = 0; 152 current->thread.saved_sp0 = 0;
154 put_cpu(); 153 put_cpu();
155 154
156 ret = KVM86->regs32; 155 ret = KVM86->regs32;
157 156
158 ret->xfs = current->thread.saved_fs; 157 ret->fs = current->thread.saved_fs;
159 loadsegment(gs, current->thread.saved_gs); 158 loadsegment(gs, current->thread.saved_gs);
160 159
161 return ret; 160 return ret;
@@ -197,7 +196,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
197 196
198asmlinkage int sys_vm86old(struct pt_regs regs) 197asmlinkage int sys_vm86old(struct pt_regs regs)
199{ 198{
200 struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.ebx; 199 struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.bx;
201 struct kernel_vm86_struct info; /* declare this _on top_, 200 struct kernel_vm86_struct info; /* declare this _on top_,
202 * this avoids wasting of stack space. 201 * this avoids wasting of stack space.
203 * This remains on the stack until we 202 * This remains on the stack until we
@@ -207,7 +206,7 @@ asmlinkage int sys_vm86old(struct pt_regs regs)
207 int tmp, ret = -EPERM; 206 int tmp, ret = -EPERM;
208 207
209 tsk = current; 208 tsk = current;
210 if (tsk->thread.saved_esp0) 209 if (tsk->thread.saved_sp0)
211 goto out; 210 goto out;
212 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, 211 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
213 offsetof(struct kernel_vm86_struct, vm86plus) - 212 offsetof(struct kernel_vm86_struct, vm86plus) -
@@ -237,12 +236,12 @@ asmlinkage int sys_vm86(struct pt_regs regs)
237 struct vm86plus_struct __user *v86; 236 struct vm86plus_struct __user *v86;
238 237
239 tsk = current; 238 tsk = current;
240 switch (regs.ebx) { 239 switch (regs.bx) {
241 case VM86_REQUEST_IRQ: 240 case VM86_REQUEST_IRQ:
242 case VM86_FREE_IRQ: 241 case VM86_FREE_IRQ:
243 case VM86_GET_IRQ_BITS: 242 case VM86_GET_IRQ_BITS:
244 case VM86_GET_AND_RESET_IRQ: 243 case VM86_GET_AND_RESET_IRQ:
245 ret = do_vm86_irq_handling(regs.ebx, (int)regs.ecx); 244 ret = do_vm86_irq_handling(regs.bx, (int)regs.cx);
246 goto out; 245 goto out;
247 case VM86_PLUS_INSTALL_CHECK: 246 case VM86_PLUS_INSTALL_CHECK:
248 /* NOTE: on old vm86 stuff this will return the error 247 /* NOTE: on old vm86 stuff this will return the error
@@ -256,9 +255,9 @@ asmlinkage int sys_vm86(struct pt_regs regs)
256 255
257 /* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */ 256 /* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */
258 ret = -EPERM; 257 ret = -EPERM;
259 if (tsk->thread.saved_esp0) 258 if (tsk->thread.saved_sp0)
260 goto out; 259 goto out;
261 v86 = (struct vm86plus_struct __user *)regs.ecx; 260 v86 = (struct vm86plus_struct __user *)regs.cx;
262 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, 261 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
263 offsetof(struct kernel_vm86_struct, regs32) - 262 offsetof(struct kernel_vm86_struct, regs32) -
264 sizeof(info.regs)); 263 sizeof(info.regs));
@@ -281,23 +280,23 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
281/* 280/*
282 * make sure the vm86() system call doesn't try to do anything silly 281 * make sure the vm86() system call doesn't try to do anything silly
283 */ 282 */
284 info->regs.pt.xds = 0; 283 info->regs.pt.ds = 0;
285 info->regs.pt.xes = 0; 284 info->regs.pt.es = 0;
286 info->regs.pt.xfs = 0; 285 info->regs.pt.fs = 0;
287 286
288/* we are clearing gs later just before "jmp resume_userspace", 287/* we are clearing gs later just before "jmp resume_userspace",
289 * because it is not saved/restored. 288 * because it is not saved/restored.
290 */ 289 */
291 290
292/* 291/*
293 * The eflags register is also special: we cannot trust that the user 292 * The flags register is also special: we cannot trust that the user
294 * has set it up safely, so this makes sure interrupt etc flags are 293 * has set it up safely, so this makes sure interrupt etc flags are
295 * inherited from protected mode. 294 * inherited from protected mode.
296 */ 295 */
297 VEFLAGS = info->regs.pt.eflags; 296 VEFLAGS = info->regs.pt.flags;
298 info->regs.pt.eflags &= SAFE_MASK; 297 info->regs.pt.flags &= SAFE_MASK;
299 info->regs.pt.eflags |= info->regs32->eflags & ~SAFE_MASK; 298 info->regs.pt.flags |= info->regs32->flags & ~SAFE_MASK;
300 info->regs.pt.eflags |= VM_MASK; 299 info->regs.pt.flags |= VM_MASK;
301 300
302 switch (info->cpu_type) { 301 switch (info->cpu_type) {
303 case CPU_286: 302 case CPU_286:
@@ -315,18 +314,18 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
315 } 314 }
316 315
317/* 316/*
318 * Save old state, set default return value (%eax) to 0 317 * Save old state, set default return value (%ax) to 0
319 */ 318 */
320 info->regs32->eax = 0; 319 info->regs32->ax = 0;
321 tsk->thread.saved_esp0 = tsk->thread.esp0; 320 tsk->thread.saved_sp0 = tsk->thread.sp0;
322 tsk->thread.saved_fs = info->regs32->xfs; 321 tsk->thread.saved_fs = info->regs32->fs;
323 savesegment(gs, tsk->thread.saved_gs); 322 savesegment(gs, tsk->thread.saved_gs);
324 323
325 tss = &per_cpu(init_tss, get_cpu()); 324 tss = &per_cpu(init_tss, get_cpu());
326 tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; 325 tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
327 if (cpu_has_sep) 326 if (cpu_has_sep)
328 tsk->thread.sysenter_cs = 0; 327 tsk->thread.sysenter_cs = 0;
329 load_esp0(tss, &tsk->thread); 328 load_sp0(tss, &tsk->thread);
330 put_cpu(); 329 put_cpu();
331 330
332 tsk->thread.screen_bitmap = info->screen_bitmap; 331 tsk->thread.screen_bitmap = info->screen_bitmap;
@@ -352,7 +351,7 @@ static inline void return_to_32bit(struct kernel_vm86_regs * regs16, int retval)
352 struct pt_regs * regs32; 351 struct pt_regs * regs32;
353 352
354 regs32 = save_v86_state(regs16); 353 regs32 = save_v86_state(regs16);
355 regs32->eax = retval; 354 regs32->ax = retval;
356 __asm__ __volatile__("movl %0,%%esp\n\t" 355 __asm__ __volatile__("movl %0,%%esp\n\t"
357 "movl %1,%%ebp\n\t" 356 "movl %1,%%ebp\n\t"
358 "jmp resume_userspace" 357 "jmp resume_userspace"
@@ -373,30 +372,30 @@ static inline void clear_IF(struct kernel_vm86_regs * regs)
373 372
374static inline void clear_TF(struct kernel_vm86_regs * regs) 373static inline void clear_TF(struct kernel_vm86_regs * regs)
375{ 374{
376 regs->pt.eflags &= ~TF_MASK; 375 regs->pt.flags &= ~TF_MASK;
377} 376}
378 377
379static inline void clear_AC(struct kernel_vm86_regs * regs) 378static inline void clear_AC(struct kernel_vm86_regs * regs)
380{ 379{
381 regs->pt.eflags &= ~AC_MASK; 380 regs->pt.flags &= ~AC_MASK;
382} 381}
383 382
384/* It is correct to call set_IF(regs) from the set_vflags_* 383/* It is correct to call set_IF(regs) from the set_vflags_*
385 * functions. However someone forgot to call clear_IF(regs) 384 * functions. However someone forgot to call clear_IF(regs)
386 * in the opposite case. 385 * in the opposite case.
387 * After the command sequence CLI PUSHF STI POPF you should 386 * After the command sequence CLI PUSHF STI POPF you should
388 * end up with interrups disabled, but you ended up with 387 * end up with interrupts disabled, but you ended up with
389 * interrupts enabled. 388 * interrupts enabled.
390 * ( I was testing my own changes, but the only bug I 389 * ( I was testing my own changes, but the only bug I
391 * could find was in a function I had not changed. ) 390 * could find was in a function I had not changed. )
392 * [KD] 391 * [KD]
393 */ 392 */
394 393
395static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs) 394static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs * regs)
396{ 395{
397 set_flags(VEFLAGS, eflags, current->thread.v86mask); 396 set_flags(VEFLAGS, flags, current->thread.v86mask);
398 set_flags(regs->pt.eflags, eflags, SAFE_MASK); 397 set_flags(regs->pt.flags, flags, SAFE_MASK);
399 if (eflags & IF_MASK) 398 if (flags & IF_MASK)
400 set_IF(regs); 399 set_IF(regs);
401 else 400 else
402 clear_IF(regs); 401 clear_IF(regs);
@@ -405,7 +404,7 @@ static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs
405static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs) 404static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs)
406{ 405{
407 set_flags(VFLAGS, flags, current->thread.v86mask); 406 set_flags(VFLAGS, flags, current->thread.v86mask);
408 set_flags(regs->pt.eflags, flags, SAFE_MASK); 407 set_flags(regs->pt.flags, flags, SAFE_MASK);
409 if (flags & IF_MASK) 408 if (flags & IF_MASK)
410 set_IF(regs); 409 set_IF(regs);
411 else 410 else
@@ -414,7 +413,7 @@ static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_reg
414 413
415static inline unsigned long get_vflags(struct kernel_vm86_regs * regs) 414static inline unsigned long get_vflags(struct kernel_vm86_regs * regs)
416{ 415{
417 unsigned long flags = regs->pt.eflags & RETURN_MASK; 416 unsigned long flags = regs->pt.flags & RETURN_MASK;
418 417
419 if (VEFLAGS & VIF_MASK) 418 if (VEFLAGS & VIF_MASK)
420 flags |= IF_MASK; 419 flags |= IF_MASK;
@@ -518,7 +517,7 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
518 unsigned long __user *intr_ptr; 517 unsigned long __user *intr_ptr;
519 unsigned long segoffs; 518 unsigned long segoffs;
520 519
521 if (regs->pt.xcs == BIOSSEG) 520 if (regs->pt.cs == BIOSSEG)
522 goto cannot_handle; 521 goto cannot_handle;
523 if (is_revectored(i, &KVM86->int_revectored)) 522 if (is_revectored(i, &KVM86->int_revectored))
524 goto cannot_handle; 523 goto cannot_handle;
@@ -530,9 +529,9 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
530 if ((segoffs >> 16) == BIOSSEG) 529 if ((segoffs >> 16) == BIOSSEG)
531 goto cannot_handle; 530 goto cannot_handle;
532 pushw(ssp, sp, get_vflags(regs), cannot_handle); 531 pushw(ssp, sp, get_vflags(regs), cannot_handle);
533 pushw(ssp, sp, regs->pt.xcs, cannot_handle); 532 pushw(ssp, sp, regs->pt.cs, cannot_handle);
534 pushw(ssp, sp, IP(regs), cannot_handle); 533 pushw(ssp, sp, IP(regs), cannot_handle);
535 regs->pt.xcs = segoffs >> 16; 534 regs->pt.cs = segoffs >> 16;
536 SP(regs) -= 6; 535 SP(regs) -= 6;
537 IP(regs) = segoffs & 0xffff; 536 IP(regs) = segoffs & 0xffff;
538 clear_TF(regs); 537 clear_TF(regs);
@@ -549,7 +548,7 @@ int handle_vm86_trap(struct kernel_vm86_regs * regs, long error_code, int trapno
549 if (VMPI.is_vm86pus) { 548 if (VMPI.is_vm86pus) {
550 if ( (trapno==3) || (trapno==1) ) 549 if ( (trapno==3) || (trapno==1) )
551 return_to_32bit(regs, VM86_TRAP + (trapno << 8)); 550 return_to_32bit(regs, VM86_TRAP + (trapno << 8));
552 do_int(regs, trapno, (unsigned char __user *) (regs->pt.xss << 4), SP(regs)); 551 do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs));
553 return 0; 552 return 0;
554 } 553 }
555 if (trapno !=1) 554 if (trapno !=1)
@@ -585,10 +584,10 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
585 handle_vm86_trap(regs, 0, 1); \ 584 handle_vm86_trap(regs, 0, 1); \
586 return; } while (0) 585 return; } while (0)
587 586
588 orig_flags = *(unsigned short *)&regs->pt.eflags; 587 orig_flags = *(unsigned short *)&regs->pt.flags;
589 588
590 csp = (unsigned char __user *) (regs->pt.xcs << 4); 589 csp = (unsigned char __user *) (regs->pt.cs << 4);
591 ssp = (unsigned char __user *) (regs->pt.xss << 4); 590 ssp = (unsigned char __user *) (regs->pt.ss << 4);
592 sp = SP(regs); 591 sp = SP(regs);
593 ip = IP(regs); 592 ip = IP(regs);
594 593
@@ -675,7 +674,7 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
675 SP(regs) += 6; 674 SP(regs) += 6;
676 } 675 }
677 IP(regs) = newip; 676 IP(regs) = newip;
678 regs->pt.xcs = newcs; 677 regs->pt.cs = newcs;
679 CHECK_IF_IN_TRAP; 678 CHECK_IF_IN_TRAP;
680 if (data32) { 679 if (data32) {
681 set_vflags_long(newflags, regs); 680 set_vflags_long(newflags, regs);
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index f02bad68abaa..4525bc2c2e19 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -62,7 +62,10 @@ static struct {
62 void (*cpuid)(void /* non-c */); 62 void (*cpuid)(void /* non-c */);
63 void (*_set_ldt)(u32 selector); 63 void (*_set_ldt)(u32 selector);
64 void (*set_tr)(u32 selector); 64 void (*set_tr)(u32 selector);
65 void (*set_kernel_stack)(u32 selector, u32 esp0); 65 void (*write_idt_entry)(struct desc_struct *, int, u32, u32);
66 void (*write_gdt_entry)(struct desc_struct *, int, u32, u32);
67 void (*write_ldt_entry)(struct desc_struct *, int, u32, u32);
68 void (*set_kernel_stack)(u32 selector, u32 sp0);
66 void (*allocate_page)(u32, u32, u32, u32, u32); 69 void (*allocate_page)(u32, u32, u32, u32, u32);
67 void (*release_page)(u32, u32); 70 void (*release_page)(u32, u32);
68 void (*set_pte)(pte_t, pte_t *, unsigned); 71 void (*set_pte)(pte_t, pte_t *, unsigned);
@@ -88,13 +91,13 @@ struct vmi_timer_ops vmi_timer_ops;
88#define IRQ_PATCH_DISABLE 5 91#define IRQ_PATCH_DISABLE 5
89 92
90static inline void patch_offset(void *insnbuf, 93static inline void patch_offset(void *insnbuf,
91 unsigned long eip, unsigned long dest) 94 unsigned long ip, unsigned long dest)
92{ 95{
93 *(unsigned long *)(insnbuf+1) = dest-eip-5; 96 *(unsigned long *)(insnbuf+1) = dest-ip-5;
94} 97}
95 98
96static unsigned patch_internal(int call, unsigned len, void *insnbuf, 99static unsigned patch_internal(int call, unsigned len, void *insnbuf,
97 unsigned long eip) 100 unsigned long ip)
98{ 101{
99 u64 reloc; 102 u64 reloc;
100 struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc; 103 struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc;
@@ -103,13 +106,13 @@ static unsigned patch_internal(int call, unsigned len, void *insnbuf,
103 case VMI_RELOCATION_CALL_REL: 106 case VMI_RELOCATION_CALL_REL:
104 BUG_ON(len < 5); 107 BUG_ON(len < 5);
105 *(char *)insnbuf = MNEM_CALL; 108 *(char *)insnbuf = MNEM_CALL;
106 patch_offset(insnbuf, eip, (unsigned long)rel->eip); 109 patch_offset(insnbuf, ip, (unsigned long)rel->eip);
107 return 5; 110 return 5;
108 111
109 case VMI_RELOCATION_JUMP_REL: 112 case VMI_RELOCATION_JUMP_REL:
110 BUG_ON(len < 5); 113 BUG_ON(len < 5);
111 *(char *)insnbuf = MNEM_JMP; 114 *(char *)insnbuf = MNEM_JMP;
112 patch_offset(insnbuf, eip, (unsigned long)rel->eip); 115 patch_offset(insnbuf, ip, (unsigned long)rel->eip);
113 return 5; 116 return 5;
114 117
115 case VMI_RELOCATION_NOP: 118 case VMI_RELOCATION_NOP:
@@ -131,25 +134,25 @@ static unsigned patch_internal(int call, unsigned len, void *insnbuf,
131 * sequence. The callee does nop padding for us. 134 * sequence. The callee does nop padding for us.
132 */ 135 */
133static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, 136static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
134 unsigned long eip, unsigned len) 137 unsigned long ip, unsigned len)
135{ 138{
136 switch (type) { 139 switch (type) {
137 case PARAVIRT_PATCH(pv_irq_ops.irq_disable): 140 case PARAVIRT_PATCH(pv_irq_ops.irq_disable):
138 return patch_internal(VMI_CALL_DisableInterrupts, len, 141 return patch_internal(VMI_CALL_DisableInterrupts, len,
139 insns, eip); 142 insns, ip);
140 case PARAVIRT_PATCH(pv_irq_ops.irq_enable): 143 case PARAVIRT_PATCH(pv_irq_ops.irq_enable):
141 return patch_internal(VMI_CALL_EnableInterrupts, len, 144 return patch_internal(VMI_CALL_EnableInterrupts, len,
142 insns, eip); 145 insns, ip);
143 case PARAVIRT_PATCH(pv_irq_ops.restore_fl): 146 case PARAVIRT_PATCH(pv_irq_ops.restore_fl):
144 return patch_internal(VMI_CALL_SetInterruptMask, len, 147 return patch_internal(VMI_CALL_SetInterruptMask, len,
145 insns, eip); 148 insns, ip);
146 case PARAVIRT_PATCH(pv_irq_ops.save_fl): 149 case PARAVIRT_PATCH(pv_irq_ops.save_fl):
147 return patch_internal(VMI_CALL_GetInterruptMask, len, 150 return patch_internal(VMI_CALL_GetInterruptMask, len,
148 insns, eip); 151 insns, ip);
149 case PARAVIRT_PATCH(pv_cpu_ops.iret): 152 case PARAVIRT_PATCH(pv_cpu_ops.iret):
150 return patch_internal(VMI_CALL_IRET, len, insns, eip); 153 return patch_internal(VMI_CALL_IRET, len, insns, ip);
151 case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit): 154 case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret):
152 return patch_internal(VMI_CALL_SYSEXIT, len, insns, eip); 155 return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip);
153 default: 156 default:
154 break; 157 break;
155 } 158 }
@@ -157,36 +160,36 @@ static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
157} 160}
158 161
159/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */ 162/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */
160static void vmi_cpuid(unsigned int *eax, unsigned int *ebx, 163static void vmi_cpuid(unsigned int *ax, unsigned int *bx,
161 unsigned int *ecx, unsigned int *edx) 164 unsigned int *cx, unsigned int *dx)
162{ 165{
163 int override = 0; 166 int override = 0;
164 if (*eax == 1) 167 if (*ax == 1)
165 override = 1; 168 override = 1;
166 asm volatile ("call *%6" 169 asm volatile ("call *%6"
167 : "=a" (*eax), 170 : "=a" (*ax),
168 "=b" (*ebx), 171 "=b" (*bx),
169 "=c" (*ecx), 172 "=c" (*cx),
170 "=d" (*edx) 173 "=d" (*dx)
171 : "0" (*eax), "2" (*ecx), "r" (vmi_ops.cpuid)); 174 : "0" (*ax), "2" (*cx), "r" (vmi_ops.cpuid));
172 if (override) { 175 if (override) {
173 if (disable_pse) 176 if (disable_pse)
174 *edx &= ~X86_FEATURE_PSE; 177 *dx &= ~X86_FEATURE_PSE;
175 if (disable_pge) 178 if (disable_pge)
176 *edx &= ~X86_FEATURE_PGE; 179 *dx &= ~X86_FEATURE_PGE;
177 if (disable_sep) 180 if (disable_sep)
178 *edx &= ~X86_FEATURE_SEP; 181 *dx &= ~X86_FEATURE_SEP;
179 if (disable_tsc) 182 if (disable_tsc)
180 *edx &= ~X86_FEATURE_TSC; 183 *dx &= ~X86_FEATURE_TSC;
181 if (disable_mtrr) 184 if (disable_mtrr)
182 *edx &= ~X86_FEATURE_MTRR; 185 *dx &= ~X86_FEATURE_MTRR;
183 } 186 }
184} 187}
185 188
186static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new) 189static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new)
187{ 190{
188 if (gdt[nr].a != new->a || gdt[nr].b != new->b) 191 if (gdt[nr].a != new->a || gdt[nr].b != new->b)
189 write_gdt_entry(gdt, nr, new->a, new->b); 192 write_gdt_entry(gdt, nr, new, 0);
190} 193}
191 194
192static void vmi_load_tls(struct thread_struct *t, unsigned int cpu) 195static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
@@ -200,12 +203,12 @@ static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
200static void vmi_set_ldt(const void *addr, unsigned entries) 203static void vmi_set_ldt(const void *addr, unsigned entries)
201{ 204{
202 unsigned cpu = smp_processor_id(); 205 unsigned cpu = smp_processor_id();
203 u32 low, high; 206 struct desc_struct desc;
204 207
205 pack_descriptor(&low, &high, (unsigned long)addr, 208 pack_descriptor(&desc, (unsigned long)addr,
206 entries * sizeof(struct desc_struct) - 1, 209 entries * sizeof(struct desc_struct) - 1,
207 DESCTYPE_LDT, 0); 210 DESC_LDT, 0);
208 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, low, high); 211 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, &desc, DESC_LDT);
209 vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0); 212 vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0);
210} 213}
211 214
@@ -214,17 +217,37 @@ static void vmi_set_tr(void)
214 vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct)); 217 vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct));
215} 218}
216 219
217static void vmi_load_esp0(struct tss_struct *tss, 220static void vmi_write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
221{
222 u32 *idt_entry = (u32 *)g;
223 vmi_ops.write_idt_entry(dt, entry, idt_entry[0], idt_entry[2]);
224}
225
226static void vmi_write_gdt_entry(struct desc_struct *dt, int entry,
227 const void *desc, int type)
228{
229 u32 *gdt_entry = (u32 *)desc;
230 vmi_ops.write_gdt_entry(dt, entry, gdt_entry[0], gdt_entry[2]);
231}
232
233static void vmi_write_ldt_entry(struct desc_struct *dt, int entry,
234 const void *desc)
235{
236 u32 *ldt_entry = (u32 *)desc;
 237 vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[2]);
238}
239
240static void vmi_load_sp0(struct tss_struct *tss,
218 struct thread_struct *thread) 241 struct thread_struct *thread)
219{ 242{
220 tss->x86_tss.esp0 = thread->esp0; 243 tss->x86_tss.sp0 = thread->sp0;
221 244
222 /* This can only happen when SEP is enabled, no need to test "SEP"arately */ 245 /* This can only happen when SEP is enabled, no need to test "SEP"arately */
223 if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { 246 if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
224 tss->x86_tss.ss1 = thread->sysenter_cs; 247 tss->x86_tss.ss1 = thread->sysenter_cs;
225 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); 248 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
226 } 249 }
227 vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.esp0); 250 vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.sp0);
228} 251}
229 252
230static void vmi_flush_tlb_user(void) 253static void vmi_flush_tlb_user(void)
@@ -375,7 +398,7 @@ static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn)
375 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); 398 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
376} 399}
377 400
378static void vmi_allocate_pd(u32 pfn) 401static void vmi_allocate_pd(struct mm_struct *mm, u32 pfn)
379{ 402{
380 /* 403 /*
381 * This call comes in very early, before mem_map is setup. 404 * This call comes in very early, before mem_map is setup.
@@ -452,7 +475,7 @@ static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep
452static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval) 475static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
453{ 476{
454#ifdef CONFIG_X86_PAE 477#ifdef CONFIG_X86_PAE
455 const pte_t pte = { pmdval.pmd, pmdval.pmd >> 32 }; 478 const pte_t pte = { .pte = pmdval.pmd };
456 vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD); 479 vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD);
457#else 480#else
458 const pte_t pte = { pmdval.pud.pgd.pgd }; 481 const pte_t pte = { pmdval.pud.pgd.pgd };
@@ -485,21 +508,21 @@ static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t
485static void vmi_set_pud(pud_t *pudp, pud_t pudval) 508static void vmi_set_pud(pud_t *pudp, pud_t pudval)
486{ 509{
487 /* Um, eww */ 510 /* Um, eww */
488 const pte_t pte = { pudval.pgd.pgd, pudval.pgd.pgd >> 32 }; 511 const pte_t pte = { .pte = pudval.pgd.pgd };
489 vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD); 512 vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD);
490 vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP); 513 vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
491} 514}
492 515
493static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 516static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
494{ 517{
495 const pte_t pte = { 0 }; 518 const pte_t pte = { .pte = 0 };
496 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); 519 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
497 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); 520 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
498} 521}
499 522
500static void vmi_pmd_clear(pmd_t *pmd) 523static void vmi_pmd_clear(pmd_t *pmd)
501{ 524{
502 const pte_t pte = { 0 }; 525 const pte_t pte = { .pte = 0 };
503 vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD); 526 vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD);
504 vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD); 527 vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
505} 528}
@@ -790,10 +813,13 @@ static inline int __init activate_vmi(void)
790 para_fill(pv_cpu_ops.store_idt, GetIDT); 813 para_fill(pv_cpu_ops.store_idt, GetIDT);
791 para_fill(pv_cpu_ops.store_tr, GetTR); 814 para_fill(pv_cpu_ops.store_tr, GetTR);
792 pv_cpu_ops.load_tls = vmi_load_tls; 815 pv_cpu_ops.load_tls = vmi_load_tls;
793 para_fill(pv_cpu_ops.write_ldt_entry, WriteLDTEntry); 816 para_wrap(pv_cpu_ops.write_ldt_entry, vmi_write_ldt_entry,
794 para_fill(pv_cpu_ops.write_gdt_entry, WriteGDTEntry); 817 write_ldt_entry, WriteLDTEntry);
795 para_fill(pv_cpu_ops.write_idt_entry, WriteIDTEntry); 818 para_wrap(pv_cpu_ops.write_gdt_entry, vmi_write_gdt_entry,
796 para_wrap(pv_cpu_ops.load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack); 819 write_gdt_entry, WriteGDTEntry);
820 para_wrap(pv_cpu_ops.write_idt_entry, vmi_write_idt_entry,
821 write_idt_entry, WriteIDTEntry);
822 para_wrap(pv_cpu_ops.load_sp0, vmi_load_sp0, set_kernel_stack, UpdateKernelStack);
797 para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask); 823 para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
798 para_fill(pv_cpu_ops.io_delay, IODelay); 824 para_fill(pv_cpu_ops.io_delay, IODelay);
799 825
@@ -870,7 +896,7 @@ static inline int __init activate_vmi(void)
870 * the backend. They are performance critical anyway, so requiring 896 * the backend. They are performance critical anyway, so requiring
871 * a patch is not a big problem. 897 * a patch is not a big problem.
872 */ 898 */
873 pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0; 899 pv_cpu_ops.irq_enable_syscall_ret = (void *)0xfeedbab0;
874 pv_cpu_ops.iret = (void *)0xbadbab0; 900 pv_cpu_ops.iret = (void *)0xbadbab0;
875 901
876#ifdef CONFIG_SMP 902#ifdef CONFIG_SMP
@@ -963,19 +989,19 @@ static int __init parse_vmi(char *arg)
963 return -EINVAL; 989 return -EINVAL;
964 990
965 if (!strcmp(arg, "disable_pge")) { 991 if (!strcmp(arg, "disable_pge")) {
966 clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); 992 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
967 disable_pge = 1; 993 disable_pge = 1;
968 } else if (!strcmp(arg, "disable_pse")) { 994 } else if (!strcmp(arg, "disable_pse")) {
969 clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); 995 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PSE);
970 disable_pse = 1; 996 disable_pse = 1;
971 } else if (!strcmp(arg, "disable_sep")) { 997 } else if (!strcmp(arg, "disable_sep")) {
972 clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability); 998 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP);
973 disable_sep = 1; 999 disable_sep = 1;
974 } else if (!strcmp(arg, "disable_tsc")) { 1000 } else if (!strcmp(arg, "disable_tsc")) {
975 clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability); 1001 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC);
976 disable_tsc = 1; 1002 disable_tsc = 1;
977 } else if (!strcmp(arg, "disable_mtrr")) { 1003 } else if (!strcmp(arg, "disable_mtrr")) {
978 clear_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability); 1004 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_MTRR);
979 disable_mtrr = 1; 1005 disable_mtrr = 1;
980 } else if (!strcmp(arg, "disable_timer")) { 1006 } else if (!strcmp(arg, "disable_timer")) {
981 disable_vmi_timer = 1; 1007 disable_vmi_timer = 1;
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index b1b5ab08b26e..a2b030780aa9 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -35,7 +35,6 @@
35#include <asm/i8253.h> 35#include <asm/i8253.h>
36 36
37#include <irq_vectors.h> 37#include <irq_vectors.h>
38#include "io_ports.h"
39 38
40#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) 39#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
41#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) 40#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
@@ -238,7 +237,7 @@ static void __devinit vmi_time_init_clockevent(void)
238void __init vmi_time_init(void) 237void __init vmi_time_init(void)
239{ 238{
 240 /* Disable PIT: BIOSes start PIT CH0 with 18.2Hz periodic. */ 239 /* Disable PIT: BIOSes start PIT CH0 with 18.2Hz periodic. */
241 outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ 240 outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
242 241
243 vmi_time_init_clockevent(); 242 vmi_time_init_clockevent();
244 setup_irq(0, &vmi_clock_action); 243 setup_irq(0, &vmi_clock_action);
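The vmi_time_init() hunk above routes the PIT channel-0 programming through outb_pit() instead of a bare outb_p(). A hedged sketch of what such a wrapper could look like (assumed to be a thin alias in <asm/i8253.h>; the point is only that PIT port I/O now goes through one named helper):

	/* assumed minimal form of the PIT accessors */
	#define inb_pit		inb_p
	#define outb_pit	outb_p

	/* program PIT channel 0: binary, mode 5, LSB/MSB access */
	outb_pit(0x3a, PIT_MODE);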
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 84c913f38f98..f1148ac8abe3 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -8,12 +8,6 @@
8 * put it inside the section definition. 8 * put it inside the section definition.
9 */ 9 */
10 10
11/* Don't define absolute symbols until and unless you know that symbol
12 * value is should remain constant even if kernel image is relocated
13 * at run time. Absolute symbols are not relocated. If symbol value should
14 * change if kernel is relocated, make the symbol section relative and
15 * put it inside the section definition.
16 */
17#define LOAD_OFFSET __PAGE_OFFSET 11#define LOAD_OFFSET __PAGE_OFFSET
18 12
19#include <asm-generic/vmlinux.lds.h> 13#include <asm-generic/vmlinux.lds.h>
@@ -44,6 +38,8 @@ SECTIONS
44 38
45 /* read-only */ 39 /* read-only */
46 .text : AT(ADDR(.text) - LOAD_OFFSET) { 40 .text : AT(ADDR(.text) - LOAD_OFFSET) {
41 . = ALIGN(4096); /* not really needed, already page aligned */
42 *(.text.page_aligned)
47 TEXT_TEXT 43 TEXT_TEXT
48 SCHED_TEXT 44 SCHED_TEXT
49 LOCK_TEXT 45 LOCK_TEXT
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index ea5386944e67..0992b9946c6f 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -37,16 +37,15 @@ SECTIONS
37 KPROBES_TEXT 37 KPROBES_TEXT
38 *(.fixup) 38 *(.fixup)
39 *(.gnu.warning) 39 *(.gnu.warning)
40 } :text = 0x9090 40 _etext = .; /* End of text section */
41 /* out-of-line lock text */ 41 } :text = 0x9090
42 .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) }
43
44 _etext = .; /* End of text section */
45 42
46 . = ALIGN(16); /* Exception table */ 43 . = ALIGN(16); /* Exception table */
47 __start___ex_table = .; 44 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
48 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) } 45 __start___ex_table = .;
49 __stop___ex_table = .; 46 *(__ex_table)
47 __stop___ex_table = .;
48 }
50 49
51 NOTES :text :note 50 NOTES :text :note
52 51
@@ -179,6 +178,14 @@ SECTIONS
179 } 178 }
180 __con_initcall_end = .; 179 __con_initcall_end = .;
181 SECURITY_INIT 180 SECURITY_INIT
181
182 . = ALIGN(8);
183 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
184 __parainstructions = .;
185 *(.parainstructions)
186 __parainstructions_end = .;
187 }
188
182 . = ALIGN(8); 189 . = ALIGN(8);
183 __alt_instructions = .; 190 __alt_instructions = .;
184 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { 191 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 414caf0c5f9a..d971210a6d36 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -25,21 +25,24 @@ static int __init vsmp_init(void)
25 return 0; 25 return 0;
26 26
27 /* Check if we are running on a ScaleMP vSMP box */ 27 /* Check if we are running on a ScaleMP vSMP box */
28 if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) != PCI_VENDOR_ID_SCALEMP) || 28 if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) !=
29 (read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) != PCI_DEVICE_ID_SCALEMP_VSMP_CTL)) 29 PCI_VENDOR_ID_SCALEMP) ||
30 (read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) !=
31 PCI_DEVICE_ID_SCALEMP_VSMP_CTL))
30 return 0; 32 return 0;
31 33
32 /* set vSMP magic bits to indicate vSMP capable kernel */ 34 /* set vSMP magic bits to indicate vSMP capable kernel */
33 address = ioremap(read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0), 8); 35 address = ioremap(read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0), 8);
34 cap = readl(address); 36 cap = readl(address);
35 ctl = readl(address + 4); 37 ctl = readl(address + 4);
36 printk("vSMP CTL: capabilities:0x%08x control:0x%08x\n", cap, ctl); 38 printk(KERN_INFO "vSMP CTL: capabilities:0x%08x control:0x%08x\n",
39 cap, ctl);
37 if (cap & ctl & (1 << 4)) { 40 if (cap & ctl & (1 << 4)) {
38 /* Turn on vSMP IRQ fastpath handling (see system.h) */ 41 /* Turn on vSMP IRQ fastpath handling (see system.h) */
39 ctl &= ~(1 << 4); 42 ctl &= ~(1 << 4);
40 writel(ctl, address + 4); 43 writel(ctl, address + 4);
41 ctl = readl(address + 4); 44 ctl = readl(address + 4);
42 printk("vSMP CTL: control set to:0x%08x\n", ctl); 45 printk(KERN_INFO "vSMP CTL: control set to:0x%08x\n", ctl);
43 } 46 }
44 47
45 iounmap(address); 48 iounmap(address);
diff --git a/arch/x86/kernel/vsyscall_32.S b/arch/x86/kernel/vsyscall_32.S
deleted file mode 100644
index a5ab3dc4fd25..000000000000
--- a/arch/x86/kernel/vsyscall_32.S
+++ /dev/null
@@ -1,15 +0,0 @@
1#include <linux/init.h>
2
3__INITDATA
4
5 .globl vsyscall_int80_start, vsyscall_int80_end
6vsyscall_int80_start:
7 .incbin "arch/x86/kernel/vsyscall-int80_32.so"
8vsyscall_int80_end:
9
10 .globl vsyscall_sysenter_start, vsyscall_sysenter_end
11vsyscall_sysenter_start:
12 .incbin "arch/x86/kernel/vsyscall-sysenter_32.so"
13vsyscall_sysenter_end:
14
15__FINIT
diff --git a/arch/x86/kernel/vsyscall_32.lds.S b/arch/x86/kernel/vsyscall_32.lds.S
deleted file mode 100644
index 4a8b0ed9b8fb..000000000000
--- a/arch/x86/kernel/vsyscall_32.lds.S
+++ /dev/null
@@ -1,67 +0,0 @@
1/*
2 * Linker script for vsyscall DSO. The vsyscall page is an ELF shared
3 * object prelinked to its virtual address, and with only one read-only
4 * segment (that fits in one page). This script controls its layout.
5 */
6#include <asm/asm-offsets.h>
7
8SECTIONS
9{
10 . = VDSO_PRELINK_asm + SIZEOF_HEADERS;
11
12 .hash : { *(.hash) } :text
13 .gnu.hash : { *(.gnu.hash) }
14 .dynsym : { *(.dynsym) }
15 .dynstr : { *(.dynstr) }
16 .gnu.version : { *(.gnu.version) }
17 .gnu.version_d : { *(.gnu.version_d) }
18 .gnu.version_r : { *(.gnu.version_r) }
19
20 /* This linker script is used both with -r and with -shared.
21 For the layouts to match, we need to skip more than enough
22 space for the dynamic symbol table et al. If this amount
23 is insufficient, ld -shared will barf. Just increase it here. */
24 . = VDSO_PRELINK_asm + 0x400;
25
26 .text : { *(.text) } :text =0x90909090
27 .note : { *(.note.*) } :text :note
28 .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
29 .eh_frame : { KEEP (*(.eh_frame)) } :text
30 .dynamic : { *(.dynamic) } :text :dynamic
31 .useless : {
32 *(.got.plt) *(.got)
33 *(.data .data.* .gnu.linkonce.d.*)
34 *(.dynbss)
35 *(.bss .bss.* .gnu.linkonce.b.*)
36 } :text
37}
38
39/*
40 * We must supply the ELF program headers explicitly to get just one
41 * PT_LOAD segment, and set the flags explicitly to make segments read-only.
42 */
43PHDRS
44{
45 text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
46 dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
47 note PT_NOTE FLAGS(4); /* PF_R */
48 eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */
49}
50
51/*
52 * This controls what symbols we export from the DSO.
53 */
54VERSION
55{
56 LINUX_2.5 {
57 global:
58 __kernel_vsyscall;
59 __kernel_sigreturn;
60 __kernel_rt_sigreturn;
61
62 local: *;
63 };
64}
65
66/* The ELF entry point can be used to set the AT_SYSINFO value. */
67ENTRY(__kernel_vsyscall);
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index ad4005c6d4a1..3f8242774580 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -43,7 +43,7 @@
43#include <asm/vgtod.h> 43#include <asm/vgtod.h>
44 44
45#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) 45#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
46#define __syscall_clobber "r11","rcx","memory" 46#define __syscall_clobber "r11","cx","memory"
47#define __pa_vsymbol(x) \ 47#define __pa_vsymbol(x) \
48 ({unsigned long v; \ 48 ({unsigned long v; \
49 extern char __vsyscall_0; \ 49 extern char __vsyscall_0; \
@@ -190,7 +190,7 @@ time_t __vsyscall(1) vtime(time_t *t)
190long __vsyscall(2) 190long __vsyscall(2)
191vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) 191vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
192{ 192{
193 unsigned int dummy, p; 193 unsigned int p;
194 unsigned long j = 0; 194 unsigned long j = 0;
195 195
196 /* Fast cache - only recompute value once per jiffies and avoid 196 /* Fast cache - only recompute value once per jiffies and avoid
@@ -205,7 +205,7 @@ vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
205 p = tcache->blob[1]; 205 p = tcache->blob[1];
206 } else if (__vgetcpu_mode == VGETCPU_RDTSCP) { 206 } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
207 /* Load per CPU data from RDTSCP */ 207 /* Load per CPU data from RDTSCP */
208 rdtscp(dummy, dummy, p); 208 native_read_tscp(&p);
209 } else { 209 } else {
210 /* Load per CPU data from GDT */ 210 /* Load per CPU data from GDT */
211 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); 211 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
@@ -297,7 +297,7 @@ static void __cpuinit vsyscall_set_cpu(int cpu)
297 /* Store cpu number in limit so that it can be loaded quickly 297 /* Store cpu number in limit so that it can be loaded quickly
298 in user space in vgetcpu. 298 in user space in vgetcpu.
299 12 bits for the CPU and 8 bits for the node. */ 299 12 bits for the CPU and 8 bits for the node. */
300 d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU); 300 d = (unsigned long *)(get_cpu_gdt_table(cpu) + GDT_ENTRY_PER_CPU);
301 *d = 0x0f40000000000ULL; 301 *d = 0x0f40000000000ULL;
302 *d |= cpu; 302 *d |= cpu;
303 *d |= (node & 0xf) << 12; 303 *d |= (node & 0xf) << 12;
@@ -319,7 +319,7 @@ cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
319 return NOTIFY_DONE; 319 return NOTIFY_DONE;
320} 320}
321 321
322static void __init map_vsyscall(void) 322void __init map_vsyscall(void)
323{ 323{
324 extern char __vsyscall_0; 324 extern char __vsyscall_0;
325 unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); 325 unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
@@ -335,7 +335,6 @@ static int __init vsyscall_init(void)
335 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); 335 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
336 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); 336 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
337 BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu)); 337 BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
338 map_vsyscall();
339#ifdef CONFIG_SYSCTL 338#ifdef CONFIG_SYSCTL
340 register_sysctl_table(kernel_root_table2); 339 register_sysctl_table(kernel_root_table2);
341#endif 340#endif
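In the vgetcpu() hunk above, the old rdtscp() macro with throwaway outputs is replaced by native_read_tscp(&p), which returns the TSC and stores the IA32_TSC_AUX value (pre-loaded by the kernel with the cpu/node encoding) into its argument. A hedged sketch of such an accessor (assumed form; RDTSCP is emitted as raw bytes for older assemblers):

	static inline unsigned long long native_read_tscp(unsigned int *aux)
	{
		unsigned long low, high;

		/* RDTSCP: TSC in EDX:EAX, IA32_TSC_AUX in ECX */
		asm volatile(".byte 0x0f,0x01,0xf9"
			     : "=a" (low), "=d" (high), "=c" (*aux));
		return low | ((u64)high << 32);
	}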
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 77c25b307635..a66e9c1a0537 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -8,6 +8,7 @@
8#include <asm/processor.h> 8#include <asm/processor.h>
9#include <asm/uaccess.h> 9#include <asm/uaccess.h>
10#include <asm/pgtable.h> 10#include <asm/pgtable.h>
11#include <asm/desc.h>
11 12
12EXPORT_SYMBOL(kernel_thread); 13EXPORT_SYMBOL(kernel_thread);
13 14
@@ -34,13 +35,6 @@ EXPORT_SYMBOL(__copy_from_user_inatomic);
34EXPORT_SYMBOL(copy_page); 35EXPORT_SYMBOL(copy_page);
35EXPORT_SYMBOL(clear_page); 36EXPORT_SYMBOL(clear_page);
36 37
37#ifdef CONFIG_SMP
38extern void __write_lock_failed(rwlock_t *rw);
39extern void __read_lock_failed(rwlock_t *rw);
40EXPORT_SYMBOL(__write_lock_failed);
41EXPORT_SYMBOL(__read_lock_failed);
42#endif
43
44/* Export string functions. We normally rely on gcc builtin for most of these, 38/* Export string functions. We normally rely on gcc builtin for most of these,
45 but gcc sometimes decides not to inline them. */ 39 but gcc sometimes decides not to inline them. */
46#undef memcpy 40#undef memcpy
@@ -60,3 +54,8 @@ EXPORT_SYMBOL(init_level4_pgt);
60EXPORT_SYMBOL(load_gs_index); 54EXPORT_SYMBOL(load_gs_index);
61 55
62EXPORT_SYMBOL(_proxy_pda); 56EXPORT_SYMBOL(_proxy_pda);
57
58#ifdef CONFIG_PARAVIRT
59/* Virtualized guests may want to use it */
60EXPORT_SYMBOL_GPL(cpu_gdt_descr);
61#endif
diff --git a/drivers/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 656920636cb2..c83e1c9b5129 100644
--- a/drivers/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -1,9 +1,12 @@
1# 1#
2# KVM configuration 2# KVM configuration
3# 3#
4config HAVE_KVM
5 bool
6
4menuconfig VIRTUALIZATION 7menuconfig VIRTUALIZATION
5 bool "Virtualization" 8 bool "Virtualization"
6 depends on X86 9 depends on HAVE_KVM || X86
7 default y 10 default y
8 ---help--- 11 ---help---
9 Say Y here to get to see options for using your Linux host to run other 12 Say Y here to get to see options for using your Linux host to run other
@@ -16,7 +19,7 @@ if VIRTUALIZATION
16 19
17config KVM 20config KVM
18 tristate "Kernel-based Virtual Machine (KVM) support" 21 tristate "Kernel-based Virtual Machine (KVM) support"
19 depends on X86 && EXPERIMENTAL 22 depends on HAVE_KVM && EXPERIMENTAL
20 select PREEMPT_NOTIFIERS 23 select PREEMPT_NOTIFIERS
21 select ANON_INODES 24 select ANON_INODES
22 ---help--- 25 ---help---
diff --git a/drivers/kvm/Makefile b/arch/x86/kvm/Makefile
index e5a8f4d3e973..ffdd0b310784 100644
--- a/drivers/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -2,7 +2,11 @@
2# Makefile for Kernel-based Virtual Machine module 2# Makefile for Kernel-based Virtual Machine module
3# 3#
4 4
5kvm-objs := kvm_main.o mmu.o x86_emulate.o i8259.o irq.o lapic.o ioapic.o 5common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o)
6
7EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
8
9kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o
6obj-$(CONFIG_KVM) += kvm.o 10obj-$(CONFIG_KVM) += kvm.o
7kvm-intel-objs = vmx.o 11kvm-intel-objs = vmx.o
8obj-$(CONFIG_KVM_INTEL) += kvm-intel.o 12obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/drivers/kvm/i8259.c b/arch/x86/kvm/i8259.c
index a679157bc599..ab29cf2def47 100644
--- a/drivers/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -28,6 +28,8 @@
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include "irq.h" 29#include "irq.h"
30 30
31#include <linux/kvm_host.h>
32
31/* 33/*
32 * set irq level. If an edge is detected, then the IRR is set to 1 34 * set irq level. If an edge is detected, then the IRR is set to 1
33 */ 35 */
@@ -181,10 +183,8 @@ int kvm_pic_read_irq(struct kvm_pic *s)
181 return intno; 183 return intno;
182} 184}
183 185
184static void pic_reset(void *opaque) 186void kvm_pic_reset(struct kvm_kpic_state *s)
185{ 187{
186 struct kvm_kpic_state *s = opaque;
187
188 s->last_irr = 0; 188 s->last_irr = 0;
189 s->irr = 0; 189 s->irr = 0;
190 s->imr = 0; 190 s->imr = 0;
@@ -209,7 +209,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
209 addr &= 1; 209 addr &= 1;
210 if (addr == 0) { 210 if (addr == 0) {
211 if (val & 0x10) { 211 if (val & 0x10) {
212 pic_reset(s); /* init */ 212 kvm_pic_reset(s); /* init */
213 /* 213 /*
214 * deassert a pending interrupt 214 * deassert a pending interrupt
215 */ 215 */
diff --git a/drivers/kvm/irq.c b/arch/x86/kvm/irq.c
index 7628c7ff628f..e5714759e97f 100644
--- a/drivers/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -20,8 +20,8 @@
20 */ 20 */
21 21
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/kvm_host.h>
23 24
24#include "kvm.h"
25#include "irq.h" 25#include "irq.h"
26 26
27/* 27/*
@@ -63,26 +63,6 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
63} 63}
64EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); 64EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
65 65
66static void vcpu_kick_intr(void *info)
67{
68#ifdef DEBUG
69 struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
70 printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
71#endif
72}
73
74void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
75{
76 int ipi_pcpu = vcpu->cpu;
77
78 if (waitqueue_active(&vcpu->wq)) {
79 wake_up_interruptible(&vcpu->wq);
80 ++vcpu->stat.halt_wakeup;
81 }
82 if (vcpu->guest_mode)
83 smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
84}
85
86void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) 66void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
87{ 67{
88 kvm_inject_apic_timer_irqs(vcpu); 68 kvm_inject_apic_timer_irqs(vcpu);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
new file mode 100644
index 000000000000..fa5ed5d59b5d
--- /dev/null
+++ b/arch/x86/kvm/irq.h
@@ -0,0 +1,88 @@
1/*
2 * irq.h: in kernel interrupt controller related definitions
3 * Copyright (c) 2007, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Authors:
18 * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
19 *
20 */
21
22#ifndef __IRQ_H
23#define __IRQ_H
24
25#include <linux/mm_types.h>
26#include <linux/hrtimer.h>
27#include <linux/kvm_host.h>
28
29#include "iodev.h"
30#include "ioapic.h"
31#include "lapic.h"
32
33struct kvm;
34struct kvm_vcpu;
35
36typedef void irq_request_func(void *opaque, int level);
37
38struct kvm_kpic_state {
39 u8 last_irr; /* edge detection */
40 u8 irr; /* interrupt request register */
41 u8 imr; /* interrupt mask register */
42 u8 isr; /* interrupt service register */
43 u8 priority_add; /* highest irq priority */
44 u8 irq_base;
45 u8 read_reg_select;
46 u8 poll;
47 u8 special_mask;
48 u8 init_state;
49 u8 auto_eoi;
50 u8 rotate_on_auto_eoi;
51 u8 special_fully_nested_mode;
52 u8 init4; /* true if 4 byte init */
53 u8 elcr; /* PIIX edge/trigger selection */
54 u8 elcr_mask;
55 struct kvm_pic *pics_state;
56};
57
58struct kvm_pic {
59 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
60 irq_request_func *irq_request;
61 void *irq_request_opaque;
62 int output; /* intr from master PIC */
63 struct kvm_io_device dev;
64};
65
66struct kvm_pic *kvm_create_pic(struct kvm *kvm);
67void kvm_pic_set_irq(void *opaque, int irq, int level);
68int kvm_pic_read_irq(struct kvm_pic *s);
69void kvm_pic_update_irq(struct kvm_pic *s);
70
71static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
72{
73 return kvm->arch.vpic;
74}
75
76static inline int irqchip_in_kernel(struct kvm *kvm)
77{
78 return pic_irqchip(kvm) != NULL;
79}
80
81void kvm_pic_reset(struct kvm_kpic_state *s);
82
83void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
84void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
85void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
86void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
87
88#endif
diff --git a/drivers/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
index a0e415daef5b..ecdfe97e4635 100644
--- a/drivers/kvm/kvm_svm.h
+++ b/arch/x86/kvm/kvm_svm.h
@@ -4,10 +4,10 @@
4#include <linux/kernel.h> 4#include <linux/kernel.h>
5#include <linux/types.h> 5#include <linux/types.h>
6#include <linux/list.h> 6#include <linux/list.h>
7#include <linux/kvm_host.h>
7#include <asm/msr.h> 8#include <asm/msr.h>
8 9
9#include "svm.h" 10#include "svm.h"
10#include "kvm.h"
11 11
12static const u32 host_save_user_msrs[] = { 12static const u32 host_save_user_msrs[] = {
13#ifdef CONFIG_X86_64 13#ifdef CONFIG_X86_64
diff --git a/drivers/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 238fcad3cece..2cbee9479ce4 100644
--- a/drivers/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -17,7 +17,7 @@
17 * the COPYING file in the top-level directory. 17 * the COPYING file in the top-level directory.
18 */ 18 */
19 19
20#include "kvm.h" 20#include <linux/kvm_host.h>
21#include <linux/kvm.h> 21#include <linux/kvm.h>
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/highmem.h> 23#include <linux/highmem.h>
@@ -56,6 +56,7 @@
56 56
57#define VEC_POS(v) ((v) & (32 - 1)) 57#define VEC_POS(v) ((v) & (32 - 1))
58#define REG_POS(v) (((v) >> 5) << 4) 58#define REG_POS(v) (((v) >> 5) << 4)
59
59static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off) 60static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
60{ 61{
61 return *((u32 *) (apic->regs + reg_off)); 62 return *((u32 *) (apic->regs + reg_off));
@@ -88,7 +89,7 @@ static inline void apic_clear_vector(int vec, void *bitmap)
88 89
89static inline int apic_hw_enabled(struct kvm_lapic *apic) 90static inline int apic_hw_enabled(struct kvm_lapic *apic)
90{ 91{
91 return (apic)->vcpu->apic_base & MSR_IA32_APICBASE_ENABLE; 92 return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
92} 93}
93 94
94static inline int apic_sw_enabled(struct kvm_lapic *apic) 95static inline int apic_sw_enabled(struct kvm_lapic *apic)
@@ -172,7 +173,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
172 173
173int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) 174int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
174{ 175{
175 struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; 176 struct kvm_lapic *apic = vcpu->arch.apic;
176 int highest_irr; 177 int highest_irr;
177 178
178 if (!apic) 179 if (!apic)
@@ -183,8 +184,10 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
183} 184}
184EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr); 185EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
185 186
186int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig) 187int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig)
187{ 188{
189 struct kvm_lapic *apic = vcpu->arch.apic;
190
188 if (!apic_test_and_set_irr(vec, apic)) { 191 if (!apic_test_and_set_irr(vec, apic)) {
189 /* a new pending irq is set in IRR */ 192 /* a new pending irq is set in IRR */
190 if (trig) 193 if (trig)
@@ -268,7 +271,7 @@ static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
268 int short_hand, int dest, int dest_mode) 271 int short_hand, int dest, int dest_mode)
269{ 272{
270 int result = 0; 273 int result = 0;
271 struct kvm_lapic *target = vcpu->apic; 274 struct kvm_lapic *target = vcpu->arch.apic;
272 275
273 apic_debug("target %p, source %p, dest 0x%x, " 276 apic_debug("target %p, source %p, dest 0x%x, "
274 "dest_mode 0x%x, short_hand 0x%x", 277 "dest_mode 0x%x, short_hand 0x%x",
@@ -335,10 +338,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
335 } else 338 } else
336 apic_clear_vector(vector, apic->regs + APIC_TMR); 339 apic_clear_vector(vector, apic->regs + APIC_TMR);
337 340
338 if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE) 341 if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
339 kvm_vcpu_kick(vcpu); 342 kvm_vcpu_kick(vcpu);
340 else if (vcpu->mp_state == VCPU_MP_STATE_HALTED) { 343 else if (vcpu->arch.mp_state == VCPU_MP_STATE_HALTED) {
341 vcpu->mp_state = VCPU_MP_STATE_RUNNABLE; 344 vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
342 if (waitqueue_active(&vcpu->wq)) 345 if (waitqueue_active(&vcpu->wq))
343 wake_up_interruptible(&vcpu->wq); 346 wake_up_interruptible(&vcpu->wq);
344 } 347 }
@@ -359,11 +362,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
359 362
360 case APIC_DM_INIT: 363 case APIC_DM_INIT:
361 if (level) { 364 if (level) {
362 if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE) 365 if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
363 printk(KERN_DEBUG 366 printk(KERN_DEBUG
364 "INIT on a runnable vcpu %d\n", 367 "INIT on a runnable vcpu %d\n",
365 vcpu->vcpu_id); 368 vcpu->vcpu_id);
366 vcpu->mp_state = VCPU_MP_STATE_INIT_RECEIVED; 369 vcpu->arch.mp_state = VCPU_MP_STATE_INIT_RECEIVED;
367 kvm_vcpu_kick(vcpu); 370 kvm_vcpu_kick(vcpu);
368 } else { 371 } else {
369 printk(KERN_DEBUG 372 printk(KERN_DEBUG
@@ -376,9 +379,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
376 case APIC_DM_STARTUP: 379 case APIC_DM_STARTUP:
377 printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n", 380 printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
378 vcpu->vcpu_id, vector); 381 vcpu->vcpu_id, vector);
379 if (vcpu->mp_state == VCPU_MP_STATE_INIT_RECEIVED) { 382 if (vcpu->arch.mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
380 vcpu->sipi_vector = vector; 383 vcpu->arch.sipi_vector = vector;
381 vcpu->mp_state = VCPU_MP_STATE_SIPI_RECEIVED; 384 vcpu->arch.mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
382 if (waitqueue_active(&vcpu->wq)) 385 if (waitqueue_active(&vcpu->wq))
383 wake_up_interruptible(&vcpu->wq); 386 wake_up_interruptible(&vcpu->wq);
384 } 387 }
@@ -392,15 +395,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
392 return result; 395 return result;
393} 396}
394 397
395struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, 398static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
396 unsigned long bitmap) 399 unsigned long bitmap)
397{ 400{
398 int vcpu_id;
399 int last; 401 int last;
400 int next; 402 int next;
401 struct kvm_lapic *apic; 403 struct kvm_lapic *apic = NULL;
402 404
403 last = kvm->round_robin_prev_vcpu; 405 last = kvm->arch.round_robin_prev_vcpu;
404 next = last; 406 next = last;
405 407
406 do { 408 do {
@@ -408,25 +410,30 @@ struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
408 next = 0; 410 next = 0;
409 if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap)) 411 if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap))
410 continue; 412 continue;
411 apic = kvm->vcpus[next]->apic; 413 apic = kvm->vcpus[next]->arch.apic;
412 if (apic && apic_enabled(apic)) 414 if (apic && apic_enabled(apic))
413 break; 415 break;
414 apic = NULL; 416 apic = NULL;
415 } while (next != last); 417 } while (next != last);
416 kvm->round_robin_prev_vcpu = next; 418 kvm->arch.round_robin_prev_vcpu = next;
417 419
418 if (!apic) { 420 if (!apic)
419 vcpu_id = ffs(bitmap) - 1; 421 printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
420 if (vcpu_id < 0) {
421 vcpu_id = 0;
422 printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
423 }
424 apic = kvm->vcpus[vcpu_id]->apic;
425 }
426 422
427 return apic; 423 return apic;
428} 424}
429 425
426struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
427 unsigned long bitmap)
428{
429 struct kvm_lapic *apic;
430
431 apic = kvm_apic_round_robin(kvm, vector, bitmap);
432 if (apic)
433 return apic->vcpu;
434 return NULL;
435}
436
430static void apic_set_eoi(struct kvm_lapic *apic) 437static void apic_set_eoi(struct kvm_lapic *apic)
431{ 438{
432 int vector = apic_find_highest_isr(apic); 439 int vector = apic_find_highest_isr(apic);
@@ -458,7 +465,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
458 unsigned int delivery_mode = icr_low & APIC_MODE_MASK; 465 unsigned int delivery_mode = icr_low & APIC_MODE_MASK;
459 unsigned int vector = icr_low & APIC_VECTOR_MASK; 466 unsigned int vector = icr_low & APIC_VECTOR_MASK;
460 467
461 struct kvm_lapic *target; 468 struct kvm_vcpu *target;
462 struct kvm_vcpu *vcpu; 469 struct kvm_vcpu *vcpu;
463 unsigned long lpr_map = 0; 470 unsigned long lpr_map = 0;
464 int i; 471 int i;
@@ -474,20 +481,20 @@ static void apic_send_ipi(struct kvm_lapic *apic)
474 if (!vcpu) 481 if (!vcpu)
475 continue; 482 continue;
476 483
477 if (vcpu->apic && 484 if (vcpu->arch.apic &&
478 apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) { 485 apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
479 if (delivery_mode == APIC_DM_LOWEST) 486 if (delivery_mode == APIC_DM_LOWEST)
480 set_bit(vcpu->vcpu_id, &lpr_map); 487 set_bit(vcpu->vcpu_id, &lpr_map);
481 else 488 else
482 __apic_accept_irq(vcpu->apic, delivery_mode, 489 __apic_accept_irq(vcpu->arch.apic, delivery_mode,
483 vector, level, trig_mode); 490 vector, level, trig_mode);
484 } 491 }
485 } 492 }
486 493
487 if (delivery_mode == APIC_DM_LOWEST) { 494 if (delivery_mode == APIC_DM_LOWEST) {
488 target = kvm_apic_round_robin(vcpu->kvm, vector, lpr_map); 495 target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map);
489 if (target != NULL) 496 if (target != NULL)
490 __apic_accept_irq(target, delivery_mode, 497 __apic_accept_irq(target->arch.apic, delivery_mode,
491 vector, level, trig_mode); 498 vector, level, trig_mode);
492 } 499 }
493} 500}
@@ -544,6 +551,23 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
544 return tmcct; 551 return tmcct;
545} 552}
546 553
554static void __report_tpr_access(struct kvm_lapic *apic, bool write)
555{
556 struct kvm_vcpu *vcpu = apic->vcpu;
557 struct kvm_run *run = vcpu->run;
558
559 set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
560 kvm_x86_ops->cache_regs(vcpu);
561 run->tpr_access.rip = vcpu->arch.rip;
562 run->tpr_access.is_write = write;
563}
564
565static inline void report_tpr_access(struct kvm_lapic *apic, bool write)
566{
567 if (apic->vcpu->arch.tpr_access_reporting)
568 __report_tpr_access(apic, write);
569}
570
547static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) 571static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
548{ 572{
549 u32 val = 0; 573 u32 val = 0;
@@ -561,6 +585,9 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
561 val = apic_get_tmcct(apic); 585 val = apic_get_tmcct(apic);
562 break; 586 break;
563 587
588 case APIC_TASKPRI:
589 report_tpr_access(apic, false);
590 /* fall thru */
564 default: 591 default:
565 apic_update_ppr(apic); 592 apic_update_ppr(apic);
566 val = apic_get_reg(apic, offset); 593 val = apic_get_reg(apic, offset);
@@ -670,6 +697,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
670 break; 697 break;
671 698
672 case APIC_TASKPRI: 699 case APIC_TASKPRI:
700 report_tpr_access(apic, true);
673 apic_set_tpr(apic, val & 0xff); 701 apic_set_tpr(apic, val & 0xff);
674 break; 702 break;
675 703
@@ -762,19 +790,17 @@ static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr)
762 return ret; 790 return ret;
763} 791}
764 792
765void kvm_free_apic(struct kvm_lapic *apic) 793void kvm_free_lapic(struct kvm_vcpu *vcpu)
766{ 794{
767 if (!apic) 795 if (!vcpu->arch.apic)
768 return; 796 return;
769 797
770 hrtimer_cancel(&apic->timer.dev); 798 hrtimer_cancel(&vcpu->arch.apic->timer.dev);
771 799
772 if (apic->regs_page) { 800 if (vcpu->arch.apic->regs_page)
773 __free_page(apic->regs_page); 801 __free_page(vcpu->arch.apic->regs_page);
774 apic->regs_page = 0;
775 }
776 802
777 kfree(apic); 803 kfree(vcpu->arch.apic);
778} 804}
779 805
780/* 806/*
@@ -785,16 +811,17 @@ void kvm_free_apic(struct kvm_lapic *apic)
785 811
786void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) 812void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
787{ 813{
788 struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; 814 struct kvm_lapic *apic = vcpu->arch.apic;
789 815
790 if (!apic) 816 if (!apic)
791 return; 817 return;
792 apic_set_tpr(apic, ((cr8 & 0x0f) << 4)); 818 apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
819 | (apic_get_reg(apic, APIC_TASKPRI) & 4));
793} 820}
794 821
795u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) 822u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
796{ 823{
797 struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; 824 struct kvm_lapic *apic = vcpu->arch.apic;
798 u64 tpr; 825 u64 tpr;
799 826
800 if (!apic) 827 if (!apic)
@@ -807,29 +834,29 @@ EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8);
807 834
808void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) 835void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
809{ 836{
810 struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; 837 struct kvm_lapic *apic = vcpu->arch.apic;
811 838
812 if (!apic) { 839 if (!apic) {
813 value |= MSR_IA32_APICBASE_BSP; 840 value |= MSR_IA32_APICBASE_BSP;
814 vcpu->apic_base = value; 841 vcpu->arch.apic_base = value;
815 return; 842 return;
816 } 843 }
817 if (apic->vcpu->vcpu_id) 844 if (apic->vcpu->vcpu_id)
818 value &= ~MSR_IA32_APICBASE_BSP; 845 value &= ~MSR_IA32_APICBASE_BSP;
819 846
820 vcpu->apic_base = value; 847 vcpu->arch.apic_base = value;
821 apic->base_address = apic->vcpu->apic_base & 848 apic->base_address = apic->vcpu->arch.apic_base &
822 MSR_IA32_APICBASE_BASE; 849 MSR_IA32_APICBASE_BASE;
823 850
824 /* with FSB delivery interrupt, we can restart APIC functionality */ 851 /* with FSB delivery interrupt, we can restart APIC functionality */
825 apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is " 852 apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is "
826 "0x%lx.\n", apic->apic_base, apic->base_address); 853 "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address);
827 854
828} 855}
829 856
830u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu) 857u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu)
831{ 858{
832 return vcpu->apic_base; 859 return vcpu->arch.apic_base;
833} 860}
834EXPORT_SYMBOL_GPL(kvm_lapic_get_base); 861EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
835 862
@@ -841,7 +868,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
841 apic_debug("%s\n", __FUNCTION__); 868 apic_debug("%s\n", __FUNCTION__);
842 869
843 ASSERT(vcpu); 870 ASSERT(vcpu);
844 apic = vcpu->apic; 871 apic = vcpu->arch.apic;
845 ASSERT(apic != NULL); 872 ASSERT(apic != NULL);
846 873
847 /* Stop the timer in case it's a reset to an active apic */ 874 /* Stop the timer in case it's a reset to an active apic */
@@ -872,19 +899,19 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
872 update_divide_count(apic); 899 update_divide_count(apic);
873 atomic_set(&apic->timer.pending, 0); 900 atomic_set(&apic->timer.pending, 0);
874 if (vcpu->vcpu_id == 0) 901 if (vcpu->vcpu_id == 0)
875 vcpu->apic_base |= MSR_IA32_APICBASE_BSP; 902 vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
876 apic_update_ppr(apic); 903 apic_update_ppr(apic);
877 904
878 apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" 905 apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
879 "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__, 906 "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__,
880 vcpu, kvm_apic_id(apic), 907 vcpu, kvm_apic_id(apic),
881 vcpu->apic_base, apic->base_address); 908 vcpu->arch.apic_base, apic->base_address);
882} 909}
883EXPORT_SYMBOL_GPL(kvm_lapic_reset); 910EXPORT_SYMBOL_GPL(kvm_lapic_reset);
884 911
885int kvm_lapic_enabled(struct kvm_vcpu *vcpu) 912int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
886{ 913{
887 struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; 914 struct kvm_lapic *apic = vcpu->arch.apic;
888 int ret = 0; 915 int ret = 0;
889 916
890 if (!apic) 917 if (!apic)
@@ -908,9 +935,8 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
908 wait_queue_head_t *q = &apic->vcpu->wq; 935 wait_queue_head_t *q = &apic->vcpu->wq;
909 936
910 atomic_inc(&apic->timer.pending); 937 atomic_inc(&apic->timer.pending);
911 if (waitqueue_active(q)) 938 if (waitqueue_active(q)) {
912 { 939 apic->vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
913 apic->vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
914 wake_up_interruptible(q); 940 wake_up_interruptible(q);
915 } 941 }
916 if (apic_lvtt_period(apic)) { 942 if (apic_lvtt_period(apic)) {
@@ -956,13 +982,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
956 if (!apic) 982 if (!apic)
957 goto nomem; 983 goto nomem;
958 984
959 vcpu->apic = apic; 985 vcpu->arch.apic = apic;
960 986
961 apic->regs_page = alloc_page(GFP_KERNEL); 987 apic->regs_page = alloc_page(GFP_KERNEL);
962 if (apic->regs_page == NULL) { 988 if (apic->regs_page == NULL) {
963 printk(KERN_ERR "malloc apic regs error for vcpu %x\n", 989 printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
964 vcpu->vcpu_id); 990 vcpu->vcpu_id);
965 goto nomem; 991 goto nomem_free_apic;
966 } 992 }
967 apic->regs = page_address(apic->regs_page); 993 apic->regs = page_address(apic->regs_page);
968 memset(apic->regs, 0, PAGE_SIZE); 994 memset(apic->regs, 0, PAGE_SIZE);
@@ -971,7 +997,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
971 hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 997 hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
972 apic->timer.dev.function = apic_timer_fn; 998 apic->timer.dev.function = apic_timer_fn;
973 apic->base_address = APIC_DEFAULT_PHYS_BASE; 999 apic->base_address = APIC_DEFAULT_PHYS_BASE;
974 vcpu->apic_base = APIC_DEFAULT_PHYS_BASE; 1000 vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
975 1001
976 kvm_lapic_reset(vcpu); 1002 kvm_lapic_reset(vcpu);
977 apic->dev.read = apic_mmio_read; 1003 apic->dev.read = apic_mmio_read;
@@ -980,15 +1006,16 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
980 apic->dev.private = apic; 1006 apic->dev.private = apic;
981 1007
982 return 0; 1008 return 0;
1009nomem_free_apic:
1010 kfree(apic);
983nomem: 1011nomem:
984 kvm_free_apic(apic);
985 return -ENOMEM; 1012 return -ENOMEM;
986} 1013}
987EXPORT_SYMBOL_GPL(kvm_create_lapic); 1014EXPORT_SYMBOL_GPL(kvm_create_lapic);
988 1015
989int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) 1016int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
990{ 1017{
991 struct kvm_lapic *apic = vcpu->apic; 1018 struct kvm_lapic *apic = vcpu->arch.apic;
992 int highest_irr; 1019 int highest_irr;
993 1020
994 if (!apic || !apic_enabled(apic)) 1021 if (!apic || !apic_enabled(apic))
@@ -1004,11 +1031,11 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
1004 1031
1005int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) 1032int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
1006{ 1033{
1007 u32 lvt0 = apic_get_reg(vcpu->apic, APIC_LVT0); 1034 u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
1008 int r = 0; 1035 int r = 0;
1009 1036
1010 if (vcpu->vcpu_id == 0) { 1037 if (vcpu->vcpu_id == 0) {
1011 if (!apic_hw_enabled(vcpu->apic)) 1038 if (!apic_hw_enabled(vcpu->arch.apic))
1012 r = 1; 1039 r = 1;
1013 if ((lvt0 & APIC_LVT_MASKED) == 0 && 1040 if ((lvt0 & APIC_LVT_MASKED) == 0 &&
1014 GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) 1041 GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
@@ -1019,7 +1046,7 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
1019 1046
1020void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) 1047void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
1021{ 1048{
1022 struct kvm_lapic *apic = vcpu->apic; 1049 struct kvm_lapic *apic = vcpu->arch.apic;
1023 1050
1024 if (apic && apic_lvt_enabled(apic, APIC_LVTT) && 1051 if (apic && apic_lvt_enabled(apic, APIC_LVTT) &&
1025 atomic_read(&apic->timer.pending) > 0) { 1052 atomic_read(&apic->timer.pending) > 0) {
@@ -1030,7 +1057,7 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
1030 1057
1031void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec) 1058void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
1032{ 1059{
1033 struct kvm_lapic *apic = vcpu->apic; 1060 struct kvm_lapic *apic = vcpu->arch.apic;
1034 1061
1035 if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec) 1062 if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec)
1036 apic->timer.last_update = ktime_add_ns( 1063 apic->timer.last_update = ktime_add_ns(
@@ -1041,7 +1068,7 @@ void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
1041int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) 1068int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
1042{ 1069{
1043 int vector = kvm_apic_has_interrupt(vcpu); 1070 int vector = kvm_apic_has_interrupt(vcpu);
1044 struct kvm_lapic *apic = vcpu->apic; 1071 struct kvm_lapic *apic = vcpu->arch.apic;
1045 1072
1046 if (vector == -1) 1073 if (vector == -1)
1047 return -1; 1074 return -1;
@@ -1054,9 +1081,9 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
1054 1081
1055void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) 1082void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1056{ 1083{
1057 struct kvm_lapic *apic = vcpu->apic; 1084 struct kvm_lapic *apic = vcpu->arch.apic;
1058 1085
1059 apic->base_address = vcpu->apic_base & 1086 apic->base_address = vcpu->arch.apic_base &
1060 MSR_IA32_APICBASE_BASE; 1087 MSR_IA32_APICBASE_BASE;
1061 apic_set_reg(apic, APIC_LVR, APIC_VERSION); 1088 apic_set_reg(apic, APIC_LVR, APIC_VERSION);
1062 apic_update_ppr(apic); 1089 apic_update_ppr(apic);
@@ -1065,9 +1092,9 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1065 start_apic_timer(apic); 1092 start_apic_timer(apic);
1066} 1093}
1067 1094
1068void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) 1095void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
1069{ 1096{
1070 struct kvm_lapic *apic = vcpu->apic; 1097 struct kvm_lapic *apic = vcpu->arch.apic;
1071 struct hrtimer *timer; 1098 struct hrtimer *timer;
1072 1099
1073 if (!apic) 1100 if (!apic)
@@ -1077,4 +1104,51 @@ void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
1077 if (hrtimer_cancel(timer)) 1104 if (hrtimer_cancel(timer))
1078 hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS); 1105 hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
1079} 1106}
1080EXPORT_SYMBOL_GPL(kvm_migrate_apic_timer); 1107
1108void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
1109{
1110 u32 data;
1111 void *vapic;
1112
1113 if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
1114 return;
1115
1116 vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
1117 data = *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr));
1118 kunmap_atomic(vapic, KM_USER0);
1119
1120 apic_set_tpr(vcpu->arch.apic, data & 0xff);
1121}
1122
1123void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
1124{
1125 u32 data, tpr;
1126 int max_irr, max_isr;
1127 struct kvm_lapic *apic;
1128 void *vapic;
1129
1130 if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
1131 return;
1132
1133 apic = vcpu->arch.apic;
1134 tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff;
1135 max_irr = apic_find_highest_irr(apic);
1136 if (max_irr < 0)
1137 max_irr = 0;
1138 max_isr = apic_find_highest_isr(apic);
1139 if (max_isr < 0)
1140 max_isr = 0;
1141 data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);
1142
1143 vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
1144 *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)) = data;
1145 kunmap_atomic(vapic, KM_USER0);
1146}
1147
1148void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
1149{
1150 if (!irqchip_in_kernel(vcpu->kvm))
1151 return;
1152
1153 vcpu->arch.apic->vapic_addr = vapic_addr;
1154}
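The kvm_lapic_sync_to_vapic()/kvm_lapic_sync_from_vapic() pair added above mirrors APIC priority state through a guest-visible "vapic" page: one 32-bit word holds the TPR, the top nibble of the highest in-service vector, and the highest pending vector. A small standalone sketch of that packing, with illustrative helper names (the layout is taken directly from the expression in the hunk):

	/* bits  0..7  : TPR (task priority)
	 * bits  8..15 : highest in-service vector, high nibble only
	 * bits 24..31 : highest pending (IRR) vector
	 */
	static inline u32 vapic_pack(u32 tpr, int max_isr, int max_irr)
	{
		return (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);
	}

	static inline u32 vapic_tpr(u32 data)
	{
		return data & 0xff;	/* sync_from_vapic() consumes only this byte */
	}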
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
new file mode 100644
index 000000000000..676c396c9cee
--- /dev/null
+++ b/arch/x86/kvm/lapic.h
@@ -0,0 +1,50 @@
1#ifndef __KVM_X86_LAPIC_H
2#define __KVM_X86_LAPIC_H
3
4#include "iodev.h"
5
6#include <linux/kvm_host.h>
7
8struct kvm_lapic {
9 unsigned long base_address;
10 struct kvm_io_device dev;
11 struct {
12 atomic_t pending;
13 s64 period; /* unit: ns */
14 u32 divide_count;
15 ktime_t last_update;
16 struct hrtimer dev;
17 } timer;
18 struct kvm_vcpu *vcpu;
19 struct page *regs_page;
20 void *regs;
21 gpa_t vapic_addr;
22 struct page *vapic_page;
23};
24int kvm_create_lapic(struct kvm_vcpu *vcpu);
25void kvm_free_lapic(struct kvm_vcpu *vcpu);
26
27int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
28int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
29int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
30void kvm_lapic_reset(struct kvm_vcpu *vcpu);
31u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
32void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
33void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
34
35int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
36int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
37int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig);
38
39u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
40void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
41void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
42int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
43int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
44void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
45
46void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
47void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
48void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
49
50#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
new file mode 100644
index 000000000000..8efdcdbebb03
--- /dev/null
+++ b/arch/x86/kvm/mmu.c
@@ -0,0 +1,1885 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * MMU support
8 *
9 * Copyright (C) 2006 Qumranet, Inc.
10 *
11 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com>
13 * Avi Kivity <avi@qumranet.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
17 *
18 */
19
20#include "vmx.h"
21#include "mmu.h"
22
23#include <linux/kvm_host.h>
24#include <linux/types.h>
25#include <linux/string.h>
26#include <linux/mm.h>
27#include <linux/highmem.h>
28#include <linux/module.h>
29#include <linux/swap.h>
30
31#include <asm/page.h>
32#include <asm/cmpxchg.h>
33#include <asm/io.h>
34
35#undef MMU_DEBUG
36
37#undef AUDIT
38
39#ifdef AUDIT
40static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
41#else
42static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
43#endif
44
45#ifdef MMU_DEBUG
46
47#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
48#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
49
50#else
51
52#define pgprintk(x...) do { } while (0)
53#define rmap_printk(x...) do { } while (0)
54
55#endif
56
57#if defined(MMU_DEBUG) || defined(AUDIT)
58static int dbg = 1;
59#endif
60
61#ifndef MMU_DEBUG
62#define ASSERT(x) do { } while (0)
63#else
64#define ASSERT(x) \
65 if (!(x)) { \
66 printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
67 __FILE__, __LINE__, #x); \
68 }
69#endif
70
71#define PT64_PT_BITS 9
72#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
73#define PT32_PT_BITS 10
74#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
75
76#define PT_WRITABLE_SHIFT 1
77
78#define PT_PRESENT_MASK (1ULL << 0)
79#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
80#define PT_USER_MASK (1ULL << 2)
81#define PT_PWT_MASK (1ULL << 3)
82#define PT_PCD_MASK (1ULL << 4)
83#define PT_ACCESSED_MASK (1ULL << 5)
84#define PT_DIRTY_MASK (1ULL << 6)
85#define PT_PAGE_SIZE_MASK (1ULL << 7)
86#define PT_PAT_MASK (1ULL << 7)
87#define PT_GLOBAL_MASK (1ULL << 8)
88#define PT64_NX_SHIFT 63
89#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)
90
91#define PT_PAT_SHIFT 7
92#define PT_DIR_PAT_SHIFT 12
93#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
94
95#define PT32_DIR_PSE36_SIZE 4
96#define PT32_DIR_PSE36_SHIFT 13
97#define PT32_DIR_PSE36_MASK \
98 (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
99
100
101#define PT_FIRST_AVAIL_BITS_SHIFT 9
102#define PT64_SECOND_AVAIL_BITS_SHIFT 52
103
104#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
105
106#define VALID_PAGE(x) ((x) != INVALID_PAGE)
107
108#define PT64_LEVEL_BITS 9
109
110#define PT64_LEVEL_SHIFT(level) \
111 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
112
113#define PT64_LEVEL_MASK(level) \
114 (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
115
116#define PT64_INDEX(address, level)\
117 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
118
119
120#define PT32_LEVEL_BITS 10
121
122#define PT32_LEVEL_SHIFT(level) \
123 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
124
125#define PT32_LEVEL_MASK(level) \
126 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
127
128#define PT32_INDEX(address, level)\
129 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
130
131
132#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
133#define PT64_DIR_BASE_ADDR_MASK \
134 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
135
136#define PT32_BASE_ADDR_MASK PAGE_MASK
137#define PT32_DIR_BASE_ADDR_MASK \
138 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
139
140#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
141 | PT64_NX_MASK)
142
143#define PFERR_PRESENT_MASK (1U << 0)
144#define PFERR_WRITE_MASK (1U << 1)
145#define PFERR_USER_MASK (1U << 2)
146#define PFERR_FETCH_MASK (1U << 4)
147
148#define PT64_ROOT_LEVEL 4
149#define PT32_ROOT_LEVEL 2
150#define PT32E_ROOT_LEVEL 3
151
152#define PT_DIRECTORY_LEVEL 2
153#define PT_PAGE_TABLE_LEVEL 1
154
155#define RMAP_EXT 4
156
157#define ACC_EXEC_MASK 1
158#define ACC_WRITE_MASK PT_WRITABLE_MASK
159#define ACC_USER_MASK PT_USER_MASK
160#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
161
162struct kvm_rmap_desc {
163 u64 *shadow_ptes[RMAP_EXT];
164 struct kvm_rmap_desc *more;
165};
166
167static struct kmem_cache *pte_chain_cache;
168static struct kmem_cache *rmap_desc_cache;
169static struct kmem_cache *mmu_page_header_cache;
170
171static u64 __read_mostly shadow_trap_nonpresent_pte;
172static u64 __read_mostly shadow_notrap_nonpresent_pte;
173
174void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
175{
176 shadow_trap_nonpresent_pte = trap_pte;
177 shadow_notrap_nonpresent_pte = notrap_pte;
178}
179EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
180
181static int is_write_protection(struct kvm_vcpu *vcpu)
182{
183 return vcpu->arch.cr0 & X86_CR0_WP;
184}
185
186static int is_cpuid_PSE36(void)
187{
188 return 1;
189}
190
191static int is_nx(struct kvm_vcpu *vcpu)
192{
193 return vcpu->arch.shadow_efer & EFER_NX;
194}
195
196static int is_present_pte(unsigned long pte)
197{
198 return pte & PT_PRESENT_MASK;
199}
200
201static int is_shadow_present_pte(u64 pte)
202{
203 pte &= ~PT_SHADOW_IO_MARK;
204 return pte != shadow_trap_nonpresent_pte
205 && pte != shadow_notrap_nonpresent_pte;
206}
207
208static int is_writeble_pte(unsigned long pte)
209{
210 return pte & PT_WRITABLE_MASK;
211}
212
213static int is_dirty_pte(unsigned long pte)
214{
215 return pte & PT_DIRTY_MASK;
216}
217
218static int is_io_pte(unsigned long pte)
219{
220 return pte & PT_SHADOW_IO_MARK;
221}
222
223static int is_rmap_pte(u64 pte)
224{
225 return pte != shadow_trap_nonpresent_pte
226 && pte != shadow_notrap_nonpresent_pte;
227}
228
229static gfn_t pse36_gfn_delta(u32 gpte)
230{
231 int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
232
233 return (gpte & PT32_DIR_PSE36_MASK) << shift;
234}
235
236static void set_shadow_pte(u64 *sptep, u64 spte)
237{
238#ifdef CONFIG_X86_64
239 set_64bit((unsigned long *)sptep, spte);
240#else
241 set_64bit((unsigned long long *)sptep, spte);
242#endif
243}
244
245static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
246 struct kmem_cache *base_cache, int min)
247{
248 void *obj;
249
250 if (cache->nobjs >= min)
251 return 0;
252 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
253 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
254 if (!obj)
255 return -ENOMEM;
256 cache->objects[cache->nobjs++] = obj;
257 }
258 return 0;
259}
260
261static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
262{
263 while (mc->nobjs)
264 kfree(mc->objects[--mc->nobjs]);
265}
266
267static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
268 int min)
269{
270 struct page *page;
271
272 if (cache->nobjs >= min)
273 return 0;
274 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
275 page = alloc_page(GFP_KERNEL);
276 if (!page)
277 return -ENOMEM;
278 set_page_private(page, 0);
279 cache->objects[cache->nobjs++] = page_address(page);
280 }
281 return 0;
282}
283
284static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
285{
286 while (mc->nobjs)
287 free_page((unsigned long)mc->objects[--mc->nobjs]);
288}
289
290static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
291{
292 int r;
293
294 r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
295 pte_chain_cache, 4);
296 if (r)
297 goto out;
298 r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
299 rmap_desc_cache, 1);
300 if (r)
301 goto out;
302 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
303 if (r)
304 goto out;
305 r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
306 mmu_page_header_cache, 4);
307out:
308 return r;
309}
310
311static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
312{
313 mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
314 mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
315 mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
316 mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
317}
318
319static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
320 size_t size)
321{
322 void *p;
323
324 BUG_ON(!mc->nobjs);
325 p = mc->objects[--mc->nobjs];
326 memset(p, 0, size);
327 return p;
328}
329
330static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
331{
332 return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
333 sizeof(struct kvm_pte_chain));
334}
335
336static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
337{
338 kfree(pc);
339}
340
341static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
342{
343 return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
344 sizeof(struct kvm_rmap_desc));
345}
346
347static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
348{
349 kfree(rd);
350}
351
352/*
353 * Take gfn and return the reverse mapping to it.
 354 * Note: gfn must be unaliased before this function gets called
355 */
356
357static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
358{
359 struct kvm_memory_slot *slot;
360
361 slot = gfn_to_memslot(kvm, gfn);
362 return &slot->rmap[gfn - slot->base_gfn];
363}
364
365/*
366 * Reverse mapping data structures:
367 *
 368 * If rmapp bit zero is zero, then rmapp points to the shadow page table entry
 369 * that points to page_address(page).
 370 *
 371 * If rmapp bit zero is one, then (rmapp & ~1) points to a struct kvm_rmap_desc
372 * containing more mappings.
373 */
374static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
375{
376 struct kvm_mmu_page *sp;
377 struct kvm_rmap_desc *desc;
378 unsigned long *rmapp;
379 int i;
380
381 if (!is_rmap_pte(*spte))
382 return;
383 gfn = unalias_gfn(vcpu->kvm, gfn);
384 sp = page_header(__pa(spte));
385 sp->gfns[spte - sp->spt] = gfn;
386 rmapp = gfn_to_rmap(vcpu->kvm, gfn);
387 if (!*rmapp) {
388 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
389 *rmapp = (unsigned long)spte;
390 } else if (!(*rmapp & 1)) {
391 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
392 desc = mmu_alloc_rmap_desc(vcpu);
393 desc->shadow_ptes[0] = (u64 *)*rmapp;
394 desc->shadow_ptes[1] = spte;
395 *rmapp = (unsigned long)desc | 1;
396 } else {
397 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
398 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
399 while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
400 desc = desc->more;
401 if (desc->shadow_ptes[RMAP_EXT-1]) {
402 desc->more = mmu_alloc_rmap_desc(vcpu);
403 desc = desc->more;
404 }
405 for (i = 0; desc->shadow_ptes[i]; ++i)
406 ;
407 desc->shadow_ptes[i] = spte;
408 }
409}
410
411static void rmap_desc_remove_entry(unsigned long *rmapp,
412 struct kvm_rmap_desc *desc,
413 int i,
414 struct kvm_rmap_desc *prev_desc)
415{
416 int j;
417
418 for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
419 ;
420 desc->shadow_ptes[i] = desc->shadow_ptes[j];
421 desc->shadow_ptes[j] = NULL;
422 if (j != 0)
423 return;
424 if (!prev_desc && !desc->more)
425 *rmapp = (unsigned long)desc->shadow_ptes[0];
426 else
427 if (prev_desc)
428 prev_desc->more = desc->more;
429 else
430 *rmapp = (unsigned long)desc->more | 1;
431 mmu_free_rmap_desc(desc);
432}
433
434static void rmap_remove(struct kvm *kvm, u64 *spte)
435{
436 struct kvm_rmap_desc *desc;
437 struct kvm_rmap_desc *prev_desc;
438 struct kvm_mmu_page *sp;
439 struct page *page;
440 unsigned long *rmapp;
441 int i;
442
443 if (!is_rmap_pte(*spte))
444 return;
445 sp = page_header(__pa(spte));
446 page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
447 mark_page_accessed(page);
448 if (is_writeble_pte(*spte))
449 kvm_release_page_dirty(page);
450 else
451 kvm_release_page_clean(page);
452 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
453 if (!*rmapp) {
454 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
455 BUG();
456 } else if (!(*rmapp & 1)) {
457 rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
458 if ((u64 *)*rmapp != spte) {
459 printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
460 spte, *spte);
461 BUG();
462 }
463 *rmapp = 0;
464 } else {
465 rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
466 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
467 prev_desc = NULL;
468 while (desc) {
469 for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
470 if (desc->shadow_ptes[i] == spte) {
471 rmap_desc_remove_entry(rmapp,
472 desc, i,
473 prev_desc);
474 return;
475 }
476 prev_desc = desc;
477 desc = desc->more;
478 }
479 BUG();
480 }
481}
482
483static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
484{
485 struct kvm_rmap_desc *desc;
486 struct kvm_rmap_desc *prev_desc;
487 u64 *prev_spte;
488 int i;
489
490 if (!*rmapp)
491 return NULL;
492 else if (!(*rmapp & 1)) {
493 if (!spte)
494 return (u64 *)*rmapp;
495 return NULL;
496 }
497 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
498 prev_desc = NULL;
499 prev_spte = NULL;
500 while (desc) {
501 for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) {
502 if (prev_spte == spte)
503 return desc->shadow_ptes[i];
504 prev_spte = desc->shadow_ptes[i];
505 }
506 desc = desc->more;
507 }
508 return NULL;
509}
510
511static void rmap_write_protect(struct kvm *kvm, u64 gfn)
512{
513 unsigned long *rmapp;
514 u64 *spte;
515 int write_protected = 0;
516
517 gfn = unalias_gfn(kvm, gfn);
518 rmapp = gfn_to_rmap(kvm, gfn);
519
520 spte = rmap_next(kvm, rmapp, NULL);
521 while (spte) {
522 BUG_ON(!spte);
523 BUG_ON(!(*spte & PT_PRESENT_MASK));
524 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
525 if (is_writeble_pte(*spte)) {
526 set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
527 write_protected = 1;
528 }
529 spte = rmap_next(kvm, rmapp, spte);
530 }
531 if (write_protected)
532 kvm_flush_remote_tlbs(kvm);
533}
534
535#ifdef MMU_DEBUG
536static int is_empty_shadow_page(u64 *spt)
537{
538 u64 *pos;
539 u64 *end;
540
541 for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
542 if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) {
543 printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
544 pos, *pos);
545 return 0;
546 }
547 return 1;
548}
549#endif
550
551static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
552{
553 ASSERT(is_empty_shadow_page(sp->spt));
554 list_del(&sp->link);
555 __free_page(virt_to_page(sp->spt));
556 __free_page(virt_to_page(sp->gfns));
557 kfree(sp);
558 ++kvm->arch.n_free_mmu_pages;
559}
560
561static unsigned kvm_page_table_hashfn(gfn_t gfn)
562{
563 return gfn;
564}
565
566static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
567 u64 *parent_pte)
568{
569 struct kvm_mmu_page *sp;
570
571 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
572 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
573 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
574 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
575 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
576 ASSERT(is_empty_shadow_page(sp->spt));
577 sp->slot_bitmap = 0;
578 sp->multimapped = 0;
579 sp->parent_pte = parent_pte;
580 --vcpu->kvm->arch.n_free_mmu_pages;
581 return sp;
582}
583
584static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
585 struct kvm_mmu_page *sp, u64 *parent_pte)
586{
587 struct kvm_pte_chain *pte_chain;
588 struct hlist_node *node;
589 int i;
590
591 if (!parent_pte)
592 return;
593 if (!sp->multimapped) {
594 u64 *old = sp->parent_pte;
595
596 if (!old) {
597 sp->parent_pte = parent_pte;
598 return;
599 }
600 sp->multimapped = 1;
601 pte_chain = mmu_alloc_pte_chain(vcpu);
602 INIT_HLIST_HEAD(&sp->parent_ptes);
603 hlist_add_head(&pte_chain->link, &sp->parent_ptes);
604 pte_chain->parent_ptes[0] = old;
605 }
606 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
607 if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
608 continue;
609 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
610 if (!pte_chain->parent_ptes[i]) {
611 pte_chain->parent_ptes[i] = parent_pte;
612 return;
613 }
614 }
615 pte_chain = mmu_alloc_pte_chain(vcpu);
616 BUG_ON(!pte_chain);
617 hlist_add_head(&pte_chain->link, &sp->parent_ptes);
618 pte_chain->parent_ptes[0] = parent_pte;
619}
620
621static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
622 u64 *parent_pte)
623{
624 struct kvm_pte_chain *pte_chain;
625 struct hlist_node *node;
626 int i;
627
628 if (!sp->multimapped) {
629 BUG_ON(sp->parent_pte != parent_pte);
630 sp->parent_pte = NULL;
631 return;
632 }
633 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
634 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
635 if (!pte_chain->parent_ptes[i])
636 break;
637 if (pte_chain->parent_ptes[i] != parent_pte)
638 continue;
639 while (i + 1 < NR_PTE_CHAIN_ENTRIES
640 && pte_chain->parent_ptes[i + 1]) {
641 pte_chain->parent_ptes[i]
642 = pte_chain->parent_ptes[i + 1];
643 ++i;
644 }
645 pte_chain->parent_ptes[i] = NULL;
646 if (i == 0) {
647 hlist_del(&pte_chain->link);
648 mmu_free_pte_chain(pte_chain);
649 if (hlist_empty(&sp->parent_ptes)) {
650 sp->multimapped = 0;
651 sp->parent_pte = NULL;
652 }
653 }
654 return;
655 }
656 BUG();
657}
658
659static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
660{
661 unsigned index;
662 struct hlist_head *bucket;
663 struct kvm_mmu_page *sp;
664 struct hlist_node *node;
665
666 pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
667 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
668 bucket = &kvm->arch.mmu_page_hash[index];
669 hlist_for_each_entry(sp, node, bucket, hash_link)
670 if (sp->gfn == gfn && !sp->role.metaphysical) {
671 pgprintk("%s: found role %x\n",
672 __FUNCTION__, sp->role.word);
673 return sp;
674 }
675 return NULL;
676}
677
678static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
679 gfn_t gfn,
680 gva_t gaddr,
681 unsigned level,
682 int metaphysical,
683 unsigned access,
684 u64 *parent_pte,
685 bool *new_page)
686{
687 union kvm_mmu_page_role role;
688 unsigned index;
689 unsigned quadrant;
690 struct hlist_head *bucket;
691 struct kvm_mmu_page *sp;
692 struct hlist_node *node;
693
694 role.word = 0;
695 role.glevels = vcpu->arch.mmu.root_level;
696 role.level = level;
697 role.metaphysical = metaphysical;
698 role.access = access;
699 if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
700 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
701 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
702 role.quadrant = quadrant;
703 }
704 pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
705 gfn, role.word);
706 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
707 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
708 hlist_for_each_entry(sp, node, bucket, hash_link)
709 if (sp->gfn == gfn && sp->role.word == role.word) {
710 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
711 pgprintk("%s: found\n", __FUNCTION__);
712 return sp;
713 }
714 ++vcpu->kvm->stat.mmu_cache_miss;
715 sp = kvm_mmu_alloc_page(vcpu, parent_pte);
716 if (!sp)
717 return sp;
718 pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
719 sp->gfn = gfn;
720 sp->role = role;
721 hlist_add_head(&sp->hash_link, bucket);
722 vcpu->arch.mmu.prefetch_page(vcpu, sp);
723 if (!metaphysical)
724 rmap_write_protect(vcpu->kvm, gfn);
725 if (new_page)
726 *new_page = 1;
727 return sp;
728}
729
730static void kvm_mmu_page_unlink_children(struct kvm *kvm,
731 struct kvm_mmu_page *sp)
732{
733 unsigned i;
734 u64 *pt;
735 u64 ent;
736
737 pt = sp->spt;
738
739 if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
740 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
741 if (is_shadow_present_pte(pt[i]))
742 rmap_remove(kvm, &pt[i]);
743 pt[i] = shadow_trap_nonpresent_pte;
744 }
745 kvm_flush_remote_tlbs(kvm);
746 return;
747 }
748
749 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
750 ent = pt[i];
751
752 pt[i] = shadow_trap_nonpresent_pte;
753 if (!is_shadow_present_pte(ent))
754 continue;
755 ent &= PT64_BASE_ADDR_MASK;
756 mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
757 }
758 kvm_flush_remote_tlbs(kvm);
759}
760
761static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
762{
763 mmu_page_remove_parent_pte(sp, parent_pte);
764}
765
766static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
767{
768 int i;
769
770 for (i = 0; i < KVM_MAX_VCPUS; ++i)
771 if (kvm->vcpus[i])
772 kvm->vcpus[i]->arch.last_pte_updated = NULL;
773}
774
775static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
776{
777 u64 *parent_pte;
778
779 ++kvm->stat.mmu_shadow_zapped;
780 while (sp->multimapped || sp->parent_pte) {
781 if (!sp->multimapped)
782 parent_pte = sp->parent_pte;
783 else {
784 struct kvm_pte_chain *chain;
785
786 chain = container_of(sp->parent_ptes.first,
787 struct kvm_pte_chain, link);
788 parent_pte = chain->parent_ptes[0];
789 }
790 BUG_ON(!parent_pte);
791 kvm_mmu_put_page(sp, parent_pte);
792 set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
793 }
794 kvm_mmu_page_unlink_children(kvm, sp);
795 if (!sp->root_count) {
796 hlist_del(&sp->hash_link);
797 kvm_mmu_free_page(kvm, sp);
798 } else
799 list_move(&sp->link, &kvm->arch.active_mmu_pages);
800 kvm_mmu_reset_last_pte_updated(kvm);
801}
802
803/*
804 * Changing the number of mmu pages allocated to the vm
805 * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock
806 */
807void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
808{
809 /*
810 * If we set the number of mmu pages to be smaller than the
811 * number of active pages, we must free some mmu pages before we
812 * change the value.
813 */
814
815 if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) >
816 kvm_nr_mmu_pages) {
817 int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages
818 - kvm->arch.n_free_mmu_pages;
819
820 while (n_used_mmu_pages > kvm_nr_mmu_pages) {
821 struct kvm_mmu_page *page;
822
823 page = container_of(kvm->arch.active_mmu_pages.prev,
824 struct kvm_mmu_page, link);
825 kvm_mmu_zap_page(kvm, page);
826 n_used_mmu_pages--;
827 }
828 kvm->arch.n_free_mmu_pages = 0;
829 }
830 else
831 kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
832 - kvm->arch.n_alloc_mmu_pages;
833
834 kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
835}
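/*
 * A made-up example of the accounting above: with n_alloc_mmu_pages = 64
 * and n_free_mmu_pages = 10, 54 pages are in use; shrinking to
 * kvm_nr_mmu_pages = 40 zaps 14 pages from the tail of active_mmu_pages
 * and leaves n_free_mmu_pages at 0, while growing to 80 simply adds
 * 80 - 64 = 16 to n_free_mmu_pages.
 */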
836
837static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
838{
839 unsigned index;
840 struct hlist_head *bucket;
841 struct kvm_mmu_page *sp;
842 struct hlist_node *node, *n;
843 int r;
844
845 pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
846 r = 0;
847 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
848 bucket = &kvm->arch.mmu_page_hash[index];
849 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
850 if (sp->gfn == gfn && !sp->role.metaphysical) {
851 pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
852 sp->role.word);
853 kvm_mmu_zap_page(kvm, sp);
854 r = 1;
855 }
856 return r;
857}
858
859static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
860{
861 struct kvm_mmu_page *sp;
862
863 while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) {
864 pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word);
865 kvm_mmu_zap_page(kvm, sp);
866 }
867}
868
869static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
870{
871 int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
872 struct kvm_mmu_page *sp = page_header(__pa(pte));
873
874 __set_bit(slot, &sp->slot_bitmap);
875}
876
877struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
878{
879 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
880
881 if (gpa == UNMAPPED_GVA)
882 return NULL;
883 return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
884}
885
886static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
887 unsigned pt_access, unsigned pte_access,
888 int user_fault, int write_fault, int dirty,
889 int *ptwrite, gfn_t gfn, struct page *page)
890{
891 u64 spte;
892 int was_rmapped = is_rmap_pte(*shadow_pte);
893 int was_writeble = is_writeble_pte(*shadow_pte);
894
895 pgprintk("%s: spte %llx access %x write_fault %d"
896 " user_fault %d gfn %lx\n",
897 __FUNCTION__, *shadow_pte, pt_access,
898 write_fault, user_fault, gfn);
899
900 /*
901 * We don't set the accessed bit, since we sometimes want to see
902 * whether the guest actually used the pte (in order to detect
903 * demand paging).
904 */
905 spte = PT_PRESENT_MASK | PT_DIRTY_MASK;
906 if (!dirty)
907 pte_access &= ~ACC_WRITE_MASK;
908 if (!(pte_access & ACC_EXEC_MASK))
909 spte |= PT64_NX_MASK;
910
911 spte |= PT_PRESENT_MASK;
912 if (pte_access & ACC_USER_MASK)
913 spte |= PT_USER_MASK;
914
915 if (is_error_page(page)) {
916 set_shadow_pte(shadow_pte,
917 shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK);
918 kvm_release_page_clean(page);
919 return;
920 }
921
922 spte |= page_to_phys(page);
923
924 if ((pte_access & ACC_WRITE_MASK)
925 || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
926 struct kvm_mmu_page *shadow;
927
928 spte |= PT_WRITABLE_MASK;
929 if (user_fault) {
930 mmu_unshadow(vcpu->kvm, gfn);
931 goto unshadowed;
932 }
933
934 shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
935 if (shadow) {
936 pgprintk("%s: found shadow page for %lx, marking ro\n",
937 __FUNCTION__, gfn);
938 pte_access &= ~ACC_WRITE_MASK;
939 if (is_writeble_pte(spte)) {
940 spte &= ~PT_WRITABLE_MASK;
941 kvm_x86_ops->tlb_flush(vcpu);
942 }
943 if (write_fault)
944 *ptwrite = 1;
945 }
946 }
947
948unshadowed:
949
950 if (pte_access & ACC_WRITE_MASK)
951 mark_page_dirty(vcpu->kvm, gfn);
952
953 pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
954 set_shadow_pte(shadow_pte, spte);
955 page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
956 if (!was_rmapped) {
957 rmap_add(vcpu, shadow_pte, gfn);
958 if (!is_rmap_pte(*shadow_pte))
959 kvm_release_page_clean(page);
960 } else {
961 if (was_writeble)
962 kvm_release_page_dirty(page);
963 else
964 kvm_release_page_clean(page);
965 }
966 if (!ptwrite || !*ptwrite)
967 vcpu->arch.last_pte_updated = shadow_pte;
968}
969
970static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
971{
972}
973
974static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
975 gfn_t gfn, struct page *page)
976{
977 int level = PT32E_ROOT_LEVEL;
978 hpa_t table_addr = vcpu->arch.mmu.root_hpa;
979 int pt_write = 0;
980
981 for (; ; level--) {
982 u32 index = PT64_INDEX(v, level);
983 u64 *table;
984
985 ASSERT(VALID_PAGE(table_addr));
986 table = __va(table_addr);
987
988 if (level == 1) {
989 mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
990 0, write, 1, &pt_write, gfn, page);
991 return pt_write || is_io_pte(table[index]);
992 }
993
994 if (table[index] == shadow_trap_nonpresent_pte) {
995 struct kvm_mmu_page *new_table;
996 gfn_t pseudo_gfn;
997
998 pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
999 >> PAGE_SHIFT;
1000 new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
1001 v, level - 1,
1002 1, ACC_ALL, &table[index],
1003 NULL);
1004 if (!new_table) {
1005 pgprintk("nonpaging_map: ENOMEM\n");
1006 kvm_release_page_clean(page);
1007 return -ENOMEM;
1008 }
1009
1010 table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
1011 | PT_WRITABLE_MASK | PT_USER_MASK;
1012 }
1013 table_addr = table[index] & PT64_BASE_ADDR_MASK;
1014 }
1015}
1016
1017static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1018{
1019 int r;
1020
1021 struct page *page;
1022
1023 down_read(&current->mm->mmap_sem);
1024 page = gfn_to_page(vcpu->kvm, gfn);
1025
1026 spin_lock(&vcpu->kvm->mmu_lock);
1027 kvm_mmu_free_some_pages(vcpu);
1028 r = __nonpaging_map(vcpu, v, write, gfn, page);
1029 spin_unlock(&vcpu->kvm->mmu_lock);
1030
1031 up_read(&current->mm->mmap_sem);
1032
1033 return r;
1034}
1035
1036
1037static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1038 struct kvm_mmu_page *sp)
1039{
1040 int i;
1041
1042 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1043 sp->spt[i] = shadow_trap_nonpresent_pte;
1044}
1045
1046static void mmu_free_roots(struct kvm_vcpu *vcpu)
1047{
1048 int i;
1049 struct kvm_mmu_page *sp;
1050
1051 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
1052 return;
1053 spin_lock(&vcpu->kvm->mmu_lock);
1054#ifdef CONFIG_X86_64
1055 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1056 hpa_t root = vcpu->arch.mmu.root_hpa;
1057
1058 sp = page_header(root);
1059 --sp->root_count;
1060 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1061 spin_unlock(&vcpu->kvm->mmu_lock);
1062 return;
1063 }
1064#endif
1065 for (i = 0; i < 4; ++i) {
1066 hpa_t root = vcpu->arch.mmu.pae_root[i];
1067
1068 if (root) {
1069 root &= PT64_BASE_ADDR_MASK;
1070 sp = page_header(root);
1071 --sp->root_count;
1072 }
1073 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
1074 }
1075 spin_unlock(&vcpu->kvm->mmu_lock);
1076 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1077}
1078
1079static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1080{
1081 int i;
1082 gfn_t root_gfn;
1083 struct kvm_mmu_page *sp;
1084
1085 root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
1086
1087#ifdef CONFIG_X86_64
1088 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1089 hpa_t root = vcpu->arch.mmu.root_hpa;
1090
1091 ASSERT(!VALID_PAGE(root));
1092 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
1093 PT64_ROOT_LEVEL, 0, ACC_ALL, NULL, NULL);
1094 root = __pa(sp->spt);
1095 ++sp->root_count;
1096 vcpu->arch.mmu.root_hpa = root;
1097 return;
1098 }
1099#endif
1100 for (i = 0; i < 4; ++i) {
1101 hpa_t root = vcpu->arch.mmu.pae_root[i];
1102
1103 ASSERT(!VALID_PAGE(root));
1104 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
1105 if (!is_present_pte(vcpu->arch.pdptrs[i])) {
1106 vcpu->arch.mmu.pae_root[i] = 0;
1107 continue;
1108 }
1109 root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
1110 } else if (vcpu->arch.mmu.root_level == 0)
1111 root_gfn = 0;
1112 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
1113 PT32_ROOT_LEVEL, !is_paging(vcpu),
1114 ACC_ALL, NULL, NULL);
1115 root = __pa(sp->spt);
1116 ++sp->root_count;
1117 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
1118 }
1119 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
1120}
1121
1122static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
1123{
1124 return vaddr;
1125}
1126
1127static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
1128 u32 error_code)
1129{
1130 gfn_t gfn;
1131 int r;
1132
1133 pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code);
1134 r = mmu_topup_memory_caches(vcpu);
1135 if (r)
1136 return r;
1137
1138 ASSERT(vcpu);
1139 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
1140
1141 gfn = gva >> PAGE_SHIFT;
1142
1143 return nonpaging_map(vcpu, gva & PAGE_MASK,
1144 error_code & PFERR_WRITE_MASK, gfn);
1145}
1146
1147static void nonpaging_free(struct kvm_vcpu *vcpu)
1148{
1149 mmu_free_roots(vcpu);
1150}
1151
1152static int nonpaging_init_context(struct kvm_vcpu *vcpu)
1153{
1154 struct kvm_mmu *context = &vcpu->arch.mmu;
1155
1156 context->new_cr3 = nonpaging_new_cr3;
1157 context->page_fault = nonpaging_page_fault;
1158 context->gva_to_gpa = nonpaging_gva_to_gpa;
1159 context->free = nonpaging_free;
1160 context->prefetch_page = nonpaging_prefetch_page;
1161 context->root_level = 0;
1162 context->shadow_root_level = PT32E_ROOT_LEVEL;
1163 context->root_hpa = INVALID_PAGE;
1164 return 0;
1165}
1166
1167void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
1168{
1169 ++vcpu->stat.tlb_flush;
1170 kvm_x86_ops->tlb_flush(vcpu);
1171}
1172
1173static void paging_new_cr3(struct kvm_vcpu *vcpu)
1174{
1175 pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
1176 mmu_free_roots(vcpu);
1177}
1178
1179static void inject_page_fault(struct kvm_vcpu *vcpu,
1180 u64 addr,
1181 u32 err_code)
1182{
1183 kvm_inject_page_fault(vcpu, addr, err_code);
1184}
1185
1186static void paging_free(struct kvm_vcpu *vcpu)
1187{
1188 nonpaging_free(vcpu);
1189}
1190
1191#define PTTYPE 64
1192#include "paging_tmpl.h"
1193#undef PTTYPE
1194
1195#define PTTYPE 32
1196#include "paging_tmpl.h"
1197#undef PTTYPE
1198
1199static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
1200{
1201 struct kvm_mmu *context = &vcpu->arch.mmu;
1202
1203 ASSERT(is_pae(vcpu));
1204 context->new_cr3 = paging_new_cr3;
1205 context->page_fault = paging64_page_fault;
1206 context->gva_to_gpa = paging64_gva_to_gpa;
1207 context->prefetch_page = paging64_prefetch_page;
1208 context->free = paging_free;
1209 context->root_level = level;
1210 context->shadow_root_level = level;
1211 context->root_hpa = INVALID_PAGE;
1212 return 0;
1213}
1214
1215static int paging64_init_context(struct kvm_vcpu *vcpu)
1216{
1217 return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
1218}
1219
1220static int paging32_init_context(struct kvm_vcpu *vcpu)
1221{
1222 struct kvm_mmu *context = &vcpu->arch.mmu;
1223
1224 context->new_cr3 = paging_new_cr3;
1225 context->page_fault = paging32_page_fault;
1226 context->gva_to_gpa = paging32_gva_to_gpa;
1227 context->free = paging_free;
1228 context->prefetch_page = paging32_prefetch_page;
1229 context->root_level = PT32_ROOT_LEVEL;
1230 context->shadow_root_level = PT32E_ROOT_LEVEL;
1231 context->root_hpa = INVALID_PAGE;
1232 return 0;
1233}
1234
1235static int paging32E_init_context(struct kvm_vcpu *vcpu)
1236{
1237 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
1238}
1239
1240static int init_kvm_mmu(struct kvm_vcpu *vcpu)
1241{
1242 ASSERT(vcpu);
1243 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
1244
1245 if (!is_paging(vcpu))
1246 return nonpaging_init_context(vcpu);
1247 else if (is_long_mode(vcpu))
1248 return paging64_init_context(vcpu);
1249 else if (is_pae(vcpu))
1250 return paging32E_init_context(vcpu);
1251 else
1252 return paging32_init_context(vcpu);
1253}
1254
1255static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
1256{
1257 ASSERT(vcpu);
1258 if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
1259 vcpu->arch.mmu.free(vcpu);
1260 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1261 }
1262}
1263
1264int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
1265{
1266 destroy_kvm_mmu(vcpu);
1267 return init_kvm_mmu(vcpu);
1268}
1269EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
1270
1271int kvm_mmu_load(struct kvm_vcpu *vcpu)
1272{
1273 int r;
1274
1275 r = mmu_topup_memory_caches(vcpu);
1276 if (r)
1277 goto out;
1278 spin_lock(&vcpu->kvm->mmu_lock);
1279 kvm_mmu_free_some_pages(vcpu);
1280 mmu_alloc_roots(vcpu);
1281 spin_unlock(&vcpu->kvm->mmu_lock);
1282 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
1283 kvm_mmu_flush_tlb(vcpu);
1284out:
1285 return r;
1286}
1287EXPORT_SYMBOL_GPL(kvm_mmu_load);
1288
1289void kvm_mmu_unload(struct kvm_vcpu *vcpu)
1290{
1291 mmu_free_roots(vcpu);
1292}
1293
1294static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
1295 struct kvm_mmu_page *sp,
1296 u64 *spte)
1297{
1298 u64 pte;
1299 struct kvm_mmu_page *child;
1300
1301 pte = *spte;
1302 if (is_shadow_present_pte(pte)) {
1303 if (sp->role.level == PT_PAGE_TABLE_LEVEL)
1304 rmap_remove(vcpu->kvm, spte);
1305 else {
1306 child = page_header(pte & PT64_BASE_ADDR_MASK);
1307 mmu_page_remove_parent_pte(child, spte);
1308 }
1309 }
1310 set_shadow_pte(spte, shadow_trap_nonpresent_pte);
1311}
1312
1313static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
1314 struct kvm_mmu_page *sp,
1315 u64 *spte,
1316 const void *new, int bytes,
1317 int offset_in_pte)
1318{
1319 if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
1320 ++vcpu->kvm->stat.mmu_pde_zapped;
1321 return;
1322 }
1323
1324 ++vcpu->kvm->stat.mmu_pte_updated;
1325 if (sp->role.glevels == PT32_ROOT_LEVEL)
1326 paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
1327 else
1328 paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
1329}
1330
1331static bool need_remote_flush(u64 old, u64 new)
1332{
1333 if (!is_shadow_present_pte(old))
1334 return false;
1335 if (!is_shadow_present_pte(new))
1336 return true;
1337 if ((old ^ new) & PT64_BASE_ADDR_MASK)
1338 return true;
1339 old ^= PT64_NX_MASK;
1340 new ^= PT64_NX_MASK;
1341 return (old & ~new & PT64_PERM_MASK) != 0;
1342}
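/*
 * Illustrative cases for need_remote_flush() above (not exhaustive):
 * not-present -> anything is false; present -> not-present is true;
 * changing the target frame is true; and clearing the writable bit on an
 * otherwise identical spte is true, because a permission was removed.
 * The NX bits are flipped first so that setting NX (removing execute
 * permission) is caught by the same "old had it, new does not" test.
 */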
1343
1344static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
1345{
1346 if (need_remote_flush(old, new))
1347 kvm_flush_remote_tlbs(vcpu->kvm);
1348 else
1349 kvm_mmu_flush_tlb(vcpu);
1350}
1351
1352static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
1353{
1354 u64 *spte = vcpu->arch.last_pte_updated;
1355
1356 return !!(spte && (*spte & PT_ACCESSED_MASK));
1357}
1358
1359static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1360 const u8 *new, int bytes)
1361{
1362 gfn_t gfn;
1363 int r;
1364 u64 gpte = 0;
1365
1366 if (bytes != 4 && bytes != 8)
1367 return;
1368
1369 /*
1370 * Assume that the pte write is on a page table of the same type
1371 * as the current vcpu paging mode. This is nearly always true
1372 * (might be false while changing modes). Note it is verified later
1373 * by update_pte().
1374 */
1375 if (is_pae(vcpu)) {
1376 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
1377 if ((bytes == 4) && (gpa % 4 == 0)) {
1378 r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
1379 if (r)
1380 return;
1381 memcpy((void *)&gpte + (gpa % 8), new, 4);
1382 } else if ((bytes == 8) && (gpa % 8 == 0)) {
1383 memcpy((void *)&gpte, new, 8);
1384 }
1385 } else {
1386 if ((bytes == 4) && (gpa % 4 == 0))
1387 memcpy((void *)&gpte, new, 4);
1388 }
1389 if (!is_present_pte(gpte))
1390 return;
1391 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
1392 vcpu->arch.update_pte.gfn = gfn;
1393 vcpu->arch.update_pte.page = gfn_to_page(vcpu->kvm, gfn);
1394}
1395
1396void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1397 const u8 *new, int bytes)
1398{
1399 gfn_t gfn = gpa >> PAGE_SHIFT;
1400 struct kvm_mmu_page *sp;
1401 struct hlist_node *node, *n;
1402 struct hlist_head *bucket;
1403 unsigned index;
1404 u64 entry;
1405 u64 *spte;
1406 unsigned offset = offset_in_page(gpa);
1407 unsigned pte_size;
1408 unsigned page_offset;
1409 unsigned misaligned;
1410 unsigned quadrant;
1411 int level;
1412 int flooded = 0;
1413 int npte;
1414
1415 pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
1416 mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
1417 spin_lock(&vcpu->kvm->mmu_lock);
1418 kvm_mmu_free_some_pages(vcpu);
1419 ++vcpu->kvm->stat.mmu_pte_write;
1420 kvm_mmu_audit(vcpu, "pre pte write");
1421 if (gfn == vcpu->arch.last_pt_write_gfn
1422 && !last_updated_pte_accessed(vcpu)) {
1423 ++vcpu->arch.last_pt_write_count;
1424 if (vcpu->arch.last_pt_write_count >= 3)
1425 flooded = 1;
1426 } else {
1427 vcpu->arch.last_pt_write_gfn = gfn;
1428 vcpu->arch.last_pt_write_count = 1;
1429 vcpu->arch.last_pte_updated = NULL;
1430 }
1431 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
1432 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1433 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
1434 if (sp->gfn != gfn || sp->role.metaphysical)
1435 continue;
1436 pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
1437 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
1438 misaligned |= bytes < 4;
1439 if (misaligned || flooded) {
1440 /*
1441 * Misaligned accesses are too much trouble to fix
1442 * up; also, they usually indicate a page is not used
1443 * as a page table.
1444 *
1445 * If we're seeing too many writes to a page,
1446 * it may no longer be a page table, or we may be
1447 * forking, in which case it is better to unmap the
1448 * page.
1449 */
1450 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
1451 gpa, bytes, sp->role.word);
1452 kvm_mmu_zap_page(vcpu->kvm, sp);
1453 ++vcpu->kvm->stat.mmu_flooded;
1454 continue;
1455 }
1456 page_offset = offset;
1457 level = sp->role.level;
1458 npte = 1;
1459 if (sp->role.glevels == PT32_ROOT_LEVEL) {
1460 page_offset <<= 1; /* 32->64 */
1461 /*
1462 * A 32-bit pde maps 4MB while the shadow pdes map
1463 * only 2MB. So we need to double the offset again
1464 * and zap two pdes instead of one.
1465 */
1466 if (level == PT32_ROOT_LEVEL) {
1467 page_offset &= ~7; /* kill rounding error */
1468 page_offset <<= 1;
1469 npte = 2;
1470 }
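			/*
			 * Worked example with made-up numbers: a 4-byte write
			 * at offset 0x14 hits 32-bit pde index 5; after the two
			 * doublings page_offset is 0x50, i.e. the two 8-byte
			 * shadow pdes at indexes 10 and 11 that cover the same
			 * 4MB of guest address space, hence npte = 2.
			 */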
1471 quadrant = page_offset >> PAGE_SHIFT;
1472 page_offset &= ~PAGE_MASK;
1473 if (quadrant != sp->role.quadrant)
1474 continue;
1475 }
1476 spte = &sp->spt[page_offset / sizeof(*spte)];
1477 while (npte--) {
1478 entry = *spte;
1479 mmu_pte_write_zap_pte(vcpu, sp, spte);
1480 mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes,
1481 page_offset & (pte_size - 1));
1482 mmu_pte_write_flush_tlb(vcpu, entry, *spte);
1483 ++spte;
1484 }
1485 }
1486 kvm_mmu_audit(vcpu, "post pte write");
1487 spin_unlock(&vcpu->kvm->mmu_lock);
1488 if (vcpu->arch.update_pte.page) {
1489 kvm_release_page_clean(vcpu->arch.update_pte.page);
1490 vcpu->arch.update_pte.page = NULL;
1491 }
1492}
1493
1494int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
1495{
1496 gpa_t gpa;
1497 int r;
1498
1499 down_read(&current->mm->mmap_sem);
1500 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
1501 up_read(&current->mm->mmap_sem);
1502
1503 spin_lock(&vcpu->kvm->mmu_lock);
1504 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1505 spin_unlock(&vcpu->kvm->mmu_lock);
1506 return r;
1507}
1508
1509void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
1510{
1511 while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
1512 struct kvm_mmu_page *sp;
1513
1514 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
1515 struct kvm_mmu_page, link);
1516 kvm_mmu_zap_page(vcpu->kvm, sp);
1517 ++vcpu->kvm->stat.mmu_recycled;
1518 }
1519}
1520
1521int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
1522{
1523 int r;
1524 enum emulation_result er;
1525
1526 r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
1527 if (r < 0)
1528 goto out;
1529
1530 if (!r) {
1531 r = 1;
1532 goto out;
1533 }
1534
1535 r = mmu_topup_memory_caches(vcpu);
1536 if (r)
1537 goto out;
1538
1539 er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0);
1540
1541 switch (er) {
1542 case EMULATE_DONE:
1543 return 1;
1544 case EMULATE_DO_MMIO:
1545 ++vcpu->stat.mmio_exits;
1546 return 0;
1547 case EMULATE_FAIL:
1548 kvm_report_emulation_failure(vcpu, "pagetable");
1549 return 1;
1550 default:
1551 BUG();
1552 }
1553out:
1554 return r;
1555}
1556EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
1557
1558static void free_mmu_pages(struct kvm_vcpu *vcpu)
1559{
1560 struct kvm_mmu_page *sp;
1561
1562 while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
1563 sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
1564 struct kvm_mmu_page, link);
1565 kvm_mmu_zap_page(vcpu->kvm, sp);
1566 }
1567 free_page((unsigned long)vcpu->arch.mmu.pae_root);
1568}
1569
1570static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
1571{
1572 struct page *page;
1573 int i;
1574
1575 ASSERT(vcpu);
1576
1577 if (vcpu->kvm->arch.n_requested_mmu_pages)
1578 vcpu->kvm->arch.n_free_mmu_pages =
1579 vcpu->kvm->arch.n_requested_mmu_pages;
1580 else
1581 vcpu->kvm->arch.n_free_mmu_pages =
1582 vcpu->kvm->arch.n_alloc_mmu_pages;
1583 /*
1584 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
1585 * Therefore we need to allocate shadow page tables in the first
1586 * 4GB of memory, which happens to fit the DMA32 zone.
1587 */
1588 page = alloc_page(GFP_KERNEL | __GFP_DMA32);
1589 if (!page)
1590 goto error_1;
1591 vcpu->arch.mmu.pae_root = page_address(page);
1592 for (i = 0; i < 4; ++i)
1593 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
1594
1595 return 0;
1596
1597error_1:
1598 free_mmu_pages(vcpu);
1599 return -ENOMEM;
1600}
1601
1602int kvm_mmu_create(struct kvm_vcpu *vcpu)
1603{
1604 ASSERT(vcpu);
1605 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
1606
1607 return alloc_mmu_pages(vcpu);
1608}
1609
1610int kvm_mmu_setup(struct kvm_vcpu *vcpu)
1611{
1612 ASSERT(vcpu);
1613 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
1614
1615 return init_kvm_mmu(vcpu);
1616}
1617
1618void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
1619{
1620 ASSERT(vcpu);
1621
1622 destroy_kvm_mmu(vcpu);
1623 free_mmu_pages(vcpu);
1624 mmu_free_memory_caches(vcpu);
1625}
1626
1627void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
1628{
1629 struct kvm_mmu_page *sp;
1630
1631 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
1632 int i;
1633 u64 *pt;
1634
1635 if (!test_bit(slot, &sp->slot_bitmap))
1636 continue;
1637
1638 pt = sp->spt;
1639 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1640 /* avoid RMW */
1641 if (pt[i] & PT_WRITABLE_MASK)
1642 pt[i] &= ~PT_WRITABLE_MASK;
1643 }
1644}
1645
1646void kvm_mmu_zap_all(struct kvm *kvm)
1647{
1648 struct kvm_mmu_page *sp, *node;
1649
1650 spin_lock(&kvm->mmu_lock);
1651 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
1652 kvm_mmu_zap_page(kvm, sp);
1653 spin_unlock(&kvm->mmu_lock);
1654
1655 kvm_flush_remote_tlbs(kvm);
1656}
1657
1658void kvm_mmu_module_exit(void)
1659{
1660 if (pte_chain_cache)
1661 kmem_cache_destroy(pte_chain_cache);
1662 if (rmap_desc_cache)
1663 kmem_cache_destroy(rmap_desc_cache);
1664 if (mmu_page_header_cache)
1665 kmem_cache_destroy(mmu_page_header_cache);
1666}
1667
1668int kvm_mmu_module_init(void)
1669{
1670 pte_chain_cache = kmem_cache_create("kvm_pte_chain",
1671 sizeof(struct kvm_pte_chain),
1672 0, 0, NULL);
1673 if (!pte_chain_cache)
1674 goto nomem;
1675 rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
1676 sizeof(struct kvm_rmap_desc),
1677 0, 0, NULL);
1678 if (!rmap_desc_cache)
1679 goto nomem;
1680
1681 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
1682 sizeof(struct kvm_mmu_page),
1683 0, 0, NULL);
1684 if (!mmu_page_header_cache)
1685 goto nomem;
1686
1687 return 0;
1688
1689nomem:
1690 kvm_mmu_module_exit();
1691 return -ENOMEM;
1692}
1693
1694/*
1695 * Calculate mmu pages needed for kvm.
1696 */
1697unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
1698{
1699 int i;
1700 unsigned int nr_mmu_pages;
1701 unsigned int nr_pages = 0;
1702
1703 for (i = 0; i < kvm->nmemslots; i++)
1704 nr_pages += kvm->memslots[i].npages;
1705
1706 nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
1707 nr_mmu_pages = max(nr_mmu_pages,
1708 (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
1709
1710 return nr_mmu_pages;
1711}
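/*
 * Worked example, assuming KVM_PERMILLE_MMU_PAGES is 20 and
 * KVM_MIN_ALLOC_MMU_PAGES is 64 (see the header for the real values):
 * a guest with 262144 memslot pages gets 262144 * 20 / 1000 = 5242 mmu
 * pages, while a 1000-page guest computes 20 and is clamped up to 64.
 */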
1712
1713#ifdef AUDIT
1714
1715static const char *audit_msg;
1716
1717static gva_t canonicalize(gva_t gva)
1718{
1719#ifdef CONFIG_X86_64
1720 gva = (long long)(gva << 16) >> 16;
1721#endif
1722 return gva;
1723}
1724
1725static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
1726 gva_t va, int level)
1727{
1728 u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
1729 int i;
1730 gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
1731
1732 for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
1733 u64 ent = pt[i];
1734
1735 if (ent == shadow_trap_nonpresent_pte)
1736 continue;
1737
1738 va = canonicalize(va);
1739 if (level > 1) {
1740 if (ent == shadow_notrap_nonpresent_pte)
1741 printk(KERN_ERR "audit: (%s) nontrapping pte"
1742 " in nonleaf level: levels %d gva %lx"
1743 " level %d pte %llx\n", audit_msg,
1744 vcpu->arch.mmu.root_level, va, level, ent);
1745
1746 audit_mappings_page(vcpu, ent, va, level - 1);
1747 } else {
1748 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
1749 struct page *page = gpa_to_page(vcpu, gpa);
1750 hpa_t hpa = page_to_phys(page);
1751
1752 if (is_shadow_present_pte(ent)
1753 && (ent & PT64_BASE_ADDR_MASK) != hpa)
1754 printk(KERN_ERR "xx audit error: (%s) levels %d"
1755 " gva %lx gpa %llx hpa %llx ent %llx %d\n",
1756 audit_msg, vcpu->arch.mmu.root_level,
1757 va, gpa, hpa, ent,
1758 is_shadow_present_pte(ent));
1759 else if (ent == shadow_notrap_nonpresent_pte
1760 && !is_error_hpa(hpa))
1761 printk(KERN_ERR "audit: (%s) notrap shadow,"
1762 " valid guest gva %lx\n", audit_msg, va);
1763 kvm_release_page_clean(page);
1764
1765 }
1766 }
1767}
1768
1769static void audit_mappings(struct kvm_vcpu *vcpu)
1770{
1771 unsigned i;
1772
1773 if (vcpu->arch.mmu.root_level == 4)
1774 audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
1775 else
1776 for (i = 0; i < 4; ++i)
1777 if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
1778 audit_mappings_page(vcpu,
1779 vcpu->arch.mmu.pae_root[i],
1780 i << 30,
1781 2);
1782}
1783
1784static int count_rmaps(struct kvm_vcpu *vcpu)
1785{
1786 int nmaps = 0;
1787 int i, j, k;
1788
1789 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
1790 struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
1791 struct kvm_rmap_desc *d;
1792
1793 for (j = 0; j < m->npages; ++j) {
1794 unsigned long *rmapp = &m->rmap[j];
1795
1796 if (!*rmapp)
1797 continue;
1798 if (!(*rmapp & 1)) {
1799 ++nmaps;
1800 continue;
1801 }
1802 d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
1803 while (d) {
1804 for (k = 0; k < RMAP_EXT; ++k)
1805 if (d->shadow_ptes[k])
1806 ++nmaps;
1807 else
1808 break;
1809 d = d->more;
1810 }
1811 }
1812 }
1813 return nmaps;
1814}
1815
1816static int count_writable_mappings(struct kvm_vcpu *vcpu)
1817{
1818 int nmaps = 0;
1819 struct kvm_mmu_page *sp;
1820 int i;
1821
1822 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
1823 u64 *pt = sp->spt;
1824
1825 if (sp->role.level != PT_PAGE_TABLE_LEVEL)
1826 continue;
1827
1828 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1829 u64 ent = pt[i];
1830
1831 if (!(ent & PT_PRESENT_MASK))
1832 continue;
1833 if (!(ent & PT_WRITABLE_MASK))
1834 continue;
1835 ++nmaps;
1836 }
1837 }
1838 return nmaps;
1839}
1840
1841static void audit_rmap(struct kvm_vcpu *vcpu)
1842{
1843 int n_rmap = count_rmaps(vcpu);
1844 int n_actual = count_writable_mappings(vcpu);
1845
1846 if (n_rmap != n_actual)
1847 printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
1848 __FUNCTION__, audit_msg, n_rmap, n_actual);
1849}
1850
1851static void audit_write_protection(struct kvm_vcpu *vcpu)
1852{
1853 struct kvm_mmu_page *sp;
1854 struct kvm_memory_slot *slot;
1855 unsigned long *rmapp;
1856 gfn_t gfn;
1857
1858 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
1859 if (sp->role.metaphysical)
1860 continue;
1861
1862 slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
1863 gfn = unalias_gfn(vcpu->kvm, sp->gfn);
1864 rmapp = &slot->rmap[gfn - slot->base_gfn];
1865 if (*rmapp)
1866 printk(KERN_ERR "%s: (%s) shadow page has writable"
1867 " mappings: gfn %lx role %x\n",
1868 __FUNCTION__, audit_msg, sp->gfn,
1869 sp->role.word);
1870 }
1871}
1872
1873static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
1874{
1875 int olddbg = dbg;
1876
1877 dbg = 0;
1878 audit_msg = msg;
1879 audit_rmap(vcpu);
1880 audit_write_protection(vcpu);
1881 audit_mappings(vcpu);
1882 dbg = olddbg;
1883}
1884
1885#endif
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
new file mode 100644
index 000000000000..1fce19ec7a23
--- /dev/null
+++ b/arch/x86/kvm/mmu.h
@@ -0,0 +1,44 @@
1#ifndef __KVM_X86_MMU_H
2#define __KVM_X86_MMU_H
3
4#include <linux/kvm_host.h>
5
6static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
7{
8 if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
9 __kvm_mmu_free_some_pages(vcpu);
10}
11
12static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
13{
14 if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
15 return 0;
16
17 return kvm_mmu_load(vcpu);
18}
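/*
 * A simplified call-site sketch (not part of this header): the vcpu run
 * path is expected to do
 *
 *	r = kvm_mmu_reload(vcpu);
 *	if (unlikely(r))
 *		return r;
 *
 * before entering the guest, so root_hpa is valid whenever the guest runs.
 */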
19
20static inline int is_long_mode(struct kvm_vcpu *vcpu)
21{
22#ifdef CONFIG_X86_64
23 return vcpu->arch.shadow_efer & EFER_LME;
24#else
25 return 0;
26#endif
27}
28
29static inline int is_pae(struct kvm_vcpu *vcpu)
30{
31 return vcpu->arch.cr4 & X86_CR4_PAE;
32}
33
34static inline int is_pse(struct kvm_vcpu *vcpu)
35{
36 return vcpu->arch.cr4 & X86_CR4_PSE;
37}
38
39static inline int is_paging(struct kvm_vcpu *vcpu)
40{
41 return vcpu->arch.cr0 & X86_CR0_PG;
42}
43
44#endif
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
new file mode 100644
index 000000000000..03ba8608fe0f
--- /dev/null
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -0,0 +1,484 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * MMU support
8 *
9 * Copyright (C) 2006 Qumranet, Inc.
10 *
11 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com>
13 * Avi Kivity <avi@qumranet.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
17 *
18 */
19
20/*
21 * We need the mmu code to access both 32-bit and 64-bit guest ptes,
22 * so the code in this file is compiled twice, once per pte size.
23 */
24
25#if PTTYPE == 64
26 #define pt_element_t u64
27 #define guest_walker guest_walker64
28 #define FNAME(name) paging##64_##name
29 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
30 #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
31 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
32 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
33 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
34 #define PT_LEVEL_BITS PT64_LEVEL_BITS
35 #ifdef CONFIG_X86_64
36 #define PT_MAX_FULL_LEVELS 4
37 #define CMPXCHG cmpxchg
38 #else
39 #define CMPXCHG cmpxchg64
40 #define PT_MAX_FULL_LEVELS 2
41 #endif
42#elif PTTYPE == 32
43 #define pt_element_t u32
44 #define guest_walker guest_walker32
45 #define FNAME(name) paging##32_##name
46 #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
47 #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
48 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
49 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
50 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
51 #define PT_LEVEL_BITS PT32_LEVEL_BITS
52 #define PT_MAX_FULL_LEVELS 2
53 #define CMPXCHG cmpxchg
54#else
55 #error Invalid PTTYPE value
56#endif
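/*
 * Expansion sketch for the FNAME() wrapper above: with PTTYPE == 64,
 * FNAME(walk_addr) becomes paging64_walk_addr and pt_element_t is u64;
 * the same source is compiled again with PTTYPE == 32 to produce
 * paging32_walk_addr over u32 guest ptes. mmu.c installs the matching
 * paging64_ or paging32_ entry points in struct kvm_mmu at init time.
 */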
57
58#define gpte_to_gfn FNAME(gpte_to_gfn)
59#define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde)
60
61/*
62 * The guest_walker structure emulates the behavior of the hardware page
63 * table walker.
64 */
65struct guest_walker {
66 int level;
67 gfn_t table_gfn[PT_MAX_FULL_LEVELS];
68 pt_element_t ptes[PT_MAX_FULL_LEVELS];
69 gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
70 unsigned pt_access;
71 unsigned pte_access;
72 gfn_t gfn;
73 u32 error_code;
74};
75
76static gfn_t gpte_to_gfn(pt_element_t gpte)
77{
78 return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
79}
80
81static gfn_t gpte_to_gfn_pde(pt_element_t gpte)
82{
83 return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
84}
85
86static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
87 gfn_t table_gfn, unsigned index,
88 pt_element_t orig_pte, pt_element_t new_pte)
89{
90 pt_element_t ret;
91 pt_element_t *table;
92 struct page *page;
93
94 page = gfn_to_page(kvm, table_gfn);
95 table = kmap_atomic(page, KM_USER0);
96
97 ret = CMPXCHG(&table[index], orig_pte, new_pte);
98
99 kunmap_atomic(table, KM_USER0);
100
101 kvm_release_page_dirty(page);
102
103 return (ret != orig_pte);
104}
105
106static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
107{
108 unsigned access;
109
110 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
111#if PTTYPE == 64
112 if (is_nx(vcpu))
113 access &= ~(gpte >> PT64_NX_SHIFT);
114#endif
115 return access;
116}
117
118/*
119 * Fetch a guest pte for a guest virtual address
120 */
121static int FNAME(walk_addr)(struct guest_walker *walker,
122 struct kvm_vcpu *vcpu, gva_t addr,
123 int write_fault, int user_fault, int fetch_fault)
124{
125 pt_element_t pte;
126 gfn_t table_gfn;
127 unsigned index, pt_access, pte_access;
128 gpa_t pte_gpa;
129
130 pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
131walk:
132 walker->level = vcpu->arch.mmu.root_level;
133 pte = vcpu->arch.cr3;
134#if PTTYPE == 64
135 if (!is_long_mode(vcpu)) {
136 pte = vcpu->arch.pdptrs[(addr >> 30) & 3];
137 if (!is_present_pte(pte))
138 goto not_present;
139 --walker->level;
140 }
141#endif
142 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
143 (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
144
145 pt_access = ACC_ALL;
146
147 for (;;) {
148 index = PT_INDEX(addr, walker->level);
149
150 table_gfn = gpte_to_gfn(pte);
151 pte_gpa = gfn_to_gpa(table_gfn);
152 pte_gpa += index * sizeof(pt_element_t);
153 walker->table_gfn[walker->level - 1] = table_gfn;
154 walker->pte_gpa[walker->level - 1] = pte_gpa;
155 pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
156 walker->level - 1, table_gfn);
157
158 kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
159
160 if (!is_present_pte(pte))
161 goto not_present;
162
163 if (write_fault && !is_writeble_pte(pte))
164 if (user_fault || is_write_protection(vcpu))
165 goto access_error;
166
167 if (user_fault && !(pte & PT_USER_MASK))
168 goto access_error;
169
170#if PTTYPE == 64
171 if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK))
172 goto access_error;
173#endif
174
175 if (!(pte & PT_ACCESSED_MASK)) {
176 mark_page_dirty(vcpu->kvm, table_gfn);
177 if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
178 index, pte, pte|PT_ACCESSED_MASK))
179 goto walk;
180 pte |= PT_ACCESSED_MASK;
181 }
182
183 pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
184
185 walker->ptes[walker->level - 1] = pte;
186
187 if (walker->level == PT_PAGE_TABLE_LEVEL) {
188 walker->gfn = gpte_to_gfn(pte);
189 break;
190 }
191
192 if (walker->level == PT_DIRECTORY_LEVEL
193 && (pte & PT_PAGE_SIZE_MASK)
194 && (PTTYPE == 64 || is_pse(vcpu))) {
195 walker->gfn = gpte_to_gfn_pde(pte);
196 walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
197 if (PTTYPE == 32 && is_cpuid_PSE36())
198 walker->gfn += pse36_gfn_delta(pte);
199 break;
200 }
201
202 pt_access = pte_access;
203 --walker->level;
204 }
205
206 if (write_fault && !is_dirty_pte(pte)) {
207 bool ret;
208
209 mark_page_dirty(vcpu->kvm, table_gfn);
210 ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
211 pte|PT_DIRTY_MASK);
212 if (ret)
213 goto walk;
214 pte |= PT_DIRTY_MASK;
215 kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
216 walker->ptes[walker->level - 1] = pte;
217 }
218
219 walker->pt_access = pt_access;
220 walker->pte_access = pte_access;
221 pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
222 __FUNCTION__, (u64)pte, pt_access, pte_access);
223 return 1;
224
225not_present:
226 walker->error_code = 0;
227 goto err;
228
229access_error:
230 walker->error_code = PFERR_PRESENT_MASK;
231
232err:
233 if (write_fault)
234 walker->error_code |= PFERR_WRITE_MASK;
235 if (user_fault)
236 walker->error_code |= PFERR_USER_MASK;
237 if (fetch_fault)
238 walker->error_code |= PFERR_FETCH_MASK;
239 return 0;
240}
241
242static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
243 u64 *spte, const void *pte, int bytes,
244 int offset_in_pte)
245{
246 pt_element_t gpte;
247 unsigned pte_access;
248 struct page *npage;
249
250 gpte = *(const pt_element_t *)pte;
251 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
252 if (!offset_in_pte && !is_present_pte(gpte))
253 set_shadow_pte(spte, shadow_notrap_nonpresent_pte);
254 return;
255 }
256 if (bytes < sizeof(pt_element_t))
257 return;
258 pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
259 pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
260 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
261 return;
262 npage = vcpu->arch.update_pte.page;
263 if (!npage)
264 return;
265 get_page(npage);
266 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
267 gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), npage);
268}
269
270/*
271 * Fetch a shadow pte for a specific level in the paging hierarchy.
272 */
273static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
274 struct guest_walker *walker,
275 int user_fault, int write_fault, int *ptwrite,
276 struct page *page)
277{
278 hpa_t shadow_addr;
279 int level;
280 u64 *shadow_ent;
281 unsigned access = walker->pt_access;
282
283 if (!is_present_pte(walker->ptes[walker->level - 1]))
284 return NULL;
285
286 shadow_addr = vcpu->arch.mmu.root_hpa;
287 level = vcpu->arch.mmu.shadow_root_level;
288 if (level == PT32E_ROOT_LEVEL) {
289 shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
290 shadow_addr &= PT64_BASE_ADDR_MASK;
291 --level;
292 }
293
294 for (; ; level--) {
295 u32 index = SHADOW_PT_INDEX(addr, level);
296 struct kvm_mmu_page *shadow_page;
297 u64 shadow_pte;
298 int metaphysical;
299 gfn_t table_gfn;
300 bool new_page = 0;
301
302 shadow_ent = ((u64 *)__va(shadow_addr)) + index;
303 if (level == PT_PAGE_TABLE_LEVEL)
304 break;
305 if (is_shadow_present_pte(*shadow_ent)) {
306 shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
307 continue;
308 }
309
310 if (level - 1 == PT_PAGE_TABLE_LEVEL
311 && walker->level == PT_DIRECTORY_LEVEL) {
312 metaphysical = 1;
313 if (!is_dirty_pte(walker->ptes[level - 1]))
314 access &= ~ACC_WRITE_MASK;
315 table_gfn = gpte_to_gfn(walker->ptes[level - 1]);
316 } else {
317 metaphysical = 0;
318 table_gfn = walker->table_gfn[level - 2];
319 }
320 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
321 metaphysical, access,
322 shadow_ent, &new_page);
323 if (new_page && !metaphysical) {
324 int r;
325 pt_element_t curr_pte;
326 r = kvm_read_guest_atomic(vcpu->kvm,
327 walker->pte_gpa[level - 2],
328 &curr_pte, sizeof(curr_pte));
329 if (r || curr_pte != walker->ptes[level - 2]) {
330 kvm_release_page_clean(page);
331 return NULL;
332 }
333 }
334 shadow_addr = __pa(shadow_page->spt);
335 shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
336 | PT_WRITABLE_MASK | PT_USER_MASK;
337 *shadow_ent = shadow_pte;
338 }
339
340 mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
341 user_fault, write_fault,
342 walker->ptes[walker->level-1] & PT_DIRTY_MASK,
343 ptwrite, walker->gfn, page);
344
345 return shadow_ent;
346}
347
348/*
349 * Page fault handler. There are several causes for a page fault:
350 * - there is no shadow pte for the guest pte
351 * - write access through a shadow pte marked read only so that we can set
352 * the dirty bit
353 * - write access to a shadow pte marked read only so we can update the page
354 * dirty bitmap when userspace requests it
355 * - mmio access; in this case we will never install a present shadow pte
356 * - normal guest page fault due to the guest pte marked not present, not
357 * writable, or not executable
358 *
359 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
360 * a negative value on error.
361 */
362static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
363 u32 error_code)
364{
365 int write_fault = error_code & PFERR_WRITE_MASK;
366 int user_fault = error_code & PFERR_USER_MASK;
367 int fetch_fault = error_code & PFERR_FETCH_MASK;
368 struct guest_walker walker;
369 u64 *shadow_pte;
370 int write_pt = 0;
371 int r;
372 struct page *page;
373
374 pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
375 kvm_mmu_audit(vcpu, "pre page fault");
376
377 r = mmu_topup_memory_caches(vcpu);
378 if (r)
379 return r;
380
381 down_read(&current->mm->mmap_sem);
382 /*
383 * Look up the shadow pte for the faulting address.
384 */
385 r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
386 fetch_fault);
387
388 /*
389 * The page is not mapped by the guest. Let the guest handle it.
390 */
391 if (!r) {
392 pgprintk("%s: guest page fault\n", __FUNCTION__);
393 inject_page_fault(vcpu, addr, walker.error_code);
394 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
395 up_read(&current->mm->mmap_sem);
396 return 0;
397 }
398
399 page = gfn_to_page(vcpu->kvm, walker.gfn);
400
401 spin_lock(&vcpu->kvm->mmu_lock);
402 kvm_mmu_free_some_pages(vcpu);
403 shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
404 &write_pt, page);
405 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
406 shadow_pte, *shadow_pte, write_pt);
407
408 if (!write_pt)
409 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
410
411 /*
412 * mmio: emulate if accessible, otherwise it's a guest fault.
413 */
414 if (shadow_pte && is_io_pte(*shadow_pte)) {
415 spin_unlock(&vcpu->kvm->mmu_lock);
416 up_read(&current->mm->mmap_sem);
417 return 1;
418 }
419
420 ++vcpu->stat.pf_fixed;
421 kvm_mmu_audit(vcpu, "post page fault (fixed)");
422 spin_unlock(&vcpu->kvm->mmu_lock);
423 up_read(&current->mm->mmap_sem);
424
425 return write_pt;
426}
427
428static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
429{
430 struct guest_walker walker;
431 gpa_t gpa = UNMAPPED_GVA;
432 int r;
433
434 r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);
435
436 if (r) {
437 gpa = gfn_to_gpa(walker.gfn);
438 gpa |= vaddr & ~PAGE_MASK;
439 }
440
441 return gpa;
442}
443
444static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
445 struct kvm_mmu_page *sp)
446{
447 int i, offset = 0, r = 0;
448 pt_element_t pt;
449
450 if (sp->role.metaphysical
451 || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
452 nonpaging_prefetch_page(vcpu, sp);
453 return;
454 }
455
456 if (PTTYPE == 32)
457 offset = sp->role.quadrant << PT64_LEVEL_BITS;
458
459 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
460 gpa_t pte_gpa = gfn_to_gpa(sp->gfn);
461 pte_gpa += (i+offset) * sizeof(pt_element_t);
462
463 r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &pt,
464 sizeof(pt_element_t));
465 if (r || is_present_pte(pt))
466 sp->spt[i] = shadow_trap_nonpresent_pte;
467 else
468 sp->spt[i] = shadow_notrap_nonpresent_pte;
469 }
470}
471
472#undef pt_element_t
473#undef guest_walker
474#undef FNAME
475#undef PT_BASE_ADDR_MASK
476#undef PT_INDEX
477#undef SHADOW_PT_INDEX
478#undef PT_LEVEL_MASK
479#undef PT_DIR_BASE_ADDR_MASK
480#undef PT_LEVEL_BITS
481#undef PT_MAX_FULL_LEVELS
482#undef gpte_to_gfn
483#undef gpte_to_gfn_pde
484#undef CMPXCHG
diff --git a/drivers/kvm/segment_descriptor.h b/arch/x86/kvm/segment_descriptor.h
index 71fdf458619a..56fc4c873389 100644
--- a/drivers/kvm/segment_descriptor.h
+++ b/arch/x86/kvm/segment_descriptor.h
@@ -1,3 +1,6 @@
1#ifndef __SEGMENT_DESCRIPTOR_H
2#define __SEGMENT_DESCRIPTOR_H
3
1struct segment_descriptor { 4struct segment_descriptor {
2 u16 limit_low; 5 u16 limit_low;
3 u16 base_low; 6 u16 base_low;
@@ -14,4 +17,13 @@ struct segment_descriptor {
14 u8 base_high; 17 u8 base_high;
15} __attribute__((packed)); 18} __attribute__((packed));
16 19
20#ifdef CONFIG_X86_64
21/* LDT or TSS descriptor in the GDT. 16 bytes. */
22struct segment_descriptor_64 {
23 struct segment_descriptor s;
24 u32 base_higher;
25 u32 pad_zero;
26};
17 27
28#endif
29#endif
diff --git a/drivers/kvm/svm.c b/arch/x86/kvm/svm.c
index 4e04e49a2f1c..de755cb1431d 100644
--- a/drivers/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -13,10 +13,11 @@
13 * the COPYING file in the top-level directory. 13 * the COPYING file in the top-level directory.
14 * 14 *
15 */ 15 */
16#include <linux/kvm_host.h>
16 17
17#include "kvm_svm.h" 18#include "kvm_svm.h"
18#include "x86_emulate.h"
19#include "irq.h" 19#include "irq.h"
20#include "mmu.h"
20 21
21#include <linux/module.h> 22#include <linux/module.h>
22#include <linux/kernel.h> 23#include <linux/kernel.h>
@@ -42,9 +43,6 @@ MODULE_LICENSE("GPL");
42#define SEG_TYPE_LDT 2 43#define SEG_TYPE_LDT 2
43#define SEG_TYPE_BUSY_TSS16 3 44#define SEG_TYPE_BUSY_TSS16 3
44 45
45#define KVM_EFER_LMA (1 << 10)
46#define KVM_EFER_LME (1 << 8)
47
48#define SVM_FEATURE_NPT (1 << 0) 46#define SVM_FEATURE_NPT (1 << 0)
49#define SVM_FEATURE_LBRV (1 << 1) 47#define SVM_FEATURE_LBRV (1 << 1)
50#define SVM_DEATURE_SVML (1 << 2) 48#define SVM_DEATURE_SVML (1 << 2)
@@ -102,20 +100,20 @@ static inline u32 svm_has(u32 feat)
102 100
103static inline u8 pop_irq(struct kvm_vcpu *vcpu) 101static inline u8 pop_irq(struct kvm_vcpu *vcpu)
104{ 102{
105 int word_index = __ffs(vcpu->irq_summary); 103 int word_index = __ffs(vcpu->arch.irq_summary);
106 int bit_index = __ffs(vcpu->irq_pending[word_index]); 104 int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
107 int irq = word_index * BITS_PER_LONG + bit_index; 105 int irq = word_index * BITS_PER_LONG + bit_index;
108 106
109 clear_bit(bit_index, &vcpu->irq_pending[word_index]); 107 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
110 if (!vcpu->irq_pending[word_index]) 108 if (!vcpu->arch.irq_pending[word_index])
111 clear_bit(word_index, &vcpu->irq_summary); 109 clear_bit(word_index, &vcpu->arch.irq_summary);
112 return irq; 110 return irq;
113} 111}
114 112
115static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq) 113static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
116{ 114{
117 set_bit(irq, vcpu->irq_pending); 115 set_bit(irq, vcpu->arch.irq_pending);
118 set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); 116 set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
119} 117}
120 118
121static inline void clgi(void) 119static inline void clgi(void)
@@ -184,35 +182,30 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
184 182
185static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) 183static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
186{ 184{
187 if (!(efer & KVM_EFER_LMA)) 185 if (!(efer & EFER_LMA))
188 efer &= ~KVM_EFER_LME; 186 efer &= ~EFER_LME;
189 187
190 to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK; 188 to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK;
191 vcpu->shadow_efer = efer; 189 vcpu->arch.shadow_efer = efer;
192} 190}
193 191
-static void svm_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
+static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
+				bool has_error_code, u32 error_code)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	svm->vmcb->control.event_inj = SVM_EVTINJ_VALID |
-					SVM_EVTINJ_VALID_ERR |
-					SVM_EVTINJ_TYPE_EXEPT |
-					GP_VECTOR;
+	svm->vmcb->control.event_inj = nr
+		| SVM_EVTINJ_VALID
+		| (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
+		| SVM_EVTINJ_TYPE_EXEPT;
 	svm->vmcb->control.event_inj_err = error_code;
 }
 
-static void inject_ud(struct kvm_vcpu *vcpu)
+static bool svm_exception_injected(struct kvm_vcpu *vcpu)
 {
-	to_svm(vcpu)->vmcb->control.event_inj = SVM_EVTINJ_VALID |
-						SVM_EVTINJ_TYPE_EXEPT |
-						UD_VECTOR;
-}
+	struct vcpu_svm *svm = to_svm(vcpu);
 
-static int is_page_fault(uint32_t info)
-{
-	info &= SVM_EVTINJ_VEC_MASK | SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
-	return info == (PF_VECTOR | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT);
+	return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID);
 }
217 210
218static int is_external_interrupt(u32 info) 211static int is_external_interrupt(u32 info)
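The svm_queue_exception() introduced above replaces the separate #GP/#UD injectors with one helper that packs the vector, event type, and error-code flag into the VMCB's event_inj field. A rough illustration of that packing, assuming the conventional SVM EVENTINJ layout (vector in bits 0-7, type in bits 8-10, error-code-valid in bit 11, valid in bit 31); the macro and function names below are stand-ins, not the kernel's:

#include <stdbool.h>
#include <stdint.h>

#define EVTINJ_TYPE_EXEPT (3u << 8)   /* "exception" event type */
#define EVTINJ_VALID_ERR  (1u << 11)  /* an error code accompanies the event */
#define EVTINJ_VALID      (1u << 31)

static uint32_t pack_exception(uint8_t vector, bool has_error_code)
{
        uint32_t inj = vector | EVTINJ_TYPE_EXEPT | EVTINJ_VALID;

        if (has_error_code)   /* e.g. #GP and #PF carry an error code */
                inj |= EVTINJ_VALID_ERR;
        return inj;           /* the error code itself goes in event_inj_err */
}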
@@ -229,17 +222,16 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
229 printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__); 222 printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__);
230 return; 223 return;
231 } 224 }
-	if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) {
+	if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE)
 		printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n",
 		       __FUNCTION__,
 		       svm->vmcb->save.rip,
 		       svm->next_rip);
-	}
 
-	vcpu->rip = svm->vmcb->save.rip = svm->next_rip;
+	vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip;
 	svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
 
-	vcpu->interrupt_window_open = 1;
+	vcpu->arch.interrupt_window_open = 1;
243} 235}
244 236
245static int has_svm(void) 237static int has_svm(void)
@@ -290,7 +282,7 @@ static void svm_hardware_enable(void *garbage)
290#ifdef CONFIG_X86_64 282#ifdef CONFIG_X86_64
291 struct desc_ptr gdt_descr; 283 struct desc_ptr gdt_descr;
292#else 284#else
293 struct Xgt_desc_struct gdt_descr; 285 struct desc_ptr gdt_descr;
294#endif 286#endif
295 struct desc_struct *gdt; 287 struct desc_struct *gdt;
296 int me = raw_smp_processor_id(); 288 int me = raw_smp_processor_id();
@@ -312,7 +304,7 @@ static void svm_hardware_enable(void *garbage)
312 svm_data->next_asid = svm_data->max_asid + 1; 304 svm_data->next_asid = svm_data->max_asid + 1;
313 svm_features = cpuid_edx(SVM_CPUID_FUNC); 305 svm_features = cpuid_edx(SVM_CPUID_FUNC);
314 306
315 asm volatile ( "sgdt %0" : "=m"(gdt_descr) ); 307 asm volatile ("sgdt %0" : "=m"(gdt_descr));
316 gdt = (struct desc_struct *)gdt_descr.address; 308 gdt = (struct desc_struct *)gdt_descr.address;
317 svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); 309 svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
318 310
@@ -458,11 +450,13 @@ static void init_vmcb(struct vmcb *vmcb)
458 450
459 control->intercept_cr_read = INTERCEPT_CR0_MASK | 451 control->intercept_cr_read = INTERCEPT_CR0_MASK |
460 INTERCEPT_CR3_MASK | 452 INTERCEPT_CR3_MASK |
461 INTERCEPT_CR4_MASK; 453 INTERCEPT_CR4_MASK |
454 INTERCEPT_CR8_MASK;
462 455
463 control->intercept_cr_write = INTERCEPT_CR0_MASK | 456 control->intercept_cr_write = INTERCEPT_CR0_MASK |
464 INTERCEPT_CR3_MASK | 457 INTERCEPT_CR3_MASK |
465 INTERCEPT_CR4_MASK; 458 INTERCEPT_CR4_MASK |
459 INTERCEPT_CR8_MASK;
466 460
467 control->intercept_dr_read = INTERCEPT_DR0_MASK | 461 control->intercept_dr_read = INTERCEPT_DR0_MASK |
468 INTERCEPT_DR1_MASK | 462 INTERCEPT_DR1_MASK |
@@ -476,7 +470,8 @@ static void init_vmcb(struct vmcb *vmcb)
476 INTERCEPT_DR5_MASK | 470 INTERCEPT_DR5_MASK |
477 INTERCEPT_DR7_MASK; 471 INTERCEPT_DR7_MASK;
478 472
479 control->intercept_exceptions = 1 << PF_VECTOR; 473 control->intercept_exceptions = (1 << PF_VECTOR) |
474 (1 << UD_VECTOR);
480 475
481 476
482 control->intercept = (1ULL << INTERCEPT_INTR) | 477 control->intercept = (1ULL << INTERCEPT_INTR) |
@@ -543,8 +538,7 @@ static void init_vmcb(struct vmcb *vmcb)
543 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); 538 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
544 539
 	save->efer = MSR_EFER_SVME_MASK;
-
-	save->dr6 = 0xffff0ff0;
+	save->dr6 = 0xffff0ff0;
548 save->dr7 = 0x400; 542 save->dr7 = 0x400;
549 save->rflags = 2; 543 save->rflags = 2;
550 save->rip = 0x0000fff0; 544 save->rip = 0x0000fff0;
@@ -558,7 +552,7 @@ static void init_vmcb(struct vmcb *vmcb)
558 /* rdx = ?? */ 552 /* rdx = ?? */
559} 553}
560 554
-static void svm_vcpu_reset(struct kvm_vcpu *vcpu)
+static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
562{ 556{
563 struct vcpu_svm *svm = to_svm(vcpu); 557 struct vcpu_svm *svm = to_svm(vcpu);
564 558
@@ -566,9 +560,11 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu)
566 560
567 if (vcpu->vcpu_id != 0) { 561 if (vcpu->vcpu_id != 0) {
 		svm->vmcb->save.rip = 0;
-		svm->vmcb->save.cs.base = svm->vcpu.sipi_vector << 12;
-		svm->vmcb->save.cs.selector = svm->vcpu.sipi_vector << 8;
+		svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
+		svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
 	}
+
+	return 0;
 }
573 569
574static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) 570static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
@@ -587,12 +583,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
587 if (err) 583 if (err)
588 goto free_svm; 584 goto free_svm;
589 585
590 if (irqchip_in_kernel(kvm)) {
591 err = kvm_create_lapic(&svm->vcpu);
592 if (err < 0)
593 goto free_svm;
594 }
595
596 page = alloc_page(GFP_KERNEL); 586 page = alloc_page(GFP_KERNEL);
597 if (!page) { 587 if (!page) {
598 err = -ENOMEM; 588 err = -ENOMEM;
@@ -608,9 +598,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
608 598
609 fx_init(&svm->vcpu); 599 fx_init(&svm->vcpu);
610 svm->vcpu.fpu_active = 1; 600 svm->vcpu.fpu_active = 1;
611 svm->vcpu.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; 601 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
612 if (svm->vcpu.vcpu_id == 0) 602 if (svm->vcpu.vcpu_id == 0)
613 svm->vcpu.apic_base |= MSR_IA32_APICBASE_BSP; 603 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
614 604
615 return &svm->vcpu; 605 return &svm->vcpu;
616 606
@@ -644,7 +634,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
644 * increasing TSC. 634 * increasing TSC.
645 */ 635 */
646 rdtscll(tsc_this); 636 rdtscll(tsc_this);
647 delta = vcpu->host_tsc - tsc_this; 637 delta = vcpu->arch.host_tsc - tsc_this;
648 svm->vmcb->control.tsc_offset += delta; 638 svm->vmcb->control.tsc_offset += delta;
649 vcpu->cpu = cpu; 639 vcpu->cpu = cpu;
650 kvm_migrate_apic_timer(vcpu); 640 kvm_migrate_apic_timer(vcpu);
@@ -659,11 +649,11 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
659 struct vcpu_svm *svm = to_svm(vcpu); 649 struct vcpu_svm *svm = to_svm(vcpu);
660 int i; 650 int i;
661 651
652 ++vcpu->stat.host_state_reload;
662 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 653 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
663 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 654 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
664 655
665 rdtscll(vcpu->host_tsc); 656 rdtscll(vcpu->arch.host_tsc);
666 kvm_put_guest_fpu(vcpu);
667} 657}
668 658
669static void svm_vcpu_decache(struct kvm_vcpu *vcpu) 659static void svm_vcpu_decache(struct kvm_vcpu *vcpu)
@@ -674,17 +664,17 @@ static void svm_cache_regs(struct kvm_vcpu *vcpu)
674{ 664{
675 struct vcpu_svm *svm = to_svm(vcpu); 665 struct vcpu_svm *svm = to_svm(vcpu);
676 666
677 vcpu->regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; 667 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
678 vcpu->regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; 668 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
679 vcpu->rip = svm->vmcb->save.rip; 669 vcpu->arch.rip = svm->vmcb->save.rip;
680} 670}
681 671
682static void svm_decache_regs(struct kvm_vcpu *vcpu) 672static void svm_decache_regs(struct kvm_vcpu *vcpu)
683{ 673{
684 struct vcpu_svm *svm = to_svm(vcpu); 674 struct vcpu_svm *svm = to_svm(vcpu);
685 svm->vmcb->save.rax = vcpu->regs[VCPU_REGS_RAX]; 675 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
686 svm->vmcb->save.rsp = vcpu->regs[VCPU_REGS_RSP]; 676 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
687 svm->vmcb->save.rip = vcpu->rip; 677 svm->vmcb->save.rip = vcpu->arch.rip;
688} 678}
689 679
690static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) 680static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -782,24 +772,24 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
782 struct vcpu_svm *svm = to_svm(vcpu); 772 struct vcpu_svm *svm = to_svm(vcpu);
783 773
784#ifdef CONFIG_X86_64 774#ifdef CONFIG_X86_64
785 if (vcpu->shadow_efer & KVM_EFER_LME) { 775 if (vcpu->arch.shadow_efer & EFER_LME) {
786 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 776 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
787 vcpu->shadow_efer |= KVM_EFER_LMA; 777 vcpu->arch.shadow_efer |= EFER_LMA;
788 svm->vmcb->save.efer |= KVM_EFER_LMA | KVM_EFER_LME; 778 svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
789 } 779 }
790 780
791 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG) ) { 781 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
792 vcpu->shadow_efer &= ~KVM_EFER_LMA; 782 vcpu->arch.shadow_efer &= ~EFER_LMA;
793 svm->vmcb->save.efer &= ~(KVM_EFER_LMA | KVM_EFER_LME); 783 svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
794 } 784 }
795 } 785 }
796#endif 786#endif
797 if ((vcpu->cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { 787 if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
798 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); 788 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
799 vcpu->fpu_active = 1; 789 vcpu->fpu_active = 1;
800 } 790 }
801 791
802 vcpu->cr0 = cr0; 792 vcpu->arch.cr0 = cr0;
803 cr0 |= X86_CR0_PG | X86_CR0_WP; 793 cr0 |= X86_CR0_PG | X86_CR0_WP;
804 cr0 &= ~(X86_CR0_CD | X86_CR0_NW); 794 cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
805 svm->vmcb->save.cr0 = cr0; 795 svm->vmcb->save.cr0 = cr0;
@@ -807,7 +797,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
807 797
808static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 798static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
809{ 799{
810 vcpu->cr4 = cr4; 800 vcpu->arch.cr4 = cr4;
811 to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE; 801 to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE;
812} 802}
813 803
@@ -912,7 +902,7 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
912 svm->db_regs[dr] = value; 902 svm->db_regs[dr] = value;
913 return; 903 return;
914 case 4 ... 5: 904 case 4 ... 5:
915 if (vcpu->cr4 & X86_CR4_DE) { 905 if (vcpu->arch.cr4 & X86_CR4_DE) {
916 *exception = UD_VECTOR; 906 *exception = UD_VECTOR;
917 return; 907 return;
918 } 908 }
@@ -938,51 +928,30 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	struct kvm *kvm = svm->vcpu.kvm;
 	u64 fault_address;
 	u32 error_code;
-	enum emulation_result er;
-	int r;
 
 	if (!irqchip_in_kernel(kvm) &&
 	    is_external_interrupt(exit_int_info))
 		push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
 
-	mutex_lock(&kvm->lock);
-
 	fault_address = svm->vmcb->control.exit_info_2;
 	error_code = svm->vmcb->control.exit_info_1;
-	r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
-	if (r < 0) {
-		mutex_unlock(&kvm->lock);
-		return r;
-	}
-	if (!r) {
-		mutex_unlock(&kvm->lock);
-		return 1;
-	}
-	er = emulate_instruction(&svm->vcpu, kvm_run, fault_address,
-				 error_code);
-	mutex_unlock(&kvm->lock);
+	return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
+}
 
-	switch (er) {
-	case EMULATE_DONE:
-		return 1;
-	case EMULATE_DO_MMIO:
-		++svm->vcpu.stat.mmio_exits;
-		return 0;
-	case EMULATE_FAIL:
-		kvm_report_emulation_failure(&svm->vcpu, "pagetable");
-		break;
-	default:
-		BUG();
-	}
+static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+	int er;
 
-	kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
-	return 0;
+	er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
+	if (er != EMULATE_DONE)
+		kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+	return 1;
 }
981 950
982static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 951static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
983{ 952{
984 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); 953 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
985 if (!(svm->vcpu.cr0 & X86_CR0_TS)) 954 if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))
986 svm->vmcb->save.cr0 &= ~X86_CR0_TS; 955 svm->vmcb->save.cr0 &= ~X86_CR0_TS;
987 svm->vcpu.fpu_active = 1; 956 svm->vcpu.fpu_active = 1;
988 957
@@ -1004,7 +973,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1004 973
1005static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 974static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1006{ 975{
1007 u32 io_info = svm->vmcb->control.exit_info_1; //address size bug? 976 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
1008 int size, down, in, string, rep; 977 int size, down, in, string, rep;
1009 unsigned port; 978 unsigned port;
1010 979
@@ -1015,7 +984,8 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1015 string = (io_info & SVM_IOIO_STR_MASK) != 0; 984 string = (io_info & SVM_IOIO_STR_MASK) != 0;
1016 985
1017 if (string) { 986 if (string) {
-		if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO)
+		if (emulate_instruction(&svm->vcpu,
+					kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
1019 return 0; 989 return 0;
1020 return 1; 990 return 1;
1021 } 991 }
@@ -1045,13 +1015,14 @@ static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1045{ 1015{
1046 svm->next_rip = svm->vmcb->save.rip + 3; 1016 svm->next_rip = svm->vmcb->save.rip + 3;
1047 skip_emulated_instruction(&svm->vcpu); 1017 skip_emulated_instruction(&svm->vcpu);
-	return kvm_hypercall(&svm->vcpu, kvm_run);
+	kvm_emulate_hypercall(&svm->vcpu);
+	return 1;
1049} 1020}
1050 1021
1051static int invalid_op_interception(struct vcpu_svm *svm, 1022static int invalid_op_interception(struct vcpu_svm *svm,
1052 struct kvm_run *kvm_run) 1023 struct kvm_run *kvm_run)
1053{ 1024{
1054 inject_ud(&svm->vcpu); 1025 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1055 return 1; 1026 return 1;
1056} 1027}
1057 1028
@@ -1073,11 +1044,20 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1073static int emulate_on_interception(struct vcpu_svm *svm, 1044static int emulate_on_interception(struct vcpu_svm *svm,
1074 struct kvm_run *kvm_run) 1045 struct kvm_run *kvm_run)
1075{ 1046{
1076 if (emulate_instruction(&svm->vcpu, NULL, 0, 0) != EMULATE_DONE) 1047 if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE)
1077 pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__); 1048 pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__);
1078 return 1; 1049 return 1;
1079} 1050}
1080 1051
1052static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1053{
1054 emulate_instruction(&svm->vcpu, NULL, 0, 0, 0);
1055 if (irqchip_in_kernel(svm->vcpu.kvm))
1056 return 1;
1057 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
1058 return 0;
1059}
1060
1081static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) 1061static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
1082{ 1062{
1083 struct vcpu_svm *svm = to_svm(vcpu); 1063 struct vcpu_svm *svm = to_svm(vcpu);
@@ -1124,14 +1104,14 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
1124 1104
1125static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1105static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1126{ 1106{
1127 u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX]; 1107 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1128 u64 data; 1108 u64 data;
1129 1109
1130 if (svm_get_msr(&svm->vcpu, ecx, &data)) 1110 if (svm_get_msr(&svm->vcpu, ecx, &data))
1131 svm_inject_gp(&svm->vcpu, 0); 1111 kvm_inject_gp(&svm->vcpu, 0);
1132 else { 1112 else {
1133 svm->vmcb->save.rax = data & 0xffffffff; 1113 svm->vmcb->save.rax = data & 0xffffffff;
1134 svm->vcpu.regs[VCPU_REGS_RDX] = data >> 32; 1114 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
1135 svm->next_rip = svm->vmcb->save.rip + 2; 1115 svm->next_rip = svm->vmcb->save.rip + 2;
1136 skip_emulated_instruction(&svm->vcpu); 1116 skip_emulated_instruction(&svm->vcpu);
1137 } 1117 }
@@ -1176,7 +1156,20 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
1176 case MSR_IA32_SYSENTER_ESP: 1156 case MSR_IA32_SYSENTER_ESP:
1177 svm->vmcb->save.sysenter_esp = data; 1157 svm->vmcb->save.sysenter_esp = data;
1178 break; 1158 break;
1159 case MSR_K7_EVNTSEL0:
1160 case MSR_K7_EVNTSEL1:
1161 case MSR_K7_EVNTSEL2:
1162 case MSR_K7_EVNTSEL3:
1163 /*
1164 * only support writing 0 to the performance counters for now
1165 * to make Windows happy. Should be replaced by a real
1166 * performance counter emulation later.
1167 */
1168 if (data != 0)
1169 goto unhandled;
1170 break;
1179 default: 1171 default:
1172 unhandled:
1180 return kvm_set_msr_common(vcpu, ecx, data); 1173 return kvm_set_msr_common(vcpu, ecx, data);
1181 } 1174 }
1182 return 0; 1175 return 0;
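The MSR_K7_EVNTSEL0..3 cases added above accept only a zero write (enough for guests that merely reset the performance counters, per the comment) and punt anything else to the common handler via the unhandled label. A hedged sketch of that dispatch shape, with made-up helper names and an assumed MSR numbering:

#include <stdio.h>

static int set_msr_common(unsigned msr, unsigned long long data)
{
        printf("deferring msr 0x%x <- 0x%llx to common code\n", msr, data);
        return 0;
}

static int set_msr(unsigned msr, unsigned long long data)
{
        switch (msr) {
        case 0xc0010000 ... 0xc0010003:  /* K7 EVNTSEL0..3 (assumed numbering) */
                if (data != 0)
                        goto unhandled;  /* only a counter reset is emulated */
                return 0;
        default:
        unhandled:
                return set_msr_common(msr, data);
        }
}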
@@ -1184,12 +1177,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
1184 1177
1185static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1178static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1186{ 1179{
1187 u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX]; 1180 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1188 u64 data = (svm->vmcb->save.rax & -1u) 1181 u64 data = (svm->vmcb->save.rax & -1u)
1189 | ((u64)(svm->vcpu.regs[VCPU_REGS_RDX] & -1u) << 32); 1182 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
1190 svm->next_rip = svm->vmcb->save.rip + 2; 1183 svm->next_rip = svm->vmcb->save.rip + 2;
1191 if (svm_set_msr(&svm->vcpu, ecx, data)) 1184 if (svm_set_msr(&svm->vcpu, ecx, data))
1192 svm_inject_gp(&svm->vcpu, 0); 1185 kvm_inject_gp(&svm->vcpu, 0);
1193 else 1186 else
1194 skip_emulated_instruction(&svm->vcpu); 1187 skip_emulated_instruction(&svm->vcpu);
1195 return 1; 1188 return 1;
@@ -1213,7 +1206,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
1213 * possible 1206 * possible
1214 */ 1207 */
1215 if (kvm_run->request_interrupt_window && 1208 if (kvm_run->request_interrupt_window &&
1216 !svm->vcpu.irq_summary) { 1209 !svm->vcpu.arch.irq_summary) {
1217 ++svm->vcpu.stat.irq_window_exits; 1210 ++svm->vcpu.stat.irq_window_exits;
1218 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 1211 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
1219 return 0; 1212 return 0;
@@ -1227,10 +1220,12 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
1227 [SVM_EXIT_READ_CR0] = emulate_on_interception, 1220 [SVM_EXIT_READ_CR0] = emulate_on_interception,
1228 [SVM_EXIT_READ_CR3] = emulate_on_interception, 1221 [SVM_EXIT_READ_CR3] = emulate_on_interception,
1229 [SVM_EXIT_READ_CR4] = emulate_on_interception, 1222 [SVM_EXIT_READ_CR4] = emulate_on_interception,
1223 [SVM_EXIT_READ_CR8] = emulate_on_interception,
1230 /* for now: */ 1224 /* for now: */
1231 [SVM_EXIT_WRITE_CR0] = emulate_on_interception, 1225 [SVM_EXIT_WRITE_CR0] = emulate_on_interception,
1232 [SVM_EXIT_WRITE_CR3] = emulate_on_interception, 1226 [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
1233 [SVM_EXIT_WRITE_CR4] = emulate_on_interception, 1227 [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
1228 [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
1234 [SVM_EXIT_READ_DR0] = emulate_on_interception, 1229 [SVM_EXIT_READ_DR0] = emulate_on_interception,
1235 [SVM_EXIT_READ_DR1] = emulate_on_interception, 1230 [SVM_EXIT_READ_DR1] = emulate_on_interception,
1236 [SVM_EXIT_READ_DR2] = emulate_on_interception, 1231 [SVM_EXIT_READ_DR2] = emulate_on_interception,
@@ -1241,6 +1236,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
1241 [SVM_EXIT_WRITE_DR3] = emulate_on_interception, 1236 [SVM_EXIT_WRITE_DR3] = emulate_on_interception,
1242 [SVM_EXIT_WRITE_DR5] = emulate_on_interception, 1237 [SVM_EXIT_WRITE_DR5] = emulate_on_interception,
1243 [SVM_EXIT_WRITE_DR7] = emulate_on_interception, 1238 [SVM_EXIT_WRITE_DR7] = emulate_on_interception,
1239 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
1244 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, 1240 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
1245 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, 1241 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
1246 [SVM_EXIT_INTR] = nop_on_interception, 1242 [SVM_EXIT_INTR] = nop_on_interception,
@@ -1293,7 +1289,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1293 exit_code); 1289 exit_code);
1294 1290
1295 if (exit_code >= ARRAY_SIZE(svm_exit_handlers) 1291 if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
1296 || svm_exit_handlers[exit_code] == 0) { 1292 || !svm_exit_handlers[exit_code]) {
1297 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 1293 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
1298 kvm_run->hw.hardware_exit_reason = exit_code; 1294 kvm_run->hw.hardware_exit_reason = exit_code;
1299 return 0; 1295 return 0;
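handle_exit() above indexes svm_exit_handlers[] by the hardware exit code, bailing out to userspace when the code is past the end of the table or the slot is NULL (the check now spells that as !svm_exit_handlers[exit_code]). A self-contained sketch of the same table-dispatch pattern; the indices and handlers below are invented for illustration:

#include <stddef.h>
#include <stdio.h>

typedef int (*exit_handler_t)(void *vcpu_state);

static int handle_io(void *v)    { (void)v; return 1; }  /* stand-in handlers */
static int handle_cpuid(void *v) { (void)v; return 1; }

static exit_handler_t exit_handlers[] = {
        [0x72] = handle_cpuid,  /* illustrative slots, not real exit codes */
        [0x7b] = handle_io,
};

static int dispatch_exit(unsigned exit_code, void *vcpu_state)
{
        if (exit_code >= sizeof(exit_handlers) / sizeof(exit_handlers[0])
            || !exit_handlers[exit_code]) {
                fprintf(stderr, "unknown exit code 0x%x\n", exit_code);
                return 0;  /* hand the exit to user space */
        }
        return exit_handlers[exit_code](vcpu_state);
}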
@@ -1307,7 +1303,7 @@ static void reload_tss(struct kvm_vcpu *vcpu)
1307 int cpu = raw_smp_processor_id(); 1303 int cpu = raw_smp_processor_id();
1308 1304
1309 struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); 1305 struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
1310 svm_data->tss_desc->type = 9; //available 32/64-bit TSS 1306 svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */
1311 load_TR_desc(); 1307 load_TR_desc();
1312} 1308}
1313 1309
@@ -1348,7 +1344,6 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu)
1348 struct vmcb *vmcb = svm->vmcb; 1344 struct vmcb *vmcb = svm->vmcb;
1349 int intr_vector = -1; 1345 int intr_vector = -1;
1350 1346
1351 kvm_inject_pending_timer_irqs(vcpu);
1352 if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) && 1347 if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) &&
1353 ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) { 1348 ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) {
1354 intr_vector = vmcb->control.exit_int_info & 1349 intr_vector = vmcb->control.exit_int_info &
@@ -1388,20 +1383,20 @@ static void kvm_reput_irq(struct vcpu_svm *svm)
1388 push_irq(&svm->vcpu, control->int_vector); 1383 push_irq(&svm->vcpu, control->int_vector);
1389 } 1384 }
1390 1385
1391 svm->vcpu.interrupt_window_open = 1386 svm->vcpu.arch.interrupt_window_open =
1392 !(control->int_state & SVM_INTERRUPT_SHADOW_MASK); 1387 !(control->int_state & SVM_INTERRUPT_SHADOW_MASK);
1393} 1388}
1394 1389
1395static void svm_do_inject_vector(struct vcpu_svm *svm) 1390static void svm_do_inject_vector(struct vcpu_svm *svm)
1396{ 1391{
1397 struct kvm_vcpu *vcpu = &svm->vcpu; 1392 struct kvm_vcpu *vcpu = &svm->vcpu;
1398 int word_index = __ffs(vcpu->irq_summary); 1393 int word_index = __ffs(vcpu->arch.irq_summary);
1399 int bit_index = __ffs(vcpu->irq_pending[word_index]); 1394 int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
1400 int irq = word_index * BITS_PER_LONG + bit_index; 1395 int irq = word_index * BITS_PER_LONG + bit_index;
1401 1396
1402 clear_bit(bit_index, &vcpu->irq_pending[word_index]); 1397 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
1403 if (!vcpu->irq_pending[word_index]) 1398 if (!vcpu->arch.irq_pending[word_index])
1404 clear_bit(word_index, &vcpu->irq_summary); 1399 clear_bit(word_index, &vcpu->arch.irq_summary);
1405 svm_inject_irq(svm, irq); 1400 svm_inject_irq(svm, irq);
1406} 1401}
1407 1402
@@ -1411,11 +1406,11 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1411 struct vcpu_svm *svm = to_svm(vcpu); 1406 struct vcpu_svm *svm = to_svm(vcpu);
1412 struct vmcb_control_area *control = &svm->vmcb->control; 1407 struct vmcb_control_area *control = &svm->vmcb->control;
1413 1408
1414 svm->vcpu.interrupt_window_open = 1409 svm->vcpu.arch.interrupt_window_open =
1415 (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && 1410 (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
1416 (svm->vmcb->save.rflags & X86_EFLAGS_IF)); 1411 (svm->vmcb->save.rflags & X86_EFLAGS_IF));
1417 1412
1418 if (svm->vcpu.interrupt_window_open && svm->vcpu.irq_summary) 1413 if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary)
1419 /* 1414 /*
1420 * If interrupts enabled, and not blocked by sti or mov ss. Good. 1415 * If interrupts enabled, and not blocked by sti or mov ss. Good.
1421 */ 1416 */
@@ -1424,13 +1419,18 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1424 /* 1419 /*
1425 * Interrupts blocked. Wait for unblock. 1420 * Interrupts blocked. Wait for unblock.
1426 */ 1421 */
-	if (!svm->vcpu.interrupt_window_open &&
-	    (svm->vcpu.irq_summary || kvm_run->request_interrupt_window)) {
+	if (!svm->vcpu.arch.interrupt_window_open &&
+	    (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window))
 		control->intercept |= 1ULL << INTERCEPT_VINTR;
-	} else
+	else
 		control->intercept &= ~(1ULL << INTERCEPT_VINTR);
1432} 1427}
1433 1428
1429static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
1430{
1431 return 0;
1432}
1433
1434static void save_db_regs(unsigned long *db_regs) 1434static void save_db_regs(unsigned long *db_regs)
1435{ 1435{
1436 asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0])); 1436 asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0]));
@@ -1472,7 +1472,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1472 svm->host_cr2 = kvm_read_cr2(); 1472 svm->host_cr2 = kvm_read_cr2();
1473 svm->host_dr6 = read_dr6(); 1473 svm->host_dr6 = read_dr6();
1474 svm->host_dr7 = read_dr7(); 1474 svm->host_dr7 = read_dr7();
1475 svm->vmcb->save.cr2 = vcpu->cr2; 1475 svm->vmcb->save.cr2 = vcpu->arch.cr2;
1476 1476
1477 if (svm->vmcb->save.dr7 & 0xff) { 1477 if (svm->vmcb->save.dr7 & 0xff) {
1478 write_dr7(0); 1478 write_dr7(0);
@@ -1486,13 +1486,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1486 1486
1487 asm volatile ( 1487 asm volatile (
1488#ifdef CONFIG_X86_64 1488#ifdef CONFIG_X86_64
1489 "push %%rbx; push %%rcx; push %%rdx;" 1489 "push %%rbp; \n\t"
1490 "push %%rsi; push %%rdi; push %%rbp;"
1491 "push %%r8; push %%r9; push %%r10; push %%r11;"
1492 "push %%r12; push %%r13; push %%r14; push %%r15;"
1493#else 1490#else
1494 "push %%ebx; push %%ecx; push %%edx;" 1491 "push %%ebp; \n\t"
1495 "push %%esi; push %%edi; push %%ebp;"
1496#endif 1492#endif
1497 1493
1498#ifdef CONFIG_X86_64 1494#ifdef CONFIG_X86_64
@@ -1554,10 +1550,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1554 "mov %%r14, %c[r14](%[svm]) \n\t" 1550 "mov %%r14, %c[r14](%[svm]) \n\t"
1555 "mov %%r15, %c[r15](%[svm]) \n\t" 1551 "mov %%r15, %c[r15](%[svm]) \n\t"
1556 1552
1557 "pop %%r15; pop %%r14; pop %%r13; pop %%r12;" 1553 "pop %%rbp; \n\t"
1558 "pop %%r11; pop %%r10; pop %%r9; pop %%r8;"
1559 "pop %%rbp; pop %%rdi; pop %%rsi;"
1560 "pop %%rdx; pop %%rcx; pop %%rbx; \n\t"
1561#else 1554#else
1562 "mov %%ebx, %c[rbx](%[svm]) \n\t" 1555 "mov %%ebx, %c[rbx](%[svm]) \n\t"
1563 "mov %%ecx, %c[rcx](%[svm]) \n\t" 1556 "mov %%ecx, %c[rcx](%[svm]) \n\t"
@@ -1566,34 +1559,40 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1566 "mov %%edi, %c[rdi](%[svm]) \n\t" 1559 "mov %%edi, %c[rdi](%[svm]) \n\t"
1567 "mov %%ebp, %c[rbp](%[svm]) \n\t" 1560 "mov %%ebp, %c[rbp](%[svm]) \n\t"
1568 1561
1569 "pop %%ebp; pop %%edi; pop %%esi;" 1562 "pop %%ebp; \n\t"
1570 "pop %%edx; pop %%ecx; pop %%ebx; \n\t"
1571#endif 1563#endif
1572 : 1564 :
1573 : [svm]"a"(svm), 1565 : [svm]"a"(svm),
1574 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), 1566 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
1575 [rbx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBX])), 1567 [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
1576 [rcx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RCX])), 1568 [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
1577 [rdx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDX])), 1569 [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
1578 [rsi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RSI])), 1570 [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
1579 [rdi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDI])), 1571 [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
1580 [rbp]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBP])) 1572 [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
1581#ifdef CONFIG_X86_64 1573#ifdef CONFIG_X86_64
1582 ,[r8 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R8])), 1574 , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
1583 [r9 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R9 ])), 1575 [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
1584 [r10]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R10])), 1576 [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
1585 [r11]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R11])), 1577 [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
1586 [r12]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R12])), 1578 [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
1587 [r13]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R13])), 1579 [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
1588 [r14]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R14])), 1580 [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
1589 [r15]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R15])) 1581 [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
1590#endif 1582#endif
1591 : "cc", "memory" ); 1583 : "cc", "memory"
1584#ifdef CONFIG_X86_64
1585 , "rbx", "rcx", "rdx", "rsi", "rdi"
1586 , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
1587#else
1588 , "ebx", "ecx", "edx" , "esi", "edi"
1589#endif
1590 );
1592 1591
1593 if ((svm->vmcb->save.dr7 & 0xff)) 1592 if ((svm->vmcb->save.dr7 & 0xff))
1594 load_db_regs(svm->host_db_regs); 1593 load_db_regs(svm->host_db_regs);
1595 1594
1596 vcpu->cr2 = svm->vmcb->save.cr2; 1595 vcpu->arch.cr2 = svm->vmcb->save.cr2;
1597 1596
1598 write_dr6(svm->host_dr6); 1597 write_dr6(svm->host_dr6);
1599 write_dr7(svm->host_dr7); 1598 write_dr7(svm->host_dr7);
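In the vcpu_run asm above, the long lists of pushes and pops are gone: only %rbp (or %ebp) is saved by hand, and every other general-purpose register is declared in the clobber list so the compiler spills and restores whatever it was actually using. A toy x86-64 example of the same idea (the asm body is a placeholder, not VMRUN):

/* Minimal sketch: let the compiler handle clobbered registers itself. */
static inline unsigned long stir_registers(unsigned long x)
{
        asm volatile("push %%rbp    \n\t"  /* only %rbp is saved manually */
                     "add  $1, %0   \n\t"  /* placeholder for the real body */
                     "pop  %%rbp    \n\t"
                     : "+r"(x)
                     :
                     : "cc", "memory",
                       "rbx", "rcx", "rdx", "rsi", "rdi",
                       "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15");
        return x;
}

The frame-pointer register usually cannot be named as a clobber, which is presumably why %rbp is still pushed and popped explicitly in the kernel code.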
@@ -1627,34 +1626,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
1627 } 1626 }
1628} 1627}
1629 1628
1630static void svm_inject_page_fault(struct kvm_vcpu *vcpu,
1631 unsigned long addr,
1632 uint32_t err_code)
1633{
1634 struct vcpu_svm *svm = to_svm(vcpu);
1635 uint32_t exit_int_info = svm->vmcb->control.exit_int_info;
1636
1637 ++vcpu->stat.pf_guest;
1638
1639 if (is_page_fault(exit_int_info)) {
1640
1641 svm->vmcb->control.event_inj_err = 0;
1642 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID |
1643 SVM_EVTINJ_VALID_ERR |
1644 SVM_EVTINJ_TYPE_EXEPT |
1645 DF_VECTOR;
1646 return;
1647 }
1648 vcpu->cr2 = addr;
1649 svm->vmcb->save.cr2 = addr;
1650 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID |
1651 SVM_EVTINJ_VALID_ERR |
1652 SVM_EVTINJ_TYPE_EXEPT |
1653 PF_VECTOR;
1654 svm->vmcb->control.event_inj_err = err_code;
1655}
1656
1657
1658static int is_disabled(void) 1629static int is_disabled(void)
1659{ 1630{
1660 u64 vm_cr; 1631 u64 vm_cr;
@@ -1675,7 +1646,6 @@ svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
1675 hypercall[0] = 0x0f; 1646 hypercall[0] = 0x0f;
1676 hypercall[1] = 0x01; 1647 hypercall[1] = 0x01;
1677 hypercall[2] = 0xd9; 1648 hypercall[2] = 0xd9;
1678 hypercall[3] = 0xc3;
1679} 1649}
1680 1650
1681static void svm_check_processor_compat(void *rtn) 1651static void svm_check_processor_compat(void *rtn)
@@ -1683,6 +1653,11 @@ static void svm_check_processor_compat(void *rtn)
1683 *(int *)rtn = 0; 1653 *(int *)rtn = 0;
1684} 1654}
1685 1655
1656static bool svm_cpu_has_accelerated_tpr(void)
1657{
1658 return false;
1659}
1660
1686static struct kvm_x86_ops svm_x86_ops = { 1661static struct kvm_x86_ops svm_x86_ops = {
1687 .cpu_has_kvm_support = has_svm, 1662 .cpu_has_kvm_support = has_svm,
1688 .disabled_by_bios = is_disabled, 1663 .disabled_by_bios = is_disabled,
@@ -1691,6 +1666,7 @@ static struct kvm_x86_ops svm_x86_ops = {
1691 .check_processor_compatibility = svm_check_processor_compat, 1666 .check_processor_compatibility = svm_check_processor_compat,
1692 .hardware_enable = svm_hardware_enable, 1667 .hardware_enable = svm_hardware_enable,
1693 .hardware_disable = svm_hardware_disable, 1668 .hardware_disable = svm_hardware_disable,
1669 .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
1694 1670
1695 .vcpu_create = svm_create_vcpu, 1671 .vcpu_create = svm_create_vcpu,
1696 .vcpu_free = svm_free_vcpu, 1672 .vcpu_free = svm_free_vcpu,
@@ -1725,9 +1701,6 @@ static struct kvm_x86_ops svm_x86_ops = {
1725 .set_rflags = svm_set_rflags, 1701 .set_rflags = svm_set_rflags,
1726 1702
1727 .tlb_flush = svm_flush_tlb, 1703 .tlb_flush = svm_flush_tlb,
1728 .inject_page_fault = svm_inject_page_fault,
1729
1730 .inject_gp = svm_inject_gp,
1731 1704
1732 .run = svm_vcpu_run, 1705 .run = svm_vcpu_run,
1733 .handle_exit = handle_exit, 1706 .handle_exit = handle_exit,
@@ -1735,19 +1708,23 @@ static struct kvm_x86_ops svm_x86_ops = {
1735 .patch_hypercall = svm_patch_hypercall, 1708 .patch_hypercall = svm_patch_hypercall,
1736 .get_irq = svm_get_irq, 1709 .get_irq = svm_get_irq,
1737 .set_irq = svm_set_irq, 1710 .set_irq = svm_set_irq,
1711 .queue_exception = svm_queue_exception,
1712 .exception_injected = svm_exception_injected,
1738 .inject_pending_irq = svm_intr_assist, 1713 .inject_pending_irq = svm_intr_assist,
1739 .inject_pending_vectors = do_interrupt_requests, 1714 .inject_pending_vectors = do_interrupt_requests,
1715
1716 .set_tss_addr = svm_set_tss_addr,
1740}; 1717};
1741 1718
1742static int __init svm_init(void) 1719static int __init svm_init(void)
1743{ 1720{
1744 return kvm_init_x86(&svm_x86_ops, sizeof(struct vcpu_svm), 1721 return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
1745 THIS_MODULE); 1722 THIS_MODULE);
1746} 1723}
1747 1724
1748static void __exit svm_exit(void) 1725static void __exit svm_exit(void)
1749{ 1726{
1750 kvm_exit_x86(); 1727 kvm_exit();
1751} 1728}
1752 1729
1753module_init(svm_init) 1730module_init(svm_init)
diff --git a/drivers/kvm/svm.h b/arch/x86/kvm/svm.h
index 3b1b0f35b6cb..5fd50491b555 100644
--- a/drivers/kvm/svm.h
+++ b/arch/x86/kvm/svm.h
@@ -204,6 +204,7 @@ struct __attribute__ ((__packed__)) vmcb {
204#define INTERCEPT_CR0_MASK 1 204#define INTERCEPT_CR0_MASK 1
205#define INTERCEPT_CR3_MASK (1 << 3) 205#define INTERCEPT_CR3_MASK (1 << 3)
206#define INTERCEPT_CR4_MASK (1 << 4) 206#define INTERCEPT_CR4_MASK (1 << 4)
207#define INTERCEPT_CR8_MASK (1 << 8)
207 208
208#define INTERCEPT_DR0_MASK 1 209#define INTERCEPT_DR0_MASK 1
209#define INTERCEPT_DR1_MASK (1 << 1) 210#define INTERCEPT_DR1_MASK (1 << 1)
@@ -311,7 +312,7 @@ struct __attribute__ ((__packed__)) vmcb {
311 312
312#define SVM_EXIT_ERR -1 313#define SVM_EXIT_ERR -1
313 314
314#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) // TS and MP 315#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */
315 316
316#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" 317#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
317#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8" 318#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8"
diff --git a/drivers/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bb56ae3f89b6..ad36447e696e 100644
--- a/drivers/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -15,17 +15,18 @@
15 * 15 *
16 */ 16 */
17 17
18#include "kvm.h"
19#include "x86_emulate.h"
20#include "irq.h" 18#include "irq.h"
21#include "vmx.h" 19#include "vmx.h"
22#include "segment_descriptor.h" 20#include "segment_descriptor.h"
21#include "mmu.h"
23 22
23#include <linux/kvm_host.h>
24#include <linux/module.h> 24#include <linux/module.h>
25#include <linux/kernel.h> 25#include <linux/kernel.h>
26#include <linux/mm.h> 26#include <linux/mm.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/sched.h> 28#include <linux/sched.h>
29#include <linux/moduleparam.h>
29 30
30#include <asm/io.h> 31#include <asm/io.h>
31#include <asm/desc.h> 32#include <asm/desc.h>
@@ -33,6 +34,9 @@
33MODULE_AUTHOR("Qumranet"); 34MODULE_AUTHOR("Qumranet");
34MODULE_LICENSE("GPL"); 35MODULE_LICENSE("GPL");
35 36
37static int bypass_guest_pf = 1;
38module_param(bypass_guest_pf, bool, 0);
39
36struct vmcs { 40struct vmcs {
37 u32 revision_id; 41 u32 revision_id;
38 u32 abort; 42 u32 abort;
@@ -43,6 +47,7 @@ struct vcpu_vmx {
43 struct kvm_vcpu vcpu; 47 struct kvm_vcpu vcpu;
44 int launched; 48 int launched;
45 u8 fail; 49 u8 fail;
50 u32 idt_vectoring_info;
46 struct kvm_msr_entry *guest_msrs; 51 struct kvm_msr_entry *guest_msrs;
47 struct kvm_msr_entry *host_msrs; 52 struct kvm_msr_entry *host_msrs;
48 int nmsrs; 53 int nmsrs;
@@ -57,8 +62,15 @@ struct vcpu_vmx {
57 u16 fs_sel, gs_sel, ldt_sel; 62 u16 fs_sel, gs_sel, ldt_sel;
58 int gs_ldt_reload_needed; 63 int gs_ldt_reload_needed;
59 int fs_reload_needed; 64 int fs_reload_needed;
-	}host_state;
-
+		int guest_efer_loaded;
+	} host_state;
+	struct {
+		struct {
+			bool pending;
+			u8 vector;
+			unsigned rip;
+		} irq;
+	} rmode;
 };
63 75
64static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) 76static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -74,14 +86,13 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
74static struct page *vmx_io_bitmap_a; 86static struct page *vmx_io_bitmap_a;
75static struct page *vmx_io_bitmap_b; 87static struct page *vmx_io_bitmap_b;
76 88
77#define EFER_SAVE_RESTORE_BITS ((u64)EFER_SCE)
78
79static struct vmcs_config { 89static struct vmcs_config {
80 int size; 90 int size;
81 int order; 91 int order;
82 u32 revision_id; 92 u32 revision_id;
83 u32 pin_based_exec_ctrl; 93 u32 pin_based_exec_ctrl;
84 u32 cpu_based_exec_ctrl; 94 u32 cpu_based_exec_ctrl;
95 u32 cpu_based_2nd_exec_ctrl;
85 u32 vmexit_ctrl; 96 u32 vmexit_ctrl;
86 u32 vmentry_ctrl; 97 u32 vmentry_ctrl;
87} vmcs_config; 98} vmcs_config;
@@ -138,18 +149,6 @@ static void save_msrs(struct kvm_msr_entry *e, int n)
138 rdmsrl(e[i].index, e[i].data); 149 rdmsrl(e[i].index, e[i].data);
139} 150}
140 151
141static inline u64 msr_efer_save_restore_bits(struct kvm_msr_entry msr)
142{
143 return (u64)msr.data & EFER_SAVE_RESTORE_BITS;
144}
145
146static inline int msr_efer_need_save_restore(struct vcpu_vmx *vmx)
147{
148 int efer_offset = vmx->msr_offset_efer;
149 return msr_efer_save_restore_bits(vmx->host_msrs[efer_offset]) !=
150 msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]);
151}
152
153static inline int is_page_fault(u32 intr_info) 152static inline int is_page_fault(u32 intr_info)
154{ 153{
155 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 154 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
@@ -164,6 +163,13 @@ static inline int is_no_device(u32 intr_info)
164 (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); 163 (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
165} 164}
166 165
166static inline int is_invalid_opcode(u32 intr_info)
167{
168 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
169 INTR_INFO_VALID_MASK)) ==
170 (INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
171}
172
167static inline int is_external_interrupt(u32 intr_info) 173static inline int is_external_interrupt(u32 intr_info)
168{ 174{
169 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) 175 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -180,6 +186,24 @@ static inline int vm_need_tpr_shadow(struct kvm *kvm)
180 return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm))); 186 return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)));
181} 187}
182 188
189static inline int cpu_has_secondary_exec_ctrls(void)
190{
191 return (vmcs_config.cpu_based_exec_ctrl &
192 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS);
193}
194
195static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
196{
197 return (vmcs_config.cpu_based_2nd_exec_ctrl &
198 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
199}
200
201static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
202{
203 return ((cpu_has_vmx_virtualize_apic_accesses()) &&
204 (irqchip_in_kernel(kvm)));
205}
206
183static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) 207static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
184{ 208{
185 int i; 209 int i;
@@ -222,16 +246,14 @@ static void __vcpu_clear(void *arg)
222 vmcs_clear(vmx->vmcs); 246 vmcs_clear(vmx->vmcs);
223 if (per_cpu(current_vmcs, cpu) == vmx->vmcs) 247 if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
224 per_cpu(current_vmcs, cpu) = NULL; 248 per_cpu(current_vmcs, cpu) = NULL;
225 rdtscll(vmx->vcpu.host_tsc); 249 rdtscll(vmx->vcpu.arch.host_tsc);
226} 250}
227 251
228static void vcpu_clear(struct vcpu_vmx *vmx) 252static void vcpu_clear(struct vcpu_vmx *vmx)
229{ 253{
-	if (vmx->vcpu.cpu != raw_smp_processor_id() && vmx->vcpu.cpu != -1)
-		smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear,
-					 vmx, 0, 1);
-	else
-		__vcpu_clear(vmx);
+	if (vmx->vcpu.cpu == -1)
+		return;
+	smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1);
235 vmx->launched = 0; 257 vmx->launched = 0;
236} 258}
237 259
@@ -275,7 +297,7 @@ static void vmcs_writel(unsigned long field, unsigned long value)
275 u8 error; 297 u8 error;
276 298
277 asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0" 299 asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
278 : "=q"(error) : "a"(value), "d"(field) : "cc" ); 300 : "=q"(error) : "a"(value), "d"(field) : "cc");
279 if (unlikely(error)) 301 if (unlikely(error))
280 vmwrite_error(field, value); 302 vmwrite_error(field, value);
281} 303}
@@ -315,12 +337,12 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
315{ 337{
316 u32 eb; 338 u32 eb;
317 339
318 eb = 1u << PF_VECTOR; 340 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR);
319 if (!vcpu->fpu_active) 341 if (!vcpu->fpu_active)
320 eb |= 1u << NM_VECTOR; 342 eb |= 1u << NM_VECTOR;
321 if (vcpu->guest_debug.enabled) 343 if (vcpu->guest_debug.enabled)
322 eb |= 1u << 1; 344 eb |= 1u << 1;
323 if (vcpu->rmode.active) 345 if (vcpu->arch.rmode.active)
324 eb = ~0; 346 eb = ~0;
325 vmcs_write32(EXCEPTION_BITMAP, eb); 347 vmcs_write32(EXCEPTION_BITMAP, eb);
326} 348}
@@ -344,16 +366,42 @@ static void reload_tss(void)
344 366
345static void load_transition_efer(struct vcpu_vmx *vmx) 367static void load_transition_efer(struct vcpu_vmx *vmx)
346{ 368{
-	u64 trans_efer;
 	int efer_offset = vmx->msr_offset_efer;
+	u64 host_efer = vmx->host_msrs[efer_offset].data;
+	u64 guest_efer = vmx->guest_msrs[efer_offset].data;
+	u64 ignore_bits;
 
-	trans_efer = vmx->host_msrs[efer_offset].data;
-	trans_efer &= ~EFER_SAVE_RESTORE_BITS;
-	trans_efer |= msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]);
-	wrmsrl(MSR_EFER, trans_efer);
+	if (efer_offset < 0)
+		return;
+	/*
+	 * NX is emulated; LMA and LME handled by hardware; SCE meaninless
+	 * outside long mode
+	 */
+	ignore_bits = EFER_NX | EFER_SCE;
+#ifdef CONFIG_X86_64
+	ignore_bits |= EFER_LMA | EFER_LME;
+	/* SCE is meaningful only in long mode on Intel */
+	if (guest_efer & EFER_LMA)
+		ignore_bits &= ~(u64)EFER_SCE;
+#endif
+	if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
+		return;
+
+	vmx->host_state.guest_efer_loaded = 1;
+	guest_efer &= ~ignore_bits;
+	guest_efer |= host_efer & ignore_bits;
+	wrmsrl(MSR_EFER, guest_efer);
 	vmx->vcpu.stat.efer_reload++;
 }
356 396
397static void reload_host_efer(struct vcpu_vmx *vmx)
398{
399 if (vmx->host_state.guest_efer_loaded) {
400 vmx->host_state.guest_efer_loaded = 0;
401 load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
402 }
403}
404
357static void vmx_save_host_state(struct kvm_vcpu *vcpu) 405static void vmx_save_host_state(struct kvm_vcpu *vcpu)
358{ 406{
359 struct vcpu_vmx *vmx = to_vmx(vcpu); 407 struct vcpu_vmx *vmx = to_vmx(vcpu);
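The rewritten load_transition_efer() above, together with the new reload_host_efer(), switches EFER only when the guest value differs from the host's in bits that matter, masking out bits that are emulated or handled by hardware during the transition. The comparison at its core, as a hedged stand-alone sketch (bit positions follow the architectural EFER layout; the helper name is invented):

#include <stdbool.h>
#include <stdint.h>

#define EFER_SCE (1ULL << 0)   /* syscall enable */
#define EFER_LME (1ULL << 8)   /* long mode enable */
#define EFER_LMA (1ULL << 10)  /* long mode active */
#define EFER_NX  (1ULL << 11)  /* no-execute enable */

/* Returns true if EFER really has to be rewritten for this guest. */
static bool efer_switch_needed(uint64_t host_efer, uint64_t guest_efer)
{
        /* NX is emulated; LMA/LME are handled by hardware on the switch. */
        uint64_t ignore_bits = EFER_NX | EFER_SCE | EFER_LMA | EFER_LME;

        /* SCE only matters once the guest is actually in long mode. */
        if (guest_efer & EFER_LMA)
                ignore_bits &= ~(uint64_t)EFER_SCE;

        return (guest_efer & ~ignore_bits) != (host_efer & ~ignore_bits);
}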
@@ -393,14 +441,13 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
393#endif 441#endif
394 442
395#ifdef CONFIG_X86_64 443#ifdef CONFIG_X86_64
-	if (is_long_mode(&vmx->vcpu)) {
+	if (is_long_mode(&vmx->vcpu))
 		save_msrs(vmx->host_msrs +
 			  vmx->msr_offset_kernel_gs_base, 1);
-	}
+
 #endif
 	load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
-	if (msr_efer_need_save_restore(vmx))
-		load_transition_efer(vmx);
+	load_transition_efer(vmx);
404} 451}
405 452
406static void vmx_load_host_state(struct vcpu_vmx *vmx) 453static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -410,6 +457,7 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
410 if (!vmx->host_state.loaded) 457 if (!vmx->host_state.loaded)
411 return; 458 return;
412 459
460 ++vmx->vcpu.stat.host_state_reload;
413 vmx->host_state.loaded = 0; 461 vmx->host_state.loaded = 0;
414 if (vmx->host_state.fs_reload_needed) 462 if (vmx->host_state.fs_reload_needed)
415 load_fs(vmx->host_state.fs_sel); 463 load_fs(vmx->host_state.fs_sel);
@@ -429,8 +477,7 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
429 reload_tss(); 477 reload_tss();
430 save_msrs(vmx->guest_msrs, vmx->save_nmsrs); 478 save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
431 load_msrs(vmx->host_msrs, vmx->save_nmsrs); 479 load_msrs(vmx->host_msrs, vmx->save_nmsrs);
-	if (msr_efer_need_save_restore(vmx))
-		load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
+	reload_host_efer(vmx);
434} 481}
435 482
436/* 483/*
@@ -480,7 +527,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
480 * Make sure the time stamp counter is monotonous. 527 * Make sure the time stamp counter is monotonous.
481 */ 528 */
482 rdtscll(tsc_this); 529 rdtscll(tsc_this);
483 delta = vcpu->host_tsc - tsc_this; 530 delta = vcpu->arch.host_tsc - tsc_this;
484 vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta); 531 vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta);
485 } 532 }
486} 533}
@@ -488,7 +535,6 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
488static void vmx_vcpu_put(struct kvm_vcpu *vcpu) 535static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
489{ 536{
490 vmx_load_host_state(to_vmx(vcpu)); 537 vmx_load_host_state(to_vmx(vcpu));
491 kvm_put_guest_fpu(vcpu);
492} 538}
493 539
494static void vmx_fpu_activate(struct kvm_vcpu *vcpu) 540static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
@@ -497,7 +543,7 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
497 return; 543 return;
498 vcpu->fpu_active = 1; 544 vcpu->fpu_active = 1;
499 vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); 545 vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
500 if (vcpu->cr0 & X86_CR0_TS) 546 if (vcpu->arch.cr0 & X86_CR0_TS)
501 vmcs_set_bits(GUEST_CR0, X86_CR0_TS); 547 vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
502 update_exception_bitmap(vcpu); 548 update_exception_bitmap(vcpu);
503} 549}
@@ -523,8 +569,8 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
523 569
524static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 570static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
525{ 571{
526 if (vcpu->rmode.active) 572 if (vcpu->arch.rmode.active)
527 rflags |= IOPL_MASK | X86_EFLAGS_VM; 573 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
528 vmcs_writel(GUEST_RFLAGS, rflags); 574 vmcs_writel(GUEST_RFLAGS, rflags);
529} 575}
530 576
@@ -545,19 +591,25 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
545 if (interruptibility & 3) 591 if (interruptibility & 3)
546 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 592 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
547 interruptibility & ~3); 593 interruptibility & ~3);
548 vcpu->interrupt_window_open = 1; 594 vcpu->arch.interrupt_window_open = 1;
549} 595}
550 596
-static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
+static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
+				bool has_error_code, u32 error_code)
 {
-	printk(KERN_DEBUG "inject_general_protection: rip 0x%lx\n",
-	       vmcs_readl(GUEST_RIP));
-	vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-		     GP_VECTOR |
-		     INTR_TYPE_EXCEPTION |
-		     INTR_INFO_DELIEVER_CODE_MASK |
-		     INTR_INFO_VALID_MASK);
+		     nr | INTR_TYPE_EXCEPTION
+		     | (has_error_code ? INTR_INFO_DELIEVER_CODE_MASK : 0)
+		     | INTR_INFO_VALID_MASK);
+	if (has_error_code)
+		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
+}
+
+static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
 }
562 614
563/* 615/*
@@ -608,7 +660,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
608 * if efer.sce is enabled. 660 * if efer.sce is enabled.
609 */ 661 */
610 index = __find_msr_index(vmx, MSR_K6_STAR); 662 index = __find_msr_index(vmx, MSR_K6_STAR);
611 if ((index >= 0) && (vmx->vcpu.shadow_efer & EFER_SCE)) 663 if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE))
612 move_msr_up(vmx, index, save_nmsrs++); 664 move_msr_up(vmx, index, save_nmsrs++);
613 } 665 }
614#endif 666#endif
@@ -712,8 +764,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
712#ifdef CONFIG_X86_64 764#ifdef CONFIG_X86_64
713 case MSR_EFER: 765 case MSR_EFER:
714 ret = kvm_set_msr_common(vcpu, msr_index, data); 766 ret = kvm_set_msr_common(vcpu, msr_index, data);
715 if (vmx->host_state.loaded) 767 if (vmx->host_state.loaded) {
768 reload_host_efer(vmx);
716 load_transition_efer(vmx); 769 load_transition_efer(vmx);
770 }
717 break; 771 break;
718 case MSR_FS_BASE: 772 case MSR_FS_BASE:
719 vmcs_writel(GUEST_FS_BASE, data); 773 vmcs_writel(GUEST_FS_BASE, data);
@@ -750,12 +804,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
750 804
751/* 805/*
752 * Sync the rsp and rip registers into the vcpu structure. This allows 806 * Sync the rsp and rip registers into the vcpu structure. This allows
753 * registers to be accessed by indexing vcpu->regs. 807 * registers to be accessed by indexing vcpu->arch.regs.
754 */ 808 */
755static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu) 809static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
756{ 810{
757 vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); 811 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
758 vcpu->rip = vmcs_readl(GUEST_RIP); 812 vcpu->arch.rip = vmcs_readl(GUEST_RIP);
759} 813}
760 814
761/* 815/*
@@ -764,8 +818,8 @@ static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
764 */ 818 */
765static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu) 819static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
766{ 820{
767 vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]); 821 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
768 vmcs_writel(GUEST_RIP, vcpu->rip); 822 vmcs_writel(GUEST_RIP, vcpu->arch.rip);
769} 823}
770 824
771static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) 825static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
@@ -808,14 +862,15 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
808 862
809static int vmx_get_irq(struct kvm_vcpu *vcpu) 863static int vmx_get_irq(struct kvm_vcpu *vcpu)
810{ 864{
865 struct vcpu_vmx *vmx = to_vmx(vcpu);
811 u32 idtv_info_field; 866 u32 idtv_info_field;
812 867
813 idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD); 868 idtv_info_field = vmx->idt_vectoring_info;
814 if (idtv_info_field & INTR_INFO_VALID_MASK) { 869 if (idtv_info_field & INTR_INFO_VALID_MASK) {
815 if (is_external_interrupt(idtv_info_field)) 870 if (is_external_interrupt(idtv_info_field))
816 return idtv_info_field & VECTORING_INFO_VECTOR_MASK; 871 return idtv_info_field & VECTORING_INFO_VECTOR_MASK;
817 else 872 else
818 printk("pending exception: not handled yet\n"); 873 printk(KERN_DEBUG "pending exception: not handled yet\n");
819 } 874 }
820 return -1; 875 return -1;
821} 876}
@@ -863,7 +918,7 @@ static void hardware_disable(void *garbage)
863} 918}
864 919
865static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, 920static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
866 u32 msr, u32* result) 921 u32 msr, u32 *result)
867{ 922{
868 u32 vmx_msr_low, vmx_msr_high; 923 u32 vmx_msr_low, vmx_msr_high;
869 u32 ctl = ctl_min | ctl_opt; 924 u32 ctl = ctl_min | ctl_opt;
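adjust_vmx_controls(), whose signature is touched above but whose body is not shown in this hunk, reconciles the controls the caller wants with what the VMX capability MSR reports: roughly, the MSR's high word limits which bits may be set and its low word forces bits that must be set. A user-space rendering of that reconciliation under those assumptions, with the rdmsr replaced by plain parameters:

#include <stdint.h>

/*
 * allowed0: bits that must be 1 (low word of the capability MSR)
 * allowed1: bits that may  be 1 (high word of the capability MSR)
 * Returns 0 on success, -1 if a required bit cannot be enabled.
 */
static int adjust_controls(uint32_t ctl_min, uint32_t ctl_opt,
                           uint32_t allowed0, uint32_t allowed1,
                           uint32_t *result)
{
        uint32_t ctl = ctl_min | ctl_opt;

        ctl &= allowed1;   /* drop optional bits the CPU cannot set */
        ctl |= allowed0;   /* force bits the CPU insists on */

        if ((ctl & ctl_min) != ctl_min)  /* a mandatory bit got masked off */
                return -1;

        *result = ctl;
        return 0;
}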
@@ -887,6 +942,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
887 u32 min, opt; 942 u32 min, opt;
888 u32 _pin_based_exec_control = 0; 943 u32 _pin_based_exec_control = 0;
889 u32 _cpu_based_exec_control = 0; 944 u32 _cpu_based_exec_control = 0;
945 u32 _cpu_based_2nd_exec_control = 0;
890 u32 _vmexit_control = 0; 946 u32 _vmexit_control = 0;
891 u32 _vmentry_control = 0; 947 u32 _vmentry_control = 0;
892 948
@@ -904,11 +960,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
904 CPU_BASED_USE_IO_BITMAPS | 960 CPU_BASED_USE_IO_BITMAPS |
905 CPU_BASED_MOV_DR_EXITING | 961 CPU_BASED_MOV_DR_EXITING |
906 CPU_BASED_USE_TSC_OFFSETING; 962 CPU_BASED_USE_TSC_OFFSETING;
-#ifdef CONFIG_X86_64
-	opt = CPU_BASED_TPR_SHADOW;
-#else
-	opt = 0;
-#endif
+	opt = CPU_BASED_TPR_SHADOW |
+	      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
912 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, 965 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
913 &_cpu_based_exec_control) < 0) 966 &_cpu_based_exec_control) < 0)
914 return -EIO; 967 return -EIO;
@@ -917,6 +970,19 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
917 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & 970 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
918 ~CPU_BASED_CR8_STORE_EXITING; 971 ~CPU_BASED_CR8_STORE_EXITING;
919#endif 972#endif
973 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
974 min = 0;
975 opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
976 SECONDARY_EXEC_WBINVD_EXITING;
977 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2,
978 &_cpu_based_2nd_exec_control) < 0)
979 return -EIO;
980 }
981#ifndef CONFIG_X86_64
982 if (!(_cpu_based_2nd_exec_control &
983 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
984 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
985#endif
920 986
921 min = 0; 987 min = 0;
922#ifdef CONFIG_X86_64 988#ifdef CONFIG_X86_64
@@ -954,6 +1020,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
954 1020
955 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; 1021 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
956 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; 1022 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
1023 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
957 vmcs_conf->vmexit_ctrl = _vmexit_control; 1024 vmcs_conf->vmexit_ctrl = _vmexit_control;
958 vmcs_conf->vmentry_ctrl = _vmentry_control; 1025 vmcs_conf->vmentry_ctrl = _vmentry_control;
959 1026
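This hunk feeds a second, optional control word through the same min/opt negotiation that adjust_vmx_controls() already performs for the primary controls, and only probes MSR_IA32_VMX_PROCBASED_CTLS2 once CPU_BASED_ACTIVATE_SECONDARY_CONTROLS has survived the first round. The function body is not part of this hunk; the sketch below is a standalone model of the usual negotiation, assuming the conventional VMX capability-MSR split (low half = bits that must be 1, high half = bits that may be 1).

#include <stdint.h>
#include <stdio.h>

static int adjust_controls(uint32_t ctl_min, uint32_t ctl_opt,
			   uint32_t msr_low, uint32_t msr_high,
			   uint32_t *result)
{
	uint32_t ctl = ctl_min | ctl_opt;

	ctl &= msr_high;	/* clear optional bits the CPU cannot set */
	ctl |= msr_low;		/* force bits the CPU always requires */

	if ((ctl & ctl_min) != ctl_min)
		return -1;	/* a required feature is missing */
	*result = ctl;
	return 0;
}

int main(void)
{
	uint32_t out;
	/* required bit 0x2 supported, optional bit 0x80000000 not supported */
	if (adjust_controls(0x2, 0x80000000u, 0x0, 0x7fffffffu, &out) == 0)
		printf("negotiated controls: %#x\n", out);	/* -> 0x2 */
	return 0;
}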
@@ -1043,15 +1110,15 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1043{ 1110{
1044 unsigned long flags; 1111 unsigned long flags;
1045 1112
1046 vcpu->rmode.active = 0; 1113 vcpu->arch.rmode.active = 0;
1047 1114
1048 vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base); 1115 vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
1049 vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit); 1116 vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
1050 vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar); 1117 vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar);
1051 1118
1052 flags = vmcs_readl(GUEST_RFLAGS); 1119 flags = vmcs_readl(GUEST_RFLAGS);
1053 flags &= ~(IOPL_MASK | X86_EFLAGS_VM); 1120 flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
1054 flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT); 1121 flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT);
1055 vmcs_writel(GUEST_RFLAGS, flags); 1122 vmcs_writel(GUEST_RFLAGS, flags);
1056 1123
1057 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | 1124 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
@@ -1059,10 +1126,10 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1059 1126
1060 update_exception_bitmap(vcpu); 1127 update_exception_bitmap(vcpu);
1061 1128
1062 fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->rmode.es); 1129 fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
1063 fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->rmode.ds); 1130 fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
1064 fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->rmode.gs); 1131 fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
1065 fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->rmode.fs); 1132 fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
1066 1133
1067 vmcs_write16(GUEST_SS_SELECTOR, 0); 1134 vmcs_write16(GUEST_SS_SELECTOR, 0);
1068 vmcs_write32(GUEST_SS_AR_BYTES, 0x93); 1135 vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
@@ -1072,10 +1139,14 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1072 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); 1139 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1073} 1140}
1074 1141
1075static gva_t rmode_tss_base(struct kvm* kvm) 1142static gva_t rmode_tss_base(struct kvm *kvm)
1076{ 1143{
-	gfn_t base_gfn = kvm->memslots[0].base_gfn + kvm->memslots[0].npages - 3;
-	return base_gfn << PAGE_SHIFT;
+	if (!kvm->arch.tss_addr) {
+		gfn_t base_gfn = kvm->memslots[0].base_gfn +
+				 kvm->memslots[0].npages - 3;
+		return base_gfn << PAGE_SHIFT;
+	}
+	return kvm->arch.tss_addr;
1079} 1150}
1080 1151
1081static void fix_rmode_seg(int seg, struct kvm_save_segment *save) 1152static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
@@ -1086,7 +1157,8 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
1086 save->base = vmcs_readl(sf->base); 1157 save->base = vmcs_readl(sf->base);
1087 save->limit = vmcs_read32(sf->limit); 1158 save->limit = vmcs_read32(sf->limit);
1088 save->ar = vmcs_read32(sf->ar_bytes); 1159 save->ar = vmcs_read32(sf->ar_bytes);
-	vmcs_write16(sf->selector, vmcs_readl(sf->base) >> 4);
+	vmcs_write16(sf->selector, save->base >> 4);
+	vmcs_write32(sf->base, save->base & 0xfffff);
1090 vmcs_write32(sf->limit, 0xffff); 1162 vmcs_write32(sf->limit, 0xffff);
1091 vmcs_write32(sf->ar_bytes, 0xf3); 1163 vmcs_write32(sf->ar_bytes, 0xf3);
1092} 1164}
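fix_rmode_seg() now derives the selector from the cached base and clips the base to 20 bits, so the vm86 invariant base == selector << 4 holds even when the saved protected-mode base was larger. A minimal stand-alone illustration of that invariant; the struct below is a stand-in, not the kernel's kvm_save_segment, and the 0xf3 access-rights value mirrors the hunk (present, DPL 3, read/write data).

#include <stdint.h>
#include <assert.h>

struct rmode_seg {
	uint16_t selector;
	uint32_t base;
	uint32_t limit;
	uint32_t ar;
};

static void fix_rmode_seg(uint32_t saved_base, struct rmode_seg *s)
{
	s->selector = (uint16_t)(saved_base >> 4);
	s->base     = saved_base & 0xfffff;	/* 20-bit real-mode base */
	s->limit    = 0xffff;
	s->ar       = 0xf3;
}

int main(void)
{
	struct rmode_seg s;

	fix_rmode_seg(0xb8000, &s);
	assert(s.selector == 0xb800 && s.base == (uint32_t)s.selector << 4);
	return 0;
}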
@@ -1095,21 +1167,22 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1095{ 1167{
1096 unsigned long flags; 1168 unsigned long flags;
1097 1169
1098 vcpu->rmode.active = 1; 1170 vcpu->arch.rmode.active = 1;
1099 1171
1100 vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); 1172 vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
1101 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); 1173 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
1102 1174
1103 vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); 1175 vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
1104 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); 1176 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
1105 1177
1106 vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); 1178 vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
1107 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 1179 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1108 1180
1109 flags = vmcs_readl(GUEST_RFLAGS); 1181 flags = vmcs_readl(GUEST_RFLAGS);
-	vcpu->rmode.save_iopl = (flags & IOPL_MASK) >> IOPL_SHIFT;
+	vcpu->arch.rmode.save_iopl
+		= (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
 
-	flags |= IOPL_MASK | X86_EFLAGS_VM;
+	flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1113 1186
1114 vmcs_writel(GUEST_RFLAGS, flags); 1187 vmcs_writel(GUEST_RFLAGS, flags);
1115 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); 1188 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
@@ -1125,10 +1198,10 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1125 vmcs_writel(GUEST_CS_BASE, 0xf0000); 1198 vmcs_writel(GUEST_CS_BASE, 0xf0000);
1126 vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); 1199 vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
1127 1200
1128 fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es); 1201 fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
1129 fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds); 1202 fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
1130 fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs); 1203 fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
1131 fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs); 1204 fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
1132 1205
1133 kvm_mmu_reset_context(vcpu); 1206 kvm_mmu_reset_context(vcpu);
1134 init_rmode_tss(vcpu->kvm); 1207 init_rmode_tss(vcpu->kvm);
@@ -1149,7 +1222,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
1149 | AR_TYPE_BUSY_64_TSS); 1222 | AR_TYPE_BUSY_64_TSS);
1150 } 1223 }
1151 1224
1152 vcpu->shadow_efer |= EFER_LMA; 1225 vcpu->arch.shadow_efer |= EFER_LMA;
1153 1226
1154 find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME; 1227 find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME;
1155 vmcs_write32(VM_ENTRY_CONTROLS, 1228 vmcs_write32(VM_ENTRY_CONTROLS,
@@ -1159,7 +1232,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
1159 1232
1160static void exit_lmode(struct kvm_vcpu *vcpu) 1233static void exit_lmode(struct kvm_vcpu *vcpu)
1161{ 1234{
1162 vcpu->shadow_efer &= ~EFER_LMA; 1235 vcpu->arch.shadow_efer &= ~EFER_LMA;
1163 1236
1164 vmcs_write32(VM_ENTRY_CONTROLS, 1237 vmcs_write32(VM_ENTRY_CONTROLS,
1165 vmcs_read32(VM_ENTRY_CONTROLS) 1238 vmcs_read32(VM_ENTRY_CONTROLS)
@@ -1170,22 +1243,22 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
1170 1243
1171static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 1244static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1172{ 1245{
1173 vcpu->cr4 &= KVM_GUEST_CR4_MASK; 1246 vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK;
1174 vcpu->cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; 1247 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
1175} 1248}
1176 1249
1177static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 1250static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1178{ 1251{
1179 vmx_fpu_deactivate(vcpu); 1252 vmx_fpu_deactivate(vcpu);
1180 1253
1181 if (vcpu->rmode.active && (cr0 & X86_CR0_PE)) 1254 if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE))
1182 enter_pmode(vcpu); 1255 enter_pmode(vcpu);
1183 1256
1184 if (!vcpu->rmode.active && !(cr0 & X86_CR0_PE)) 1257 if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE))
1185 enter_rmode(vcpu); 1258 enter_rmode(vcpu);
1186 1259
1187#ifdef CONFIG_X86_64 1260#ifdef CONFIG_X86_64
1188 if (vcpu->shadow_efer & EFER_LME) { 1261 if (vcpu->arch.shadow_efer & EFER_LME) {
1189 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) 1262 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
1190 enter_lmode(vcpu); 1263 enter_lmode(vcpu);
1191 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) 1264 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
@@ -1196,7 +1269,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1196 vmcs_writel(CR0_READ_SHADOW, cr0); 1269 vmcs_writel(CR0_READ_SHADOW, cr0);
1197 vmcs_writel(GUEST_CR0, 1270 vmcs_writel(GUEST_CR0,
1198 (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); 1271 (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
1199 vcpu->cr0 = cr0; 1272 vcpu->arch.cr0 = cr0;
1200 1273
1201 if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE)) 1274 if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
1202 vmx_fpu_activate(vcpu); 1275 vmx_fpu_activate(vcpu);
@@ -1205,16 +1278,16 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1205static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 1278static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1206{ 1279{
1207 vmcs_writel(GUEST_CR3, cr3); 1280 vmcs_writel(GUEST_CR3, cr3);
1208 if (vcpu->cr0 & X86_CR0_PE) 1281 if (vcpu->arch.cr0 & X86_CR0_PE)
1209 vmx_fpu_deactivate(vcpu); 1282 vmx_fpu_deactivate(vcpu);
1210} 1283}
1211 1284
1212static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 1285static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1213{ 1286{
1214 vmcs_writel(CR4_READ_SHADOW, cr4); 1287 vmcs_writel(CR4_READ_SHADOW, cr4);
1215 vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ? 1288 vmcs_writel(GUEST_CR4, cr4 | (vcpu->arch.rmode.active ?
1216 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON)); 1289 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON));
1217 vcpu->cr4 = cr4; 1290 vcpu->arch.cr4 = cr4;
1218} 1291}
1219 1292
1220#ifdef CONFIG_X86_64 1293#ifdef CONFIG_X86_64
@@ -1224,7 +1297,7 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
1224 struct vcpu_vmx *vmx = to_vmx(vcpu); 1297 struct vcpu_vmx *vmx = to_vmx(vcpu);
1225 struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); 1298 struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
1226 1299
1227 vcpu->shadow_efer = efer; 1300 vcpu->arch.shadow_efer = efer;
1228 if (efer & EFER_LMA) { 1301 if (efer & EFER_LMA) {
1229 vmcs_write32(VM_ENTRY_CONTROLS, 1302 vmcs_write32(VM_ENTRY_CONTROLS,
1230 vmcs_read32(VM_ENTRY_CONTROLS) | 1303 vmcs_read32(VM_ENTRY_CONTROLS) |
@@ -1301,17 +1374,17 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
1301 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 1374 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1302 u32 ar; 1375 u32 ar;
1303 1376
1304 if (vcpu->rmode.active && seg == VCPU_SREG_TR) { 1377 if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) {
1305 vcpu->rmode.tr.selector = var->selector; 1378 vcpu->arch.rmode.tr.selector = var->selector;
1306 vcpu->rmode.tr.base = var->base; 1379 vcpu->arch.rmode.tr.base = var->base;
1307 vcpu->rmode.tr.limit = var->limit; 1380 vcpu->arch.rmode.tr.limit = var->limit;
1308 vcpu->rmode.tr.ar = vmx_segment_access_rights(var); 1381 vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var);
1309 return; 1382 return;
1310 } 1383 }
1311 vmcs_writel(sf->base, var->base); 1384 vmcs_writel(sf->base, var->base);
1312 vmcs_write32(sf->limit, var->limit); 1385 vmcs_write32(sf->limit, var->limit);
1313 vmcs_write16(sf->selector, var->selector); 1386 vmcs_write16(sf->selector, var->selector);
1314 if (vcpu->rmode.active && var->s) { 1387 if (vcpu->arch.rmode.active && var->s) {
1315 /* 1388 /*
1316 * Hack real-mode segments into vm86 compatibility. 1389 * Hack real-mode segments into vm86 compatibility.
1317 */ 1390 */
@@ -1355,36 +1428,38 @@ static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1355 vmcs_writel(GUEST_GDTR_BASE, dt->base); 1428 vmcs_writel(GUEST_GDTR_BASE, dt->base);
1356} 1429}
1357 1430
-static int init_rmode_tss(struct kvm* kvm)
+static int init_rmode_tss(struct kvm *kvm)
 {
-	struct page *p1, *p2, *p3;
 	gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
-	char *page;
-
-	p1 = gfn_to_page(kvm, fn++);
-	p2 = gfn_to_page(kvm, fn++);
-	p3 = gfn_to_page(kvm, fn);
-
-	if (!p1 || !p2 || !p3) {
-		kvm_printf(kvm,"%s: gfn_to_page failed\n", __FUNCTION__);
-		return 0;
-	}
-
-	page = kmap_atomic(p1, KM_USER0);
-	clear_page(page);
-	*(u16*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
-	kunmap_atomic(page, KM_USER0);
-
-	page = kmap_atomic(p2, KM_USER0);
-	clear_page(page);
-	kunmap_atomic(page, KM_USER0);
+	u16 data = 0;
+	int ret = 0;
+	int r;
 
-	page = kmap_atomic(p3, KM_USER0);
-	clear_page(page);
-	*(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0;
-	kunmap_atomic(page, KM_USER0);
+	down_read(&current->mm->mmap_sem);
+	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
+	if (r < 0)
+		goto out;
+	data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
+	r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16));
+	if (r < 0)
+		goto out;
+	r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
+	if (r < 0)
+		goto out;
+	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
+	if (r < 0)
+		goto out;
+	data = ~0;
+	r = kvm_write_guest_page(kvm, fn, &data,
+				 RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
+				 sizeof(u8));
+	if (r < 0)
+		goto out;
 
-	return 1;
+	ret = 1;
+out:
+	up_read(&current->mm->mmap_sem);
+	return ret;
 }
1389 1464
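The rewrite above drops the direct gfn_to_page()/kmap_atomic() writes in favour of kvm_clear_guest_page()/kvm_write_guest_page() under mmap_sem, but the bytes it lays down are the same: three zeroed pages, the 16-bit I/O-map base at offset 0x66, and a terminating 0xff byte at the end of the bitmap. The user-space model below shows only that layout; TSS_IOPB_SIZE and the exact RMODE_TSS_SIZE formula are assumptions based on kvm headers of that era, not values visible in this hunk.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define PAGE_SIZE            4096
#define TSS_BASE_SIZE        0x68			/* assumed 32-bit TSS size */
#define TSS_REDIRECTION_SIZE (256 / 8)
#define TSS_IOPB_SIZE        (65536 / 8)		/* assumed I/O bitmap size */
#define RMODE_TSS_SIZE       (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)

int main(void)
{
	static uint8_t tss[3 * PAGE_SIZE];		/* the three guest pages */
	uint16_t iomap_base = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;

	memset(tss, 0, sizeof(tss));			/* kvm_clear_guest_page x3 */
	memcpy(tss + 0x66, &iomap_base, sizeof(iomap_base));
	tss[RMODE_TSS_SIZE - 1] = 0xff;			/* last byte of the I/O bitmap */

	printf("io map base %#x, tss size %#x\n", iomap_base, (unsigned)RMODE_TSS_SIZE);
	return 0;
}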
1390static void seg_setup(int seg) 1465static void seg_setup(int seg)
@@ -1397,6 +1472,27 @@ static void seg_setup(int seg)
1397 vmcs_write32(sf->ar_bytes, 0x93); 1472 vmcs_write32(sf->ar_bytes, 0x93);
1398} 1473}
1399 1474
1475static int alloc_apic_access_page(struct kvm *kvm)
1476{
1477 struct kvm_userspace_memory_region kvm_userspace_mem;
1478 int r = 0;
1479
1480 down_write(&current->mm->mmap_sem);
1481 if (kvm->arch.apic_access_page)
1482 goto out;
1483 kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
1484 kvm_userspace_mem.flags = 0;
1485 kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
1486 kvm_userspace_mem.memory_size = PAGE_SIZE;
1487 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
1488 if (r)
1489 goto out;
1490 kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
1491out:
1492 up_write(&current->mm->mmap_sem);
1493 return r;
1494}
1495
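alloc_apic_access_page() pins one page of guest physical space at the architectural APIC base (0xfee00000) by registering a private memslot, then caches the backing page; the gfn_to_page(kvm, 0xfee00) call is just that base expressed as a frame number. The snippet below only restates the fields being filled and the gfn arithmetic; the struct is a stand-in, not the real uapi structure, and the private slot number is whatever APIC_ACCESS_PAGE_PRIVATE_MEMSLOT resolves to.

#include <stdint.h>
#include <stdio.h>

struct mem_region {
	uint32_t slot;
	uint32_t flags;
	uint64_t guest_phys_addr;
	uint64_t memory_size;
};

int main(void)
{
	struct mem_region apic_page = {
		.slot = ~0u,			/* placeholder for the private slot */
		.flags = 0,
		.guest_phys_addr = 0xfee00000ULL,
		.memory_size = 4096,
	};
	uint64_t gfn = apic_page.guest_phys_addr >> 12;

	printf("gfn %#llx\n", (unsigned long long)gfn);	/* 0xfee00, as in the hunk */
	return 0;
}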
1400/* 1496/*
1401 * Sets up the vmcs for emulated real mode. 1497 * Sets up the vmcs for emulated real mode.
1402 */ 1498 */
@@ -1407,92 +1503,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1407 unsigned long a; 1503 unsigned long a;
1408 struct descriptor_table dt; 1504 struct descriptor_table dt;
1409 int i; 1505 int i;
1410 int ret = 0;
1411 unsigned long kvm_vmx_return; 1506 unsigned long kvm_vmx_return;
1412 u64 msr;
1413 u32 exec_control; 1507 u32 exec_control;
1414 1508
1415 if (!init_rmode_tss(vmx->vcpu.kvm)) {
1416 ret = -ENOMEM;
1417 goto out;
1418 }
1419
1420 vmx->vcpu.rmode.active = 0;
1421
1422 vmx->vcpu.regs[VCPU_REGS_RDX] = get_rdx_init_val();
1423 set_cr8(&vmx->vcpu, 0);
1424 msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
1425 if (vmx->vcpu.vcpu_id == 0)
1426 msr |= MSR_IA32_APICBASE_BSP;
1427 kvm_set_apic_base(&vmx->vcpu, msr);
1428
1429 fx_init(&vmx->vcpu);
1430
1431 /*
1432 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
1433 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
1434 */
1435 if (vmx->vcpu.vcpu_id == 0) {
1436 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
1437 vmcs_writel(GUEST_CS_BASE, 0x000f0000);
1438 } else {
1439 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.sipi_vector << 8);
1440 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.sipi_vector << 12);
1441 }
1442 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1443 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1444
1445 seg_setup(VCPU_SREG_DS);
1446 seg_setup(VCPU_SREG_ES);
1447 seg_setup(VCPU_SREG_FS);
1448 seg_setup(VCPU_SREG_GS);
1449 seg_setup(VCPU_SREG_SS);
1450
1451 vmcs_write16(GUEST_TR_SELECTOR, 0);
1452 vmcs_writel(GUEST_TR_BASE, 0);
1453 vmcs_write32(GUEST_TR_LIMIT, 0xffff);
1454 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1455
1456 vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1457 vmcs_writel(GUEST_LDTR_BASE, 0);
1458 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
1459 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
1460
1461 vmcs_write32(GUEST_SYSENTER_CS, 0);
1462 vmcs_writel(GUEST_SYSENTER_ESP, 0);
1463 vmcs_writel(GUEST_SYSENTER_EIP, 0);
1464
1465 vmcs_writel(GUEST_RFLAGS, 0x02);
1466 if (vmx->vcpu.vcpu_id == 0)
1467 vmcs_writel(GUEST_RIP, 0xfff0);
1468 else
1469 vmcs_writel(GUEST_RIP, 0);
1470 vmcs_writel(GUEST_RSP, 0);
1471
1472 //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0
1473 vmcs_writel(GUEST_DR7, 0x400);
1474
1475 vmcs_writel(GUEST_GDTR_BASE, 0);
1476 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
1477
1478 vmcs_writel(GUEST_IDTR_BASE, 0);
1479 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
1480
1481 vmcs_write32(GUEST_ACTIVITY_STATE, 0);
1482 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1483 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1484
1485 /* I/O */ 1509 /* I/O */
1486 vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a)); 1510 vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
1487 vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b)); 1511 vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));
1488 1512
1489 guest_write_tsc(0);
1490
1491 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ 1513 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1492 1514
1493 /* Special registers */
1494 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1495
1496 /* Control */ 1515 /* Control */
1497 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, 1516 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
1498 vmcs_config.pin_based_exec_ctrl); 1517 vmcs_config.pin_based_exec_ctrl);
@@ -1507,8 +1526,16 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1507 } 1526 }
1508 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); 1527 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
1509 1528
1510 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 1529 if (cpu_has_secondary_exec_ctrls()) {
1511 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 1530 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
1531 if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
1532 exec_control &=
1533 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1534 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
1535 }
1536
1537 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
1538 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
1512 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 1539 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
1513 1540
1514 vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ 1541 vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */
@@ -1536,7 +1563,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1536 get_idt(&dt); 1563 get_idt(&dt);
1537 vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ 1564 vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */
1538 1565
1539 asm ("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); 1566 asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
1540 vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ 1567 vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
1541 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); 1568 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
1542 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 1569 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
@@ -1567,97 +1594,145 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1567 ++vmx->nmsrs; 1594 ++vmx->nmsrs;
1568 } 1595 }
1569 1596
1570 setup_msrs(vmx);
1571
1572 vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); 1597 vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
1573 1598
1574 /* 22.2.1, 20.8.1 */ 1599 /* 22.2.1, 20.8.1 */
1575 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); 1600 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
1576 1601
1577 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
1578
1579#ifdef CONFIG_X86_64
1580 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
1581 if (vm_need_tpr_shadow(vmx->vcpu.kvm))
1582 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
1583 page_to_phys(vmx->vcpu.apic->regs_page));
1584 vmcs_write32(TPR_THRESHOLD, 0);
1585#endif
1586
1587 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); 1602 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
1588 vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); 1603 vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
1589 1604
1590 vmx->vcpu.cr0 = 0x60000010; 1605 if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
1591 vmx_set_cr0(&vmx->vcpu, vmx->vcpu.cr0); // enter rmode 1606 if (alloc_apic_access_page(vmx->vcpu.kvm) != 0)
1592 vmx_set_cr4(&vmx->vcpu, 0); 1607 return -ENOMEM;
1593#ifdef CONFIG_X86_64
1594 vmx_set_efer(&vmx->vcpu, 0);
1595#endif
1596 vmx_fpu_activate(&vmx->vcpu);
1597 update_exception_bitmap(&vmx->vcpu);
1598 1608
1599 return 0; 1609 return 0;
1600
1601out:
1602 return ret;
1603} 1610}
1604 1611
1605static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) 1612static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
1606{ 1613{
1607 struct vcpu_vmx *vmx = to_vmx(vcpu); 1614 struct vcpu_vmx *vmx = to_vmx(vcpu);
1615 u64 msr;
1616 int ret;
1608 1617
1609 vmx_vcpu_setup(vmx); 1618 if (!init_rmode_tss(vmx->vcpu.kvm)) {
1610} 1619 ret = -ENOMEM;
1611 1620 goto out;
1612static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq)
1613{
1614 u16 ent[2];
1615 u16 cs;
1616 u16 ip;
1617 unsigned long flags;
1618 unsigned long ss_base = vmcs_readl(GUEST_SS_BASE);
1619 u16 sp = vmcs_readl(GUEST_RSP);
1620 u32 ss_limit = vmcs_read32(GUEST_SS_LIMIT);
1621
1622 if (sp > ss_limit || sp < 6 ) {
1623 vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n",
1624 __FUNCTION__,
1625 vmcs_readl(GUEST_RSP),
1626 vmcs_readl(GUEST_SS_BASE),
1627 vmcs_read32(GUEST_SS_LIMIT));
1628 return;
1629 } 1621 }
1630 1622
1631 if (emulator_read_std(irq * sizeof(ent), &ent, sizeof(ent), vcpu) != 1623 vmx->vcpu.arch.rmode.active = 0;
1632 X86EMUL_CONTINUE) { 1624
1633 vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__); 1625 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
1634 return; 1626 set_cr8(&vmx->vcpu, 0);
1627 msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
1628 if (vmx->vcpu.vcpu_id == 0)
1629 msr |= MSR_IA32_APICBASE_BSP;
1630 kvm_set_apic_base(&vmx->vcpu, msr);
1631
1632 fx_init(&vmx->vcpu);
1633
1634 /*
1635 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
1636 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
1637 */
1638 if (vmx->vcpu.vcpu_id == 0) {
1639 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
1640 vmcs_writel(GUEST_CS_BASE, 0x000f0000);
1641 } else {
1642 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
1643 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
1635 } 1644 }
1645 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1646 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1647
1648 seg_setup(VCPU_SREG_DS);
1649 seg_setup(VCPU_SREG_ES);
1650 seg_setup(VCPU_SREG_FS);
1651 seg_setup(VCPU_SREG_GS);
1652 seg_setup(VCPU_SREG_SS);
1653
1654 vmcs_write16(GUEST_TR_SELECTOR, 0);
1655 vmcs_writel(GUEST_TR_BASE, 0);
1656 vmcs_write32(GUEST_TR_LIMIT, 0xffff);
1657 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1636 1658
1637 flags = vmcs_readl(GUEST_RFLAGS); 1659 vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1638 cs = vmcs_readl(GUEST_CS_BASE) >> 4; 1660 vmcs_writel(GUEST_LDTR_BASE, 0);
1639 ip = vmcs_readl(GUEST_RIP); 1661 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
1662 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
1640 1663
1664 vmcs_write32(GUEST_SYSENTER_CS, 0);
1665 vmcs_writel(GUEST_SYSENTER_ESP, 0);
1666 vmcs_writel(GUEST_SYSENTER_EIP, 0);
1641 1667
1642 if (emulator_write_emulated(ss_base + sp - 2, &flags, 2, vcpu) != X86EMUL_CONTINUE || 1668 vmcs_writel(GUEST_RFLAGS, 0x02);
1643 emulator_write_emulated(ss_base + sp - 4, &cs, 2, vcpu) != X86EMUL_CONTINUE || 1669 if (vmx->vcpu.vcpu_id == 0)
1644 emulator_write_emulated(ss_base + sp - 6, &ip, 2, vcpu) != X86EMUL_CONTINUE) { 1670 vmcs_writel(GUEST_RIP, 0xfff0);
1645 vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__); 1671 else
1646 return; 1672 vmcs_writel(GUEST_RIP, 0);
1673 vmcs_writel(GUEST_RSP, 0);
1674
1675 /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
1676 vmcs_writel(GUEST_DR7, 0x400);
1677
1678 vmcs_writel(GUEST_GDTR_BASE, 0);
1679 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
1680
1681 vmcs_writel(GUEST_IDTR_BASE, 0);
1682 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
1683
1684 vmcs_write32(GUEST_ACTIVITY_STATE, 0);
1685 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1686 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1687
1688 guest_write_tsc(0);
1689
1690 /* Special registers */
1691 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1692
1693 setup_msrs(vmx);
1694
1695 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
1696
1697 if (cpu_has_vmx_tpr_shadow()) {
1698 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
1699 if (vm_need_tpr_shadow(vmx->vcpu.kvm))
1700 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
1701 page_to_phys(vmx->vcpu.arch.apic->regs_page));
1702 vmcs_write32(TPR_THRESHOLD, 0);
1647 } 1703 }
1648 1704
1649 vmcs_writel(GUEST_RFLAGS, flags & 1705 if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
1650 ~( X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF)); 1706 vmcs_write64(APIC_ACCESS_ADDR,
1651 vmcs_write16(GUEST_CS_SELECTOR, ent[1]) ; 1707 page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
1652 vmcs_writel(GUEST_CS_BASE, ent[1] << 4); 1708
1653 vmcs_writel(GUEST_RIP, ent[0]); 1709 vmx->vcpu.arch.cr0 = 0x60000010;
1654 vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6)); 1710 vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
1711 vmx_set_cr4(&vmx->vcpu, 0);
1712#ifdef CONFIG_X86_64
1713 vmx_set_efer(&vmx->vcpu, 0);
1714#endif
1715 vmx_fpu_activate(&vmx->vcpu);
1716 update_exception_bitmap(&vmx->vcpu);
1717
1718 return 0;
1719
1720out:
1721 return ret;
1655} 1722}
1656 1723
1657static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) 1724static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
1658{ 1725{
1659 if (vcpu->rmode.active) { 1726 struct vcpu_vmx *vmx = to_vmx(vcpu);
1660 inject_rmode_irq(vcpu, irq); 1727
1728 if (vcpu->arch.rmode.active) {
1729 vmx->rmode.irq.pending = true;
1730 vmx->rmode.irq.vector = irq;
1731 vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP);
1732 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
1733 irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
1734 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
1735 vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1);
1661 return; 1736 return;
1662 } 1737 }
1663 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 1738 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
@@ -1666,13 +1741,13 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
1666 1741
1667static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) 1742static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
1668{ 1743{
1669 int word_index = __ffs(vcpu->irq_summary); 1744 int word_index = __ffs(vcpu->arch.irq_summary);
1670 int bit_index = __ffs(vcpu->irq_pending[word_index]); 1745 int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
1671 int irq = word_index * BITS_PER_LONG + bit_index; 1746 int irq = word_index * BITS_PER_LONG + bit_index;
1672 1747
1673 clear_bit(bit_index, &vcpu->irq_pending[word_index]); 1748 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
1674 if (!vcpu->irq_pending[word_index]) 1749 if (!vcpu->arch.irq_pending[word_index])
1675 clear_bit(word_index, &vcpu->irq_summary); 1750 clear_bit(word_index, &vcpu->arch.irq_summary);
1676 vmx_inject_irq(vcpu, irq); 1751 vmx_inject_irq(vcpu, irq);
1677} 1752}
1678 1753
@@ -1682,12 +1757,12 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1682{ 1757{
1683 u32 cpu_based_vm_exec_control; 1758 u32 cpu_based_vm_exec_control;
1684 1759
1685 vcpu->interrupt_window_open = 1760 vcpu->arch.interrupt_window_open =
1686 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && 1761 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
1687 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); 1762 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
1688 1763
1689 if (vcpu->interrupt_window_open && 1764 if (vcpu->arch.interrupt_window_open &&
1690 vcpu->irq_summary && 1765 vcpu->arch.irq_summary &&
1691 !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) 1766 !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
1692 /* 1767 /*
1693 * If interrupts enabled, and not blocked by sti or mov ss. Good. 1768 * If interrupts enabled, and not blocked by sti or mov ss. Good.
@@ -1695,8 +1770,8 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1695 kvm_do_inject_irq(vcpu); 1770 kvm_do_inject_irq(vcpu);
1696 1771
1697 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 1772 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
1698 if (!vcpu->interrupt_window_open && 1773 if (!vcpu->arch.interrupt_window_open &&
1699 (vcpu->irq_summary || kvm_run->request_interrupt_window)) 1774 (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
1700 /* 1775 /*
1701 * Interrupts blocked. Wait for unblock. 1776 * Interrupts blocked. Wait for unblock.
1702 */ 1777 */
@@ -1706,6 +1781,23 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1706 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 1781 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
1707} 1782}
1708 1783
1784static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
1785{
1786 int ret;
1787 struct kvm_userspace_memory_region tss_mem = {
1788 .slot = 8,
1789 .guest_phys_addr = addr,
1790 .memory_size = PAGE_SIZE * 3,
1791 .flags = 0,
1792 };
1793
1794 ret = kvm_set_memory_region(kvm, &tss_mem, 0);
1795 if (ret)
1796 return ret;
1797 kvm->arch.tss_addr = addr;
1798 return 0;
1799}
1800
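vmx_set_tss_addr() backs the address handed in from user space with a three-page memory slot and remembers it so rmode_tss_base() can return it later. The usual entry point from the VMM side is the KVM_SET_TSS_ADDR vm ioctl; the snippet below is a minimal illustration of that call, where 0xfffbd000 is only an example of an unused, page-aligned guest-physical region and error handling is reduced to perror.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm;

	if (kvm < 0)
		return 1;
	vm = ioctl(kvm, KVM_CREATE_VM, 0);
	if (vm < 0)
		return 1;
	/* Three pages starting here are claimed for the real-mode TSS. */
	if (ioctl(vm, KVM_SET_TSS_ADDR, 0xfffbd000) < 0)
		perror("KVM_SET_TSS_ADDR");
	return 0;
}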
1709static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) 1801static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
1710{ 1802{
1711 struct kvm_guest_debug *dbg = &vcpu->guest_debug; 1803 struct kvm_guest_debug *dbg = &vcpu->guest_debug;
@@ -1727,7 +1819,7 @@ static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
1727static int handle_rmode_exception(struct kvm_vcpu *vcpu, 1819static int handle_rmode_exception(struct kvm_vcpu *vcpu,
1728 int vec, u32 err_code) 1820 int vec, u32 err_code)
1729{ 1821{
1730 if (!vcpu->rmode.active) 1822 if (!vcpu->arch.rmode.active)
1731 return 0; 1823 return 0;
1732 1824
1733 /* 1825 /*
@@ -1735,32 +1827,31 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
1735 * Cause the #SS fault with 0 error code in VM86 mode. 1827 * Cause the #SS fault with 0 error code in VM86 mode.
1736 */ 1828 */
1737 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) 1829 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
1738 if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE) 1830 if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE)
1739 return 1; 1831 return 1;
1740 return 0; 1832 return 0;
1741} 1833}
1742 1834
1743static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1835static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1744{ 1836{
1837 struct vcpu_vmx *vmx = to_vmx(vcpu);
1745 u32 intr_info, error_code; 1838 u32 intr_info, error_code;
1746 unsigned long cr2, rip; 1839 unsigned long cr2, rip;
1747 u32 vect_info; 1840 u32 vect_info;
1748 enum emulation_result er; 1841 enum emulation_result er;
1749 int r;
1750 1842
1751 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 1843 vect_info = vmx->idt_vectoring_info;
1752 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 1844 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
1753 1845
1754 if ((vect_info & VECTORING_INFO_VALID_MASK) && 1846 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
1755 !is_page_fault(intr_info)) { 1847 !is_page_fault(intr_info))
1756 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " 1848 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
1757 "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info); 1849 "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
1758 }
1759 1850
1760 if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) { 1851 if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
1761 int irq = vect_info & VECTORING_INFO_VECTOR_MASK; 1852 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
1762 set_bit(irq, vcpu->irq_pending); 1853 set_bit(irq, vcpu->arch.irq_pending);
1763 set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); 1854 set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
1764 } 1855 }
1765 1856
1766 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */ 1857 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
@@ -1771,52 +1862,34 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1771 return 1; 1862 return 1;
1772 } 1863 }
1773 1864
1865 if (is_invalid_opcode(intr_info)) {
1866 er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
1867 if (er != EMULATE_DONE)
1868 kvm_queue_exception(vcpu, UD_VECTOR);
1869 return 1;
1870 }
1871
1774 error_code = 0; 1872 error_code = 0;
1775 rip = vmcs_readl(GUEST_RIP); 1873 rip = vmcs_readl(GUEST_RIP);
1776 if (intr_info & INTR_INFO_DELIEVER_CODE_MASK) 1874 if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
1777 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 1875 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
1778 if (is_page_fault(intr_info)) { 1876 if (is_page_fault(intr_info)) {
1779 cr2 = vmcs_readl(EXIT_QUALIFICATION); 1877 cr2 = vmcs_readl(EXIT_QUALIFICATION);
1780 1878 return kvm_mmu_page_fault(vcpu, cr2, error_code);
1781 mutex_lock(&vcpu->kvm->lock);
1782 r = kvm_mmu_page_fault(vcpu, cr2, error_code);
1783 if (r < 0) {
1784 mutex_unlock(&vcpu->kvm->lock);
1785 return r;
1786 }
1787 if (!r) {
1788 mutex_unlock(&vcpu->kvm->lock);
1789 return 1;
1790 }
1791
1792 er = emulate_instruction(vcpu, kvm_run, cr2, error_code);
1793 mutex_unlock(&vcpu->kvm->lock);
1794
1795 switch (er) {
1796 case EMULATE_DONE:
1797 return 1;
1798 case EMULATE_DO_MMIO:
1799 ++vcpu->stat.mmio_exits;
1800 return 0;
1801 case EMULATE_FAIL:
1802 kvm_report_emulation_failure(vcpu, "pagetable");
1803 break;
1804 default:
1805 BUG();
1806 }
1807 } 1879 }
1808 1880
1809 if (vcpu->rmode.active && 1881 if (vcpu->arch.rmode.active &&
1810 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, 1882 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
1811 error_code)) { 1883 error_code)) {
1812 if (vcpu->halt_request) { 1884 if (vcpu->arch.halt_request) {
1813 vcpu->halt_request = 0; 1885 vcpu->arch.halt_request = 0;
1814 return kvm_emulate_halt(vcpu); 1886 return kvm_emulate_halt(vcpu);
1815 } 1887 }
1816 return 1; 1888 return 1;
1817 } 1889 }
1818 1890
-	if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) {
+	if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) ==
+	    (INTR_TYPE_EXCEPTION | 1)) {
1820 kvm_run->exit_reason = KVM_EXIT_DEBUG; 1893 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1821 return 0; 1894 return 0;
1822 } 1895 }
@@ -1850,7 +1923,8 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1850 string = (exit_qualification & 16) != 0; 1923 string = (exit_qualification & 16) != 0;
1851 1924
1852 if (string) { 1925 if (string) {
-		if (emulate_instruction(vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO)
+		if (emulate_instruction(vcpu,
+					kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
1854 return 0; 1928 return 0;
1855 return 1; 1929 return 1;
1856 } 1930 }
@@ -1873,7 +1947,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
1873 hypercall[0] = 0x0f; 1947 hypercall[0] = 0x0f;
1874 hypercall[1] = 0x01; 1948 hypercall[1] = 0x01;
1875 hypercall[2] = 0xc1; 1949 hypercall[2] = 0xc1;
1876 hypercall[3] = 0xc3;
1877} 1950}
1878 1951
1879static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1952static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
@@ -1890,23 +1963,25 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1890 switch (cr) { 1963 switch (cr) {
1891 case 0: 1964 case 0:
1892 vcpu_load_rsp_rip(vcpu); 1965 vcpu_load_rsp_rip(vcpu);
1893 set_cr0(vcpu, vcpu->regs[reg]); 1966 set_cr0(vcpu, vcpu->arch.regs[reg]);
1894 skip_emulated_instruction(vcpu); 1967 skip_emulated_instruction(vcpu);
1895 return 1; 1968 return 1;
1896 case 3: 1969 case 3:
1897 vcpu_load_rsp_rip(vcpu); 1970 vcpu_load_rsp_rip(vcpu);
1898 set_cr3(vcpu, vcpu->regs[reg]); 1971 set_cr3(vcpu, vcpu->arch.regs[reg]);
1899 skip_emulated_instruction(vcpu); 1972 skip_emulated_instruction(vcpu);
1900 return 1; 1973 return 1;
1901 case 4: 1974 case 4:
1902 vcpu_load_rsp_rip(vcpu); 1975 vcpu_load_rsp_rip(vcpu);
1903 set_cr4(vcpu, vcpu->regs[reg]); 1976 set_cr4(vcpu, vcpu->arch.regs[reg]);
1904 skip_emulated_instruction(vcpu); 1977 skip_emulated_instruction(vcpu);
1905 return 1; 1978 return 1;
1906 case 8: 1979 case 8:
1907 vcpu_load_rsp_rip(vcpu); 1980 vcpu_load_rsp_rip(vcpu);
1908 set_cr8(vcpu, vcpu->regs[reg]); 1981 set_cr8(vcpu, vcpu->arch.regs[reg]);
1909 skip_emulated_instruction(vcpu); 1982 skip_emulated_instruction(vcpu);
1983 if (irqchip_in_kernel(vcpu->kvm))
1984 return 1;
1910 kvm_run->exit_reason = KVM_EXIT_SET_TPR; 1985 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
1911 return 0; 1986 return 0;
1912 }; 1987 };
@@ -1914,8 +1989,8 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1914 case 2: /* clts */ 1989 case 2: /* clts */
1915 vcpu_load_rsp_rip(vcpu); 1990 vcpu_load_rsp_rip(vcpu);
1916 vmx_fpu_deactivate(vcpu); 1991 vmx_fpu_deactivate(vcpu);
1917 vcpu->cr0 &= ~X86_CR0_TS; 1992 vcpu->arch.cr0 &= ~X86_CR0_TS;
1918 vmcs_writel(CR0_READ_SHADOW, vcpu->cr0); 1993 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
1919 vmx_fpu_activate(vcpu); 1994 vmx_fpu_activate(vcpu);
1920 skip_emulated_instruction(vcpu); 1995 skip_emulated_instruction(vcpu);
1921 return 1; 1996 return 1;
@@ -1923,13 +1998,13 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1923 switch (cr) { 1998 switch (cr) {
1924 case 3: 1999 case 3:
1925 vcpu_load_rsp_rip(vcpu); 2000 vcpu_load_rsp_rip(vcpu);
1926 vcpu->regs[reg] = vcpu->cr3; 2001 vcpu->arch.regs[reg] = vcpu->arch.cr3;
1927 vcpu_put_rsp_rip(vcpu); 2002 vcpu_put_rsp_rip(vcpu);
1928 skip_emulated_instruction(vcpu); 2003 skip_emulated_instruction(vcpu);
1929 return 1; 2004 return 1;
1930 case 8: 2005 case 8:
1931 vcpu_load_rsp_rip(vcpu); 2006 vcpu_load_rsp_rip(vcpu);
1932 vcpu->regs[reg] = get_cr8(vcpu); 2007 vcpu->arch.regs[reg] = get_cr8(vcpu);
1933 vcpu_put_rsp_rip(vcpu); 2008 vcpu_put_rsp_rip(vcpu);
1934 skip_emulated_instruction(vcpu); 2009 skip_emulated_instruction(vcpu);
1935 return 1; 2010 return 1;
@@ -1975,7 +2050,7 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1975 default: 2050 default:
1976 val = 0; 2051 val = 0;
1977 } 2052 }
1978 vcpu->regs[reg] = val; 2053 vcpu->arch.regs[reg] = val;
1979 } else { 2054 } else {
1980 /* mov to dr */ 2055 /* mov to dr */
1981 } 2056 }
@@ -1992,29 +2067,29 @@ static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1992 2067
1993static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2068static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1994{ 2069{
1995 u32 ecx = vcpu->regs[VCPU_REGS_RCX]; 2070 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
1996 u64 data; 2071 u64 data;
1997 2072
1998 if (vmx_get_msr(vcpu, ecx, &data)) { 2073 if (vmx_get_msr(vcpu, ecx, &data)) {
1999 vmx_inject_gp(vcpu, 0); 2074 kvm_inject_gp(vcpu, 0);
2000 return 1; 2075 return 1;
2001 } 2076 }
2002 2077
2003 /* FIXME: handling of bits 32:63 of rax, rdx */ 2078 /* FIXME: handling of bits 32:63 of rax, rdx */
2004 vcpu->regs[VCPU_REGS_RAX] = data & -1u; 2079 vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
2005 vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u; 2080 vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
2006 skip_emulated_instruction(vcpu); 2081 skip_emulated_instruction(vcpu);
2007 return 1; 2082 return 1;
2008} 2083}
2009 2084
2010static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2085static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2011{ 2086{
2012 u32 ecx = vcpu->regs[VCPU_REGS_RCX]; 2087 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
2013 u64 data = (vcpu->regs[VCPU_REGS_RAX] & -1u) 2088 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
2014 | ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32); 2089 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
2015 2090
2016 if (vmx_set_msr(vcpu, ecx, data) != 0) { 2091 if (vmx_set_msr(vcpu, ecx, data) != 0) {
2017 vmx_inject_gp(vcpu, 0); 2092 kvm_inject_gp(vcpu, 0);
2018 return 1; 2093 return 1;
2019 } 2094 }
2020 2095
@@ -2042,7 +2117,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
2042 * possible 2117 * possible
2043 */ 2118 */
2044 if (kvm_run->request_interrupt_window && 2119 if (kvm_run->request_interrupt_window &&
2045 !vcpu->irq_summary) { 2120 !vcpu->arch.irq_summary) {
2046 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 2121 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
2047 ++vcpu->stat.irq_window_exits; 2122 ++vcpu->stat.irq_window_exits;
2048 return 0; 2123 return 0;
@@ -2059,7 +2134,35 @@ static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2059static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2134static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2060{ 2135{
2061 skip_emulated_instruction(vcpu); 2136 skip_emulated_instruction(vcpu);
2062 return kvm_hypercall(vcpu, kvm_run); 2137 kvm_emulate_hypercall(vcpu);
2138 return 1;
2139}
2140
2141static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2142{
2143 skip_emulated_instruction(vcpu);
2144 /* TODO: Add support for VT-d/pass-through device */
2145 return 1;
2146}
2147
2148static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2149{
2150 u64 exit_qualification;
2151 enum emulation_result er;
2152 unsigned long offset;
2153
2154 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2155 offset = exit_qualification & 0xffful;
2156
2157 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
2158
2159 if (er != EMULATE_DONE) {
2160 printk(KERN_ERR
2161 "Fail to handle apic access vmexit! Offset is 0x%lx\n",
2162 offset);
2163 return -ENOTSUPP;
2164 }
2165 return 1;
2063} 2166}
2064 2167
2065/* 2168/*
@@ -2081,7 +2184,9 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
2081 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, 2184 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
2082 [EXIT_REASON_HLT] = handle_halt, 2185 [EXIT_REASON_HLT] = handle_halt,
2083 [EXIT_REASON_VMCALL] = handle_vmcall, 2186 [EXIT_REASON_VMCALL] = handle_vmcall,
2084 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold 2187 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
2188 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
2189 [EXIT_REASON_WBINVD] = handle_wbinvd,
2085}; 2190};
2086 2191
2087static const int kvm_vmx_max_exit_handlers = 2192static const int kvm_vmx_max_exit_handlers =
@@ -2093,9 +2198,9 @@ static const int kvm_vmx_max_exit_handlers =
2093 */ 2198 */
2094static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 2199static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2095{ 2200{
2096 u32 vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2097 u32 exit_reason = vmcs_read32(VM_EXIT_REASON); 2201 u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
2098 struct vcpu_vmx *vmx = to_vmx(vcpu); 2202 struct vcpu_vmx *vmx = to_vmx(vcpu);
2203 u32 vectoring_info = vmx->idt_vectoring_info;
2099 2204
2100 if (unlikely(vmx->fail)) { 2205 if (unlikely(vmx->fail)) {
2101 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 2206 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -2104,8 +2209,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2104 return 0; 2209 return 0;
2105 } 2210 }
2106 2211
2107 if ( (vectoring_info & VECTORING_INFO_VALID_MASK) && 2212 if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
2108 exit_reason != EXIT_REASON_EXCEPTION_NMI ) 2213 exit_reason != EXIT_REASON_EXCEPTION_NMI)
2109 printk(KERN_WARNING "%s: unexpected, valid vectoring info and " 2214 printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
2110 "exit reason is 0x%x\n", __FUNCTION__, exit_reason); 2215 "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
2111 if (exit_reason < kvm_vmx_max_exit_handlers 2216 if (exit_reason < kvm_vmx_max_exit_handlers
@@ -2150,26 +2255,38 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
2150 2255
2151static void vmx_intr_assist(struct kvm_vcpu *vcpu) 2256static void vmx_intr_assist(struct kvm_vcpu *vcpu)
2152{ 2257{
2258 struct vcpu_vmx *vmx = to_vmx(vcpu);
2153 u32 idtv_info_field, intr_info_field; 2259 u32 idtv_info_field, intr_info_field;
2154 int has_ext_irq, interrupt_window_open; 2260 int has_ext_irq, interrupt_window_open;
2155 int vector; 2261 int vector;
2156 2262
2157 kvm_inject_pending_timer_irqs(vcpu);
2158 update_tpr_threshold(vcpu); 2263 update_tpr_threshold(vcpu);
2159 2264
2160 has_ext_irq = kvm_cpu_has_interrupt(vcpu); 2265 has_ext_irq = kvm_cpu_has_interrupt(vcpu);
2161 intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); 2266 intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
2162 idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD); 2267 idtv_info_field = vmx->idt_vectoring_info;
2163 if (intr_info_field & INTR_INFO_VALID_MASK) { 2268 if (intr_info_field & INTR_INFO_VALID_MASK) {
2164 if (idtv_info_field & INTR_INFO_VALID_MASK) { 2269 if (idtv_info_field & INTR_INFO_VALID_MASK) {
2165 /* TODO: fault when IDT_Vectoring */ 2270 /* TODO: fault when IDT_Vectoring */
-			printk(KERN_ERR "Fault when IDT_Vectoring\n");
+			if (printk_ratelimit())
+				printk(KERN_ERR "Fault when IDT_Vectoring\n");
2167 } 2273 }
2168 if (has_ext_irq) 2274 if (has_ext_irq)
2169 enable_irq_window(vcpu); 2275 enable_irq_window(vcpu);
2170 return; 2276 return;
2171 } 2277 }
2172 if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) { 2278 if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
2279 if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
2280 == INTR_TYPE_EXT_INTR
2281 && vcpu->arch.rmode.active) {
2282 u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK;
2283
2284 vmx_inject_irq(vcpu, vect);
2285 if (unlikely(has_ext_irq))
2286 enable_irq_window(vcpu);
2287 return;
2288 }
2289
2173 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field); 2290 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
2174 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2291 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2175 vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); 2292 vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
@@ -2194,6 +2311,29 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
2194 enable_irq_window(vcpu); 2311 enable_irq_window(vcpu);
2195} 2312}
2196 2313
2314/*
2315 * Failure to inject an interrupt should give us the information
2316 * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs
2317 * when fetching the interrupt redirection bitmap in the real-mode
2318 * tss, this doesn't happen. So we do it ourselves.
2319 */
2320static void fixup_rmode_irq(struct vcpu_vmx *vmx)
2321{
2322 vmx->rmode.irq.pending = 0;
2323 if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip)
2324 return;
2325 vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip);
2326 if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
2327 vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
2328 vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
2329 return;
2330 }
2331 vmx->idt_vectoring_info =
2332 VECTORING_INFO_VALID_MASK
2333 | INTR_TYPE_EXT_INTR
2334 | vmx->rmode.irq.vector;
2335}
2336
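Taken together with the vmx_inject_irq() change earlier in this diff, the comment above describes a two-step trick: the interrupt is delivered as a one-byte software interrupt with the guest RIP wound back by one, and if the entry fails while the CPU is consulting the vm86 redirection bitmap, the vectoring information is rebuilt by hand. The toy model below restates both halves in plain C; the bit layout of the injection word is my reading of the VMX event-injection format, and the real fixup additionally checks that the saved RIP still matches before touching anything.

#include <stdint.h>
#include <stdio.h>

#define INFO_VALID     (1u << 31)
#define TYPE_EXT_INTR  (0u << 8)
#define TYPE_SOFT_INTR (4u << 8)

struct rmode_irq { int pending; uint8_t vector; uint64_t rip; };

static uint32_t inject_rmode_irq(struct rmode_irq *st, uint8_t irq,
				 uint64_t *guest_rip)
{
	st->pending = 1;
	st->vector = irq;
	st->rip = *guest_rip;
	*guest_rip = st->rip - 1;		/* length-1 "int" at rip - 1 */
	return irq | TYPE_SOFT_INTR | INFO_VALID;
}

static uint32_t fixup_rmode_irq(struct rmode_irq *st, uint64_t *guest_rip)
{
	st->pending = 0;
	*guest_rip = st->rip;			/* undo the rewind */
	return INFO_VALID | TYPE_EXT_INTR | st->vector;
}

int main(void)
{
	struct rmode_irq st;
	uint64_t rip = 0x1000;
	uint32_t entry = inject_rmode_irq(&st, 0x20, &rip);
	uint32_t idtv  = fixup_rmode_irq(&st, &rip);

	printf("entry=%#x idtv=%#x rip=%#llx\n", entry, idtv,
	       (unsigned long long)rip);
	return 0;
}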
2197static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2337static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2198{ 2338{
2199 struct vcpu_vmx *vmx = to_vmx(vcpu); 2339 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2204,50 +2344,47 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2204 */ 2344 */
2205 vmcs_writel(HOST_CR0, read_cr0()); 2345 vmcs_writel(HOST_CR0, read_cr0());
2206 2346
2207 asm ( 2347 asm(
2208 /* Store host registers */ 2348 /* Store host registers */
2209#ifdef CONFIG_X86_64 2349#ifdef CONFIG_X86_64
2210 "push %%rax; push %%rbx; push %%rdx;" 2350 "push %%rdx; push %%rbp;"
2211 "push %%rsi; push %%rdi; push %%rbp;"
2212 "push %%r8; push %%r9; push %%r10; push %%r11;"
2213 "push %%r12; push %%r13; push %%r14; push %%r15;"
2214 "push %%rcx \n\t" 2351 "push %%rcx \n\t"
2215 ASM_VMX_VMWRITE_RSP_RDX "\n\t"
2216#else 2352#else
2217 "pusha; push %%ecx \n\t" 2353 "push %%edx; push %%ebp;"
2218 ASM_VMX_VMWRITE_RSP_RDX "\n\t" 2354 "push %%ecx \n\t"
2219#endif 2355#endif
2356 ASM_VMX_VMWRITE_RSP_RDX "\n\t"
2220 /* Check if vmlaunch of vmresume is needed */ 2357 /* Check if vmlaunch of vmresume is needed */
2221 "cmp $0, %1 \n\t" 2358 "cmpl $0, %c[launched](%0) \n\t"
2222 /* Load guest registers. Don't clobber flags. */ 2359 /* Load guest registers. Don't clobber flags. */
2223#ifdef CONFIG_X86_64 2360#ifdef CONFIG_X86_64
2224 "mov %c[cr2](%3), %%rax \n\t" 2361 "mov %c[cr2](%0), %%rax \n\t"
2225 "mov %%rax, %%cr2 \n\t" 2362 "mov %%rax, %%cr2 \n\t"
2226 "mov %c[rax](%3), %%rax \n\t" 2363 "mov %c[rax](%0), %%rax \n\t"
2227 "mov %c[rbx](%3), %%rbx \n\t" 2364 "mov %c[rbx](%0), %%rbx \n\t"
2228 "mov %c[rdx](%3), %%rdx \n\t" 2365 "mov %c[rdx](%0), %%rdx \n\t"
2229 "mov %c[rsi](%3), %%rsi \n\t" 2366 "mov %c[rsi](%0), %%rsi \n\t"
2230 "mov %c[rdi](%3), %%rdi \n\t" 2367 "mov %c[rdi](%0), %%rdi \n\t"
2231 "mov %c[rbp](%3), %%rbp \n\t" 2368 "mov %c[rbp](%0), %%rbp \n\t"
2232 "mov %c[r8](%3), %%r8 \n\t" 2369 "mov %c[r8](%0), %%r8 \n\t"
2233 "mov %c[r9](%3), %%r9 \n\t" 2370 "mov %c[r9](%0), %%r9 \n\t"
2234 "mov %c[r10](%3), %%r10 \n\t" 2371 "mov %c[r10](%0), %%r10 \n\t"
2235 "mov %c[r11](%3), %%r11 \n\t" 2372 "mov %c[r11](%0), %%r11 \n\t"
2236 "mov %c[r12](%3), %%r12 \n\t" 2373 "mov %c[r12](%0), %%r12 \n\t"
2237 "mov %c[r13](%3), %%r13 \n\t" 2374 "mov %c[r13](%0), %%r13 \n\t"
2238 "mov %c[r14](%3), %%r14 \n\t" 2375 "mov %c[r14](%0), %%r14 \n\t"
2239 "mov %c[r15](%3), %%r15 \n\t" 2376 "mov %c[r15](%0), %%r15 \n\t"
2240 "mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */ 2377 "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */
2241#else 2378#else
2242 "mov %c[cr2](%3), %%eax \n\t" 2379 "mov %c[cr2](%0), %%eax \n\t"
2243 "mov %%eax, %%cr2 \n\t" 2380 "mov %%eax, %%cr2 \n\t"
2244 "mov %c[rax](%3), %%eax \n\t" 2381 "mov %c[rax](%0), %%eax \n\t"
2245 "mov %c[rbx](%3), %%ebx \n\t" 2382 "mov %c[rbx](%0), %%ebx \n\t"
2246 "mov %c[rdx](%3), %%edx \n\t" 2383 "mov %c[rdx](%0), %%edx \n\t"
2247 "mov %c[rsi](%3), %%esi \n\t" 2384 "mov %c[rsi](%0), %%esi \n\t"
2248 "mov %c[rdi](%3), %%edi \n\t" 2385 "mov %c[rdi](%0), %%edi \n\t"
2249 "mov %c[rbp](%3), %%ebp \n\t" 2386 "mov %c[rbp](%0), %%ebp \n\t"
2250 "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */ 2387 "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */
2251#endif 2388#endif
2252 /* Enter guest mode */ 2389 /* Enter guest mode */
2253 "jne .Llaunched \n\t" 2390 "jne .Llaunched \n\t"
@@ -2257,72 +2394,79 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2257 ".Lkvm_vmx_return: " 2394 ".Lkvm_vmx_return: "
2258 /* Save guest registers, load host registers, keep flags */ 2395 /* Save guest registers, load host registers, keep flags */
2259#ifdef CONFIG_X86_64 2396#ifdef CONFIG_X86_64
2260 "xchg %3, (%%rsp) \n\t" 2397 "xchg %0, (%%rsp) \n\t"
2261 "mov %%rax, %c[rax](%3) \n\t" 2398 "mov %%rax, %c[rax](%0) \n\t"
2262 "mov %%rbx, %c[rbx](%3) \n\t" 2399 "mov %%rbx, %c[rbx](%0) \n\t"
2263 "pushq (%%rsp); popq %c[rcx](%3) \n\t" 2400 "pushq (%%rsp); popq %c[rcx](%0) \n\t"
2264 "mov %%rdx, %c[rdx](%3) \n\t" 2401 "mov %%rdx, %c[rdx](%0) \n\t"
2265 "mov %%rsi, %c[rsi](%3) \n\t" 2402 "mov %%rsi, %c[rsi](%0) \n\t"
2266 "mov %%rdi, %c[rdi](%3) \n\t" 2403 "mov %%rdi, %c[rdi](%0) \n\t"
2267 "mov %%rbp, %c[rbp](%3) \n\t" 2404 "mov %%rbp, %c[rbp](%0) \n\t"
2268 "mov %%r8, %c[r8](%3) \n\t" 2405 "mov %%r8, %c[r8](%0) \n\t"
2269 "mov %%r9, %c[r9](%3) \n\t" 2406 "mov %%r9, %c[r9](%0) \n\t"
2270 "mov %%r10, %c[r10](%3) \n\t" 2407 "mov %%r10, %c[r10](%0) \n\t"
2271 "mov %%r11, %c[r11](%3) \n\t" 2408 "mov %%r11, %c[r11](%0) \n\t"
2272 "mov %%r12, %c[r12](%3) \n\t" 2409 "mov %%r12, %c[r12](%0) \n\t"
2273 "mov %%r13, %c[r13](%3) \n\t" 2410 "mov %%r13, %c[r13](%0) \n\t"
2274 "mov %%r14, %c[r14](%3) \n\t" 2411 "mov %%r14, %c[r14](%0) \n\t"
2275 "mov %%r15, %c[r15](%3) \n\t" 2412 "mov %%r15, %c[r15](%0) \n\t"
2276 "mov %%cr2, %%rax \n\t" 2413 "mov %%cr2, %%rax \n\t"
2277 "mov %%rax, %c[cr2](%3) \n\t" 2414 "mov %%rax, %c[cr2](%0) \n\t"
2278 "mov (%%rsp), %3 \n\t"
2279 2415
2280 "pop %%rcx; pop %%r15; pop %%r14; pop %%r13; pop %%r12;" 2416 "pop %%rbp; pop %%rbp; pop %%rdx \n\t"
2281 "pop %%r11; pop %%r10; pop %%r9; pop %%r8;"
2282 "pop %%rbp; pop %%rdi; pop %%rsi;"
2283 "pop %%rdx; pop %%rbx; pop %%rax \n\t"
2284#else 2417#else
2285 "xchg %3, (%%esp) \n\t" 2418 "xchg %0, (%%esp) \n\t"
2286 "mov %%eax, %c[rax](%3) \n\t" 2419 "mov %%eax, %c[rax](%0) \n\t"
2287 "mov %%ebx, %c[rbx](%3) \n\t" 2420 "mov %%ebx, %c[rbx](%0) \n\t"
2288 "pushl (%%esp); popl %c[rcx](%3) \n\t" 2421 "pushl (%%esp); popl %c[rcx](%0) \n\t"
2289 "mov %%edx, %c[rdx](%3) \n\t" 2422 "mov %%edx, %c[rdx](%0) \n\t"
2290 "mov %%esi, %c[rsi](%3) \n\t" 2423 "mov %%esi, %c[rsi](%0) \n\t"
2291 "mov %%edi, %c[rdi](%3) \n\t" 2424 "mov %%edi, %c[rdi](%0) \n\t"
2292 "mov %%ebp, %c[rbp](%3) \n\t" 2425 "mov %%ebp, %c[rbp](%0) \n\t"
2293 "mov %%cr2, %%eax \n\t" 2426 "mov %%cr2, %%eax \n\t"
2294 "mov %%eax, %c[cr2](%3) \n\t" 2427 "mov %%eax, %c[cr2](%0) \n\t"
2295 "mov (%%esp), %3 \n\t"
2296 2428
2297 "pop %%ecx; popa \n\t" 2429 "pop %%ebp; pop %%ebp; pop %%edx \n\t"
2430#endif
2431 "setbe %c[fail](%0) \n\t"
2432 : : "c"(vmx), "d"((unsigned long)HOST_RSP),
2433 [launched]"i"(offsetof(struct vcpu_vmx, launched)),
2434 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
2435 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
2436 [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
2437 [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
2438 [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
2439 [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
2440 [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
2441 [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
2442#ifdef CONFIG_X86_64
2443 [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
2444 [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
2445 [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
2446 [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
2447 [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
2448 [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
2449 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
2450 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
2298#endif 2451#endif
2299 "setbe %0 \n\t" 2452 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
2300 : "=q" (vmx->fail) 2453 : "cc", "memory"
2301 : "r"(vmx->launched), "d"((unsigned long)HOST_RSP),
2302 "c"(vcpu),
2303 [rax]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RAX])),
2304 [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])),
2305 [rcx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RCX])),
2306 [rdx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDX])),
2307 [rsi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RSI])),
2308 [rdi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDI])),
2309 [rbp]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBP])),
2310#ifdef CONFIG_X86_64 2454#ifdef CONFIG_X86_64
2311 [r8 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R8 ])), 2455 , "rbx", "rdi", "rsi"
2312 [r9 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R9 ])), 2456 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
2313 [r10]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R10])), 2457#else
2314 [r11]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R11])), 2458 , "ebx", "edi", "rsi"
2315 [r12]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R12])),
2316 [r13]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R13])),
2317 [r14]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R14])),
2318 [r15]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R15])),
2319#endif 2459#endif
2320 [cr2]"i"(offsetof(struct kvm_vcpu, cr2)) 2460 );
2321 : "cc", "memory" ); 2461
2462 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2463 if (vmx->rmode.irq.pending)
2464 fixup_rmode_irq(vmx);
2322 2465
2323 vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; 2466 vcpu->arch.interrupt_window_open =
2467 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
2324 2468
2325 asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); 2469 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
2326 vmx->launched = 1; 2470 vmx->launched = 1;
2327 2471
2328 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 2472 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
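The guest-register save/restore block above leans on a GCC inline-assembly idiom: each register slot is passed as an "i" (immediate) operand holding offsetof(struct vcpu_vmx, ...), and the %c operand modifier emits the bare constant so it can serve as a displacement off the structure pointer held in operand %0. A minimal stand-alone sketch of the same idiom, using a toy struct rather than KVM's types (assumes GCC/Clang on x86-64, not KVM code):

#include <stddef.h>
#include <stdio.h>

struct regs {
	unsigned long ax;
	unsigned long bx;
};

int main(void)
{
	struct regs r = { 0, 0 };

	/* Same pattern as "mov %%rbx, %c[rbx](%0)" above: the offsetof()
	 * immediates become plain displacements off the struct pointer. */
	asm volatile("mov %%rax, %c[ax](%0) \n\t"
		     "mov %%rbx, %c[bx](%0) \n\t"
		     : : "r"(&r),
		       [ax]"i"(offsetof(struct regs, ax)),
		       [bx]"i"(offsetof(struct regs, bx))
		     : "memory");

	printf("saved ax=%#lx bx=%#lx\n", r.ax, r.bx);
	return 0;
}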
@@ -2332,36 +2476,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2332 asm("int $2"); 2476 asm("int $2");
2333} 2477}
2334 2478
2335static void vmx_inject_page_fault(struct kvm_vcpu *vcpu,
2336 unsigned long addr,
2337 u32 err_code)
2338{
2339 u32 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2340
2341 ++vcpu->stat.pf_guest;
2342
2343 if (is_page_fault(vect_info)) {
2344 printk(KERN_DEBUG "inject_page_fault: "
2345 "double fault 0x%lx @ 0x%lx\n",
2346 addr, vmcs_readl(GUEST_RIP));
2347 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0);
2348 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2349 DF_VECTOR |
2350 INTR_TYPE_EXCEPTION |
2351 INTR_INFO_DELIEVER_CODE_MASK |
2352 INTR_INFO_VALID_MASK);
2353 return;
2354 }
2355 vcpu->cr2 = addr;
2356 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, err_code);
2357 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2358 PF_VECTOR |
2359 INTR_TYPE_EXCEPTION |
2360 INTR_INFO_DELIEVER_CODE_MASK |
2361 INTR_INFO_VALID_MASK);
2362
2363}
2364
2365static void vmx_free_vmcs(struct kvm_vcpu *vcpu) 2479static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
2366{ 2480{
2367 struct vcpu_vmx *vmx = to_vmx(vcpu); 2481 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2397,12 +2511,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
2397 if (err) 2511 if (err)
2398 goto free_vcpu; 2512 goto free_vcpu;
2399 2513
2400 if (irqchip_in_kernel(kvm)) {
2401 err = kvm_create_lapic(&vmx->vcpu);
2402 if (err < 0)
2403 goto free_vcpu;
2404 }
2405
2406 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); 2514 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
2407 if (!vmx->guest_msrs) { 2515 if (!vmx->guest_msrs) {
2408 err = -ENOMEM; 2516 err = -ENOMEM;
@@ -2464,6 +2572,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
2464 .check_processor_compatibility = vmx_check_processor_compat, 2572 .check_processor_compatibility = vmx_check_processor_compat,
2465 .hardware_enable = hardware_enable, 2573 .hardware_enable = hardware_enable,
2466 .hardware_disable = hardware_disable, 2574 .hardware_disable = hardware_disable,
2575 .cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses,
2467 2576
2468 .vcpu_create = vmx_create_vcpu, 2577 .vcpu_create = vmx_create_vcpu,
2469 .vcpu_free = vmx_free_vcpu, 2578 .vcpu_free = vmx_free_vcpu,
@@ -2499,9 +2608,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
2499 .set_rflags = vmx_set_rflags, 2608 .set_rflags = vmx_set_rflags,
2500 2609
2501 .tlb_flush = vmx_flush_tlb, 2610 .tlb_flush = vmx_flush_tlb,
2502 .inject_page_fault = vmx_inject_page_fault,
2503
2504 .inject_gp = vmx_inject_gp,
2505 2611
2506 .run = vmx_vcpu_run, 2612 .run = vmx_vcpu_run,
2507 .handle_exit = kvm_handle_exit, 2613 .handle_exit = kvm_handle_exit,
@@ -2509,8 +2615,12 @@ static struct kvm_x86_ops vmx_x86_ops = {
2509 .patch_hypercall = vmx_patch_hypercall, 2615 .patch_hypercall = vmx_patch_hypercall,
2510 .get_irq = vmx_get_irq, 2616 .get_irq = vmx_get_irq,
2511 .set_irq = vmx_inject_irq, 2617 .set_irq = vmx_inject_irq,
2618 .queue_exception = vmx_queue_exception,
2619 .exception_injected = vmx_exception_injected,
2512 .inject_pending_irq = vmx_intr_assist, 2620 .inject_pending_irq = vmx_intr_assist,
2513 .inject_pending_vectors = do_interrupt_requests, 2621 .inject_pending_vectors = do_interrupt_requests,
2622
2623 .set_tss_addr = vmx_set_tss_addr,
2514}; 2624};
2515 2625
2516static int __init vmx_init(void) 2626static int __init vmx_init(void)
@@ -2541,10 +2651,13 @@ static int __init vmx_init(void)
2541 memset(iova, 0xff, PAGE_SIZE); 2651 memset(iova, 0xff, PAGE_SIZE);
2542 kunmap(vmx_io_bitmap_b); 2652 kunmap(vmx_io_bitmap_b);
2543 2653
2544 r = kvm_init_x86(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); 2654 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
2545 if (r) 2655 if (r)
2546 goto out1; 2656 goto out1;
2547 2657
2658 if (bypass_guest_pf)
2659 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
2660
2548 return 0; 2661 return 0;
2549 2662
2550out1: 2663out1:
@@ -2559,7 +2672,7 @@ static void __exit vmx_exit(void)
2559 __free_page(vmx_io_bitmap_b); 2672 __free_page(vmx_io_bitmap_b);
2560 __free_page(vmx_io_bitmap_a); 2673 __free_page(vmx_io_bitmap_a);
2561 2674
2562 kvm_exit_x86(); 2675 kvm_exit();
2563} 2676}
2564 2677
2565module_init(vmx_init) 2678module_init(vmx_init)
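The module-init change above switches vmx.c from the old kvm_init_x86()/kvm_exit_x86() entry points to the generic kvm_init()/kvm_exit(), passing the vmx_x86_ops table so the architecture-neutral core never calls VMX code directly. A toy sketch of that ops-table registration pattern (stand-in names only; the real kvm_init() also takes the vcpu size and module owner, as shown above):

#include <stdio.h>

/* Toy stand-ins, not the real KVM types or prototypes. */
struct backend_ops {
	const char *name;
	int (*hardware_setup)(void);
};

static const struct backend_ops *active_ops;

static int register_backend(const struct backend_ops *ops)
{
	if (active_ops)
		return -1;	/* only one backend (VMX or SVM) at a time */
	active_ops = ops;
	return ops->hardware_setup();
}

static int vmx_hw_setup(void) { return 0; }

static const struct backend_ops toy_vmx_ops = {
	.name = "vmx",
	.hardware_setup = vmx_hw_setup,
};

int main(void)
{
	if (register_backend(&toy_vmx_ops) == 0)
		printf("registered backend: %s\n", active_ops->name);
	return 0;
}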
diff --git a/drivers/kvm/vmx.h b/arch/x86/kvm/vmx.h
index fd4e14666088..d52ae8d7303d 100644
--- a/drivers/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -25,6 +25,9 @@
25 * 25 *
26 */ 26 */
27 27
28/*
29 * Definitions of Primary Processor-Based VM-Execution Controls.
30 */
28#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004 31#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004
29#define CPU_BASED_USE_TSC_OFFSETING 0x00000008 32#define CPU_BASED_USE_TSC_OFFSETING 0x00000008
30#define CPU_BASED_HLT_EXITING 0x00000080 33#define CPU_BASED_HLT_EXITING 0x00000080
@@ -42,6 +45,12 @@
42#define CPU_BASED_MONITOR_EXITING 0x20000000 45#define CPU_BASED_MONITOR_EXITING 0x20000000
43#define CPU_BASED_PAUSE_EXITING 0x40000000 46#define CPU_BASED_PAUSE_EXITING 0x40000000
44#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000 47#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000
48/*
49 * Definitions of Secondary Processor-Based VM-Execution Controls.
50 */
51#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
52#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
53
45 54
46#define PIN_BASED_EXT_INTR_MASK 0x00000001 55#define PIN_BASED_EXT_INTR_MASK 0x00000001
47#define PIN_BASED_NMI_EXITING 0x00000008 56#define PIN_BASED_NMI_EXITING 0x00000008
@@ -54,8 +63,6 @@
54#define VM_ENTRY_SMM 0x00000400 63#define VM_ENTRY_SMM 0x00000400
55#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 64#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
56 65
57#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
58
59/* VMCS Encodings */ 66/* VMCS Encodings */
60enum vmcs_field { 67enum vmcs_field {
61 GUEST_ES_SELECTOR = 0x00000800, 68 GUEST_ES_SELECTOR = 0x00000800,
@@ -89,6 +96,8 @@ enum vmcs_field {
89 TSC_OFFSET_HIGH = 0x00002011, 96 TSC_OFFSET_HIGH = 0x00002011,
90 VIRTUAL_APIC_PAGE_ADDR = 0x00002012, 97 VIRTUAL_APIC_PAGE_ADDR = 0x00002012,
91 VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, 98 VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013,
99 APIC_ACCESS_ADDR = 0x00002014,
100 APIC_ACCESS_ADDR_HIGH = 0x00002015,
92 VMCS_LINK_POINTER = 0x00002800, 101 VMCS_LINK_POINTER = 0x00002800,
93 VMCS_LINK_POINTER_HIGH = 0x00002801, 102 VMCS_LINK_POINTER_HIGH = 0x00002801,
94 GUEST_IA32_DEBUGCTL = 0x00002802, 103 GUEST_IA32_DEBUGCTL = 0x00002802,
@@ -214,6 +223,8 @@ enum vmcs_field {
214#define EXIT_REASON_MSR_WRITE 32 223#define EXIT_REASON_MSR_WRITE 32
215#define EXIT_REASON_MWAIT_INSTRUCTION 36 224#define EXIT_REASON_MWAIT_INSTRUCTION 36
216#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 225#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
226#define EXIT_REASON_APIC_ACCESS 44
227#define EXIT_REASON_WBINVD 54
217 228
218/* 229/*
219 * Interruption-information format 230 * Interruption-information format
@@ -230,13 +241,14 @@ enum vmcs_field {
230 241
231#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ 242#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */
232#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */ 243#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */
244#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */
233 245
234/* 246/*
235 * Exit Qualifications for MOV for Control Register Access 247 * Exit Qualifications for MOV for Control Register Access
236 */ 248 */
237#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control register */ 249#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control reg.*/
238#define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */ 250#define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */
239#define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose register */ 251#define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose reg. */
240#define LMSW_SOURCE_DATA_SHIFT 16 252#define LMSW_SOURCE_DATA_SHIFT 16
241#define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */ 253#define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */
242#define REG_EAX (0 << 8) 254#define REG_EAX (0 << 8)
@@ -259,11 +271,11 @@ enum vmcs_field {
259/* 271/*
260 * Exit Qualifications for MOV for Debug Register Access 272 * Exit Qualifications for MOV for Debug Register Access
261 */ 273 */
262#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug register */ 274#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug reg. */
263#define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */ 275#define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */
264#define TYPE_MOV_TO_DR (0 << 4) 276#define TYPE_MOV_TO_DR (0 << 4)
265#define TYPE_MOV_FROM_DR (1 << 4) 277#define TYPE_MOV_FROM_DR (1 << 4)
266#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose register */ 278#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose reg. */
267 279
268 280
269/* segment AR */ 281/* segment AR */
@@ -307,4 +319,6 @@ enum vmcs_field {
307#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1 319#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1
308#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4 320#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4
309 321
322#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9
323
310#endif 324#endif
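The primary and secondary execution-control bits defined in this header are never written verbatim into the VMCS: the VMX capability MSRs report, per control word, which bits must be 1 and which may be 1, and the hypervisor folds its desired bits through that mask. A hedged sketch of the usual adjustment, with a made-up capability value (the real code reads the MSR_IA32_VMX_* capability MSRs):

#include <stdint.h>
#include <stdio.h>

/* Convention used by the VMX capability MSRs: low 32 bits = bits that
 * must be 1, high 32 bits = bits that are allowed to be 1. */
static uint32_t adjust_controls(uint32_t wanted, uint64_t cap_msr)
{
	uint32_t must_be_one = (uint32_t)cap_msr;
	uint32_t may_be_one  = (uint32_t)(cap_msr >> 32);

	return (wanted | must_be_one) & may_be_one;
}

int main(void)
{
	uint64_t fake_cap = ((uint64_t)0x0000007f << 32) | 0x00000016;
	uint32_t wanted   = 0x00000001;	/* e.g. SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES */

	printf("effective controls = %#x\n", adjust_controls(wanted, fake_cap));
	return 0;
}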
diff --git a/drivers/kvm/kvm_main.c b/arch/x86/kvm/x86.c
index c0f372f1d761..8f94a0b89dff 100644
--- a/drivers/kvm/kvm_main.c
+++ b/arch/x86/kvm/x86.c
@@ -1,8 +1,7 @@
1/* 1/*
2 * Kernel-based Virtual Machine driver for Linux 2 * Kernel-based Virtual Machine driver for Linux
3 * 3 *
4 * This module enables machines with Intel VT-x extensions to run virtual 4 * derived from drivers/kvm/kvm_main.c
5 * machines without emulation or binary translation.
6 * 5 *
7 * Copyright (C) 2006 Qumranet, Inc. 6 * Copyright (C) 2006 Qumranet, Inc.
8 * 7 *
@@ -15,80 +14,22 @@
15 * 14 *
16 */ 15 */
17 16
18#include "kvm.h" 17#include <linux/kvm_host.h>
19#include "x86_emulate.h"
20#include "segment_descriptor.h" 18#include "segment_descriptor.h"
21#include "irq.h" 19#include "irq.h"
20#include "mmu.h"
22 21
23#include <linux/kvm.h> 22#include <linux/kvm.h>
24#include <linux/module.h> 23#include <linux/fs.h>
25#include <linux/errno.h>
26#include <linux/percpu.h>
27#include <linux/gfp.h>
28#include <linux/mm.h>
29#include <linux/miscdevice.h>
30#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
31#include <linux/reboot.h> 25#include <linux/module.h>
32#include <linux/debugfs.h> 26#include <linux/mman.h>
33#include <linux/highmem.h> 27#include <linux/highmem.h>
34#include <linux/file.h>
35#include <linux/sysdev.h>
36#include <linux/cpu.h>
37#include <linux/sched.h>
38#include <linux/cpumask.h>
39#include <linux/smp.h>
40#include <linux/anon_inodes.h>
41#include <linux/profile.h>
42
43#include <asm/processor.h>
44#include <asm/msr.h>
45#include <asm/io.h>
46#include <asm/uaccess.h>
47#include <asm/desc.h>
48
49MODULE_AUTHOR("Qumranet");
50MODULE_LICENSE("GPL");
51 28
52static DEFINE_SPINLOCK(kvm_lock); 29#include <asm/uaccess.h>
53static LIST_HEAD(vm_list); 30#include <asm/msr.h>
54
55static cpumask_t cpus_hardware_enabled;
56
57struct kvm_x86_ops *kvm_x86_ops;
58struct kmem_cache *kvm_vcpu_cache;
59EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
60
61static __read_mostly struct preempt_ops kvm_preempt_ops;
62
63#define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)
64
65static struct kvm_stats_debugfs_item {
66 const char *name;
67 int offset;
68 struct dentry *dentry;
69} debugfs_entries[] = {
70 { "pf_fixed", STAT_OFFSET(pf_fixed) },
71 { "pf_guest", STAT_OFFSET(pf_guest) },
72 { "tlb_flush", STAT_OFFSET(tlb_flush) },
73 { "invlpg", STAT_OFFSET(invlpg) },
74 { "exits", STAT_OFFSET(exits) },
75 { "io_exits", STAT_OFFSET(io_exits) },
76 { "mmio_exits", STAT_OFFSET(mmio_exits) },
77 { "signal_exits", STAT_OFFSET(signal_exits) },
78 { "irq_window", STAT_OFFSET(irq_window_exits) },
79 { "halt_exits", STAT_OFFSET(halt_exits) },
80 { "halt_wakeup", STAT_OFFSET(halt_wakeup) },
81 { "request_irq", STAT_OFFSET(request_irq_exits) },
82 { "irq_exits", STAT_OFFSET(irq_exits) },
83 { "light_exits", STAT_OFFSET(light_exits) },
84 { "efer_reload", STAT_OFFSET(efer_reload) },
85 { NULL }
86};
87
88static struct dentry *debugfs_dir;
89 31
90#define MAX_IO_MSRS 256 32#define MAX_IO_MSRS 256
91
92#define CR0_RESERVED_BITS \ 33#define CR0_RESERVED_BITS \
93 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ 34 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
94 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ 35 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
@@ -102,317 +43,151 @@ static struct dentry *debugfs_dir;
102#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) 43#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
103#define EFER_RESERVED_BITS 0xfffffffffffff2fe 44#define EFER_RESERVED_BITS 0xfffffffffffff2fe
104 45
105#ifdef CONFIG_X86_64 46#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
106// LDT or TSS descriptor in the GDT. 16 bytes. 47#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
107struct segment_descriptor_64 {
108 struct segment_descriptor s;
109 u32 base_higher;
110 u32 pad_zero;
111};
112 48
113#endif 49struct kvm_x86_ops *kvm_x86_ops;
50
51struct kvm_stats_debugfs_item debugfs_entries[] = {
52 { "pf_fixed", VCPU_STAT(pf_fixed) },
53 { "pf_guest", VCPU_STAT(pf_guest) },
54 { "tlb_flush", VCPU_STAT(tlb_flush) },
55 { "invlpg", VCPU_STAT(invlpg) },
56 { "exits", VCPU_STAT(exits) },
57 { "io_exits", VCPU_STAT(io_exits) },
58 { "mmio_exits", VCPU_STAT(mmio_exits) },
59 { "signal_exits", VCPU_STAT(signal_exits) },
60 { "irq_window", VCPU_STAT(irq_window_exits) },
61 { "halt_exits", VCPU_STAT(halt_exits) },
62 { "halt_wakeup", VCPU_STAT(halt_wakeup) },
63 { "request_irq", VCPU_STAT(request_irq_exits) },
64 { "irq_exits", VCPU_STAT(irq_exits) },
65 { "host_state_reload", VCPU_STAT(host_state_reload) },
66 { "efer_reload", VCPU_STAT(efer_reload) },
67 { "fpu_reload", VCPU_STAT(fpu_reload) },
68 { "insn_emulation", VCPU_STAT(insn_emulation) },
69 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
70 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
71 { "mmu_pte_write", VM_STAT(mmu_pte_write) },
72 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
73 { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
74 { "mmu_flooded", VM_STAT(mmu_flooded) },
75 { "mmu_recycled", VM_STAT(mmu_recycled) },
76 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
77 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
78 { NULL }
79};
114 80
115static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
116 unsigned long arg);
117 81
118unsigned long segment_base(u16 selector) 82unsigned long segment_base(u16 selector)
119{ 83{
120 struct descriptor_table gdt; 84 struct descriptor_table gdt;
121 struct segment_descriptor *d; 85 struct segment_descriptor *d;
122 unsigned long table_base; 86 unsigned long table_base;
123 typedef unsigned long ul;
124 unsigned long v; 87 unsigned long v;
125 88
126 if (selector == 0) 89 if (selector == 0)
127 return 0; 90 return 0;
128 91
129 asm ("sgdt %0" : "=m"(gdt)); 92 asm("sgdt %0" : "=m"(gdt));
130 table_base = gdt.base; 93 table_base = gdt.base;
131 94
132 if (selector & 4) { /* from ldt */ 95 if (selector & 4) { /* from ldt */
133 u16 ldt_selector; 96 u16 ldt_selector;
134 97
135 asm ("sldt %0" : "=g"(ldt_selector)); 98 asm("sldt %0" : "=g"(ldt_selector));
136 table_base = segment_base(ldt_selector); 99 table_base = segment_base(ldt_selector);
137 } 100 }
138 d = (struct segment_descriptor *)(table_base + (selector & ~7)); 101 d = (struct segment_descriptor *)(table_base + (selector & ~7));
139 v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24); 102 v = d->base_low | ((unsigned long)d->base_mid << 16) |
103 ((unsigned long)d->base_high << 24);
140#ifdef CONFIG_X86_64 104#ifdef CONFIG_X86_64
141 if (d->system == 0 105 if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
142 && (d->type == 2 || d->type == 9 || d->type == 11)) 106 v |= ((unsigned long) \
143 v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32; 107 ((struct segment_descriptor_64 *)d)->base_higher) << 32;
144#endif 108#endif
145 return v; 109 return v;
146} 110}
147EXPORT_SYMBOL_GPL(segment_base); 111EXPORT_SYMBOL_GPL(segment_base);
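segment_base() reassembles a descriptor base that the GDT/LDT format scatters across base_low (bits 15:0), base_mid (23:16) and base_high (31:24), plus the extra 32 bits that 16-byte system descriptors (LDT/TSS) carry in long mode. A self-contained sketch of the same reassembly over a toy descriptor layout (field names are illustrative, not the kernel's struct segment_descriptor):

#include <stdint.h>
#include <stdio.h>

struct toy_desc {
	uint16_t base_low;	/* base bits 15:0  */
	uint8_t  base_mid;	/* base bits 23:16 */
	uint8_t  base_high;	/* base bits 31:24 */
	uint32_t base_higher;	/* bits 63:32, 16-byte system descriptors only */
};

static uint64_t desc_base(const struct toy_desc *d, int is_16byte)
{
	uint64_t v = d->base_low | ((uint64_t)d->base_mid << 16) |
		     ((uint64_t)d->base_high << 24);

	if (is_16byte)
		v |= (uint64_t)d->base_higher << 32;
	return v;
}

int main(void)
{
	struct toy_desc d = { 0x5678, 0x34, 0x12, 0x1 };

	printf("base = %#llx\n", (unsigned long long)desc_base(&d, 1));
	return 0;
}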
148 112
149static inline int valid_vcpu(int n) 113u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
150{
151 return likely(n >= 0 && n < KVM_MAX_VCPUS);
152}
153
154void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
155{
156 if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
157 return;
158
159 vcpu->guest_fpu_loaded = 1;
160 fx_save(&vcpu->host_fx_image);
161 fx_restore(&vcpu->guest_fx_image);
162}
163EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
164
165void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
166{
167 if (!vcpu->guest_fpu_loaded)
168 return;
169
170 vcpu->guest_fpu_loaded = 0;
171 fx_save(&vcpu->guest_fx_image);
172 fx_restore(&vcpu->host_fx_image);
173}
174EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
175
176/*
177 * Switches to specified vcpu, until a matching vcpu_put()
178 */
179static void vcpu_load(struct kvm_vcpu *vcpu)
180{
181 int cpu;
182
183 mutex_lock(&vcpu->mutex);
184 cpu = get_cpu();
185 preempt_notifier_register(&vcpu->preempt_notifier);
186 kvm_x86_ops->vcpu_load(vcpu, cpu);
187 put_cpu();
188}
189
190static void vcpu_put(struct kvm_vcpu *vcpu)
191{
192 preempt_disable();
193 kvm_x86_ops->vcpu_put(vcpu);
194 preempt_notifier_unregister(&vcpu->preempt_notifier);
195 preempt_enable();
196 mutex_unlock(&vcpu->mutex);
197}
198
199static void ack_flush(void *_completed)
200{
201}
202
203void kvm_flush_remote_tlbs(struct kvm *kvm)
204{
205 int i, cpu;
206 cpumask_t cpus;
207 struct kvm_vcpu *vcpu;
208
209 cpus_clear(cpus);
210 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
211 vcpu = kvm->vcpus[i];
212 if (!vcpu)
213 continue;
214 if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests))
215 continue;
216 cpu = vcpu->cpu;
217 if (cpu != -1 && cpu != raw_smp_processor_id())
218 cpu_set(cpu, cpus);
219 }
220 smp_call_function_mask(cpus, ack_flush, NULL, 1);
221}
222
223int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
224{ 114{
225 struct page *page; 115 if (irqchip_in_kernel(vcpu->kvm))
226 int r; 116 return vcpu->arch.apic_base;
227
228 mutex_init(&vcpu->mutex);
229 vcpu->cpu = -1;
230 vcpu->mmu.root_hpa = INVALID_PAGE;
231 vcpu->kvm = kvm;
232 vcpu->vcpu_id = id;
233 if (!irqchip_in_kernel(kvm) || id == 0)
234 vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
235 else 117 else
236 vcpu->mp_state = VCPU_MP_STATE_UNINITIALIZED; 118 return vcpu->arch.apic_base;
237 init_waitqueue_head(&vcpu->wq);
238
239 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
240 if (!page) {
241 r = -ENOMEM;
242 goto fail;
243 }
244 vcpu->run = page_address(page);
245
246 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
247 if (!page) {
248 r = -ENOMEM;
249 goto fail_free_run;
250 }
251 vcpu->pio_data = page_address(page);
252
253 r = kvm_mmu_create(vcpu);
254 if (r < 0)
255 goto fail_free_pio_data;
256
257 return 0;
258
259fail_free_pio_data:
260 free_page((unsigned long)vcpu->pio_data);
261fail_free_run:
262 free_page((unsigned long)vcpu->run);
263fail:
264 return -ENOMEM;
265}
266EXPORT_SYMBOL_GPL(kvm_vcpu_init);
267
268void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
269{
270 kvm_mmu_destroy(vcpu);
271 if (vcpu->apic)
272 hrtimer_cancel(&vcpu->apic->timer.dev);
273 kvm_free_apic(vcpu->apic);
274 free_page((unsigned long)vcpu->pio_data);
275 free_page((unsigned long)vcpu->run);
276}
277EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
278
279static struct kvm *kvm_create_vm(void)
280{
281 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
282
283 if (!kvm)
284 return ERR_PTR(-ENOMEM);
285
286 kvm_io_bus_init(&kvm->pio_bus);
287 mutex_init(&kvm->lock);
288 INIT_LIST_HEAD(&kvm->active_mmu_pages);
289 kvm_io_bus_init(&kvm->mmio_bus);
290 spin_lock(&kvm_lock);
291 list_add(&kvm->vm_list, &vm_list);
292 spin_unlock(&kvm_lock);
293 return kvm;
294}
295
296/*
297 * Free any memory in @free but not in @dont.
298 */
299static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
300 struct kvm_memory_slot *dont)
301{
302 int i;
303
304 if (!dont || free->phys_mem != dont->phys_mem)
305 if (free->phys_mem) {
306 for (i = 0; i < free->npages; ++i)
307 if (free->phys_mem[i])
308 __free_page(free->phys_mem[i]);
309 vfree(free->phys_mem);
310 }
311
312 if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
313 vfree(free->dirty_bitmap);
314
315 free->phys_mem = NULL;
316 free->npages = 0;
317 free->dirty_bitmap = NULL;
318}
319
320static void kvm_free_physmem(struct kvm *kvm)
321{
322 int i;
323
324 for (i = 0; i < kvm->nmemslots; ++i)
325 kvm_free_physmem_slot(&kvm->memslots[i], NULL);
326} 119}
120EXPORT_SYMBOL_GPL(kvm_get_apic_base);
327 121
328static void free_pio_guest_pages(struct kvm_vcpu *vcpu) 122void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
329{ 123{
330 int i; 124 /* TODO: reserve bits check */
331 125 if (irqchip_in_kernel(vcpu->kvm))
332 for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i) 126 kvm_lapic_set_base(vcpu, data);
333 if (vcpu->pio.guest_pages[i]) { 127 else
334 __free_page(vcpu->pio.guest_pages[i]); 128 vcpu->arch.apic_base = data;
335 vcpu->pio.guest_pages[i] = NULL;
336 }
337} 129}
130EXPORT_SYMBOL_GPL(kvm_set_apic_base);
338 131
339static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) 132void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
340{ 133{
341 vcpu_load(vcpu); 134 WARN_ON(vcpu->arch.exception.pending);
342 kvm_mmu_unload(vcpu); 135 vcpu->arch.exception.pending = true;
343 vcpu_put(vcpu); 136 vcpu->arch.exception.has_error_code = false;
137 vcpu->arch.exception.nr = nr;
344} 138}
139EXPORT_SYMBOL_GPL(kvm_queue_exception);
345 140
346static void kvm_free_vcpus(struct kvm *kvm) 141void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
142 u32 error_code)
347{ 143{
348 unsigned int i; 144 ++vcpu->stat.pf_guest;
349 145 if (vcpu->arch.exception.pending && vcpu->arch.exception.nr == PF_VECTOR) {
350 /* 146 printk(KERN_DEBUG "kvm: inject_page_fault:"
351 * Unpin any mmu pages first. 147 " double fault 0x%lx\n", addr);
352 */ 148 vcpu->arch.exception.nr = DF_VECTOR;
353 for (i = 0; i < KVM_MAX_VCPUS; ++i) 149 vcpu->arch.exception.error_code = 0;
354 if (kvm->vcpus[i]) 150 return;
355 kvm_unload_vcpu_mmu(kvm->vcpus[i]);
356 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
357 if (kvm->vcpus[i]) {
358 kvm_x86_ops->vcpu_free(kvm->vcpus[i]);
359 kvm->vcpus[i] = NULL;
360 }
361 } 151 }
362 152 vcpu->arch.cr2 = addr;
153 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
363} 154}
364 155
365static void kvm_destroy_vm(struct kvm *kvm) 156void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
366{ 157{
367 spin_lock(&kvm_lock); 158 WARN_ON(vcpu->arch.exception.pending);
368 list_del(&kvm->vm_list); 159 vcpu->arch.exception.pending = true;
369 spin_unlock(&kvm_lock); 160 vcpu->arch.exception.has_error_code = true;
370 kvm_io_bus_destroy(&kvm->pio_bus); 161 vcpu->arch.exception.nr = nr;
371 kvm_io_bus_destroy(&kvm->mmio_bus); 162 vcpu->arch.exception.error_code = error_code;
372 kfree(kvm->vpic);
373 kfree(kvm->vioapic);
374 kvm_free_vcpus(kvm);
375 kvm_free_physmem(kvm);
376 kfree(kvm);
377} 163}
164EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
378 165
379static int kvm_vm_release(struct inode *inode, struct file *filp) 166static void __queue_exception(struct kvm_vcpu *vcpu)
380{ 167{
381 struct kvm *kvm = filp->private_data; 168 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
382 169 vcpu->arch.exception.has_error_code,
383 kvm_destroy_vm(kvm); 170 vcpu->arch.exception.error_code);
384 return 0;
385}
386
387static void inject_gp(struct kvm_vcpu *vcpu)
388{
389 kvm_x86_ops->inject_gp(vcpu, 0);
390} 171}
391 172
392/* 173/*
393 * Load the pae pdptrs. Return true is they are all valid. 174 * Load the pae pdptrs. Return true is they are all valid.
394 */ 175 */
395static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) 176int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
396{ 177{
397 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; 178 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
398 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; 179 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
399 int i; 180 int i;
400 u64 *pdpt;
401 int ret; 181 int ret;
402 struct page *page; 182 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
403 u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)];
404 183
405 mutex_lock(&vcpu->kvm->lock); 184 down_read(&current->mm->mmap_sem);
406 page = gfn_to_page(vcpu->kvm, pdpt_gfn); 185 ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
407 if (!page) { 186 offset * sizeof(u64), sizeof(pdpte));
187 if (ret < 0) {
408 ret = 0; 188 ret = 0;
409 goto out; 189 goto out;
410 } 190 }
411
412 pdpt = kmap_atomic(page, KM_USER0);
413 memcpy(pdpte, pdpt+offset, sizeof(pdpte));
414 kunmap_atomic(pdpt, KM_USER0);
415
416 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { 191 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
417 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) { 192 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
418 ret = 0; 193 ret = 0;
@@ -421,78 +196,96 @@ static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
421 } 196 }
422 ret = 1; 197 ret = 1;
423 198
424 memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs)); 199 memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
425out: 200out:
426 mutex_unlock(&vcpu->kvm->lock); 201 up_read(&current->mm->mmap_sem);
427 202
428 return ret; 203 return ret;
429} 204}
430 205
206static bool pdptrs_changed(struct kvm_vcpu *vcpu)
207{
208 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
209 bool changed = true;
210 int r;
211
212 if (is_long_mode(vcpu) || !is_pae(vcpu))
213 return false;
214
215 down_read(&current->mm->mmap_sem);
216 r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
217 if (r < 0)
218 goto out;
219 changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
220out:
221 up_read(&current->mm->mmap_sem);
222
223 return changed;
224}
225
431void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 226void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
432{ 227{
433 if (cr0 & CR0_RESERVED_BITS) { 228 if (cr0 & CR0_RESERVED_BITS) {
434 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", 229 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
435 cr0, vcpu->cr0); 230 cr0, vcpu->arch.cr0);
436 inject_gp(vcpu); 231 kvm_inject_gp(vcpu, 0);
437 return; 232 return;
438 } 233 }
439 234
440 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { 235 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
441 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); 236 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
442 inject_gp(vcpu); 237 kvm_inject_gp(vcpu, 0);
443 return; 238 return;
444 } 239 }
445 240
446 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { 241 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
447 printk(KERN_DEBUG "set_cr0: #GP, set PG flag " 242 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
448 "and a clear PE flag\n"); 243 "and a clear PE flag\n");
449 inject_gp(vcpu); 244 kvm_inject_gp(vcpu, 0);
450 return; 245 return;
451 } 246 }
452 247
453 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 248 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
454#ifdef CONFIG_X86_64 249#ifdef CONFIG_X86_64
455 if ((vcpu->shadow_efer & EFER_LME)) { 250 if ((vcpu->arch.shadow_efer & EFER_LME)) {
456 int cs_db, cs_l; 251 int cs_db, cs_l;
457 252
458 if (!is_pae(vcpu)) { 253 if (!is_pae(vcpu)) {
459 printk(KERN_DEBUG "set_cr0: #GP, start paging " 254 printk(KERN_DEBUG "set_cr0: #GP, start paging "
460 "in long mode while PAE is disabled\n"); 255 "in long mode while PAE is disabled\n");
461 inject_gp(vcpu); 256 kvm_inject_gp(vcpu, 0);
462 return; 257 return;
463 } 258 }
464 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 259 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
465 if (cs_l) { 260 if (cs_l) {
466 printk(KERN_DEBUG "set_cr0: #GP, start paging " 261 printk(KERN_DEBUG "set_cr0: #GP, start paging "
467 "in long mode while CS.L == 1\n"); 262 "in long mode while CS.L == 1\n");
468 inject_gp(vcpu); 263 kvm_inject_gp(vcpu, 0);
469 return; 264 return;
470 265
471 } 266 }
472 } else 267 } else
473#endif 268#endif
474 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) { 269 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
475 printk(KERN_DEBUG "set_cr0: #GP, pdptrs " 270 printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
476 "reserved bits\n"); 271 "reserved bits\n");
477 inject_gp(vcpu); 272 kvm_inject_gp(vcpu, 0);
478 return; 273 return;
479 } 274 }
480 275
481 } 276 }
482 277
483 kvm_x86_ops->set_cr0(vcpu, cr0); 278 kvm_x86_ops->set_cr0(vcpu, cr0);
484 vcpu->cr0 = cr0; 279 vcpu->arch.cr0 = cr0;
485 280
486 mutex_lock(&vcpu->kvm->lock);
487 kvm_mmu_reset_context(vcpu); 281 kvm_mmu_reset_context(vcpu);
488 mutex_unlock(&vcpu->kvm->lock);
489 return; 282 return;
490} 283}
491EXPORT_SYMBOL_GPL(set_cr0); 284EXPORT_SYMBOL_GPL(set_cr0);
492 285
493void lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 286void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
494{ 287{
495 set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f)); 288 set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
496} 289}
497EXPORT_SYMBOL_GPL(lmsw); 290EXPORT_SYMBOL_GPL(lmsw);
498 291
@@ -500,7 +293,7 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
500{ 293{
501 if (cr4 & CR4_RESERVED_BITS) { 294 if (cr4 & CR4_RESERVED_BITS) {
502 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); 295 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
503 inject_gp(vcpu); 296 kvm_inject_gp(vcpu, 0);
504 return; 297 return;
505 } 298 }
506 299
@@ -508,35 +301,38 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
508 if (!(cr4 & X86_CR4_PAE)) { 301 if (!(cr4 & X86_CR4_PAE)) {
509 printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " 302 printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
510 "in long mode\n"); 303 "in long mode\n");
511 inject_gp(vcpu); 304 kvm_inject_gp(vcpu, 0);
512 return; 305 return;
513 } 306 }
514 } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE) 307 } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
515 && !load_pdptrs(vcpu, vcpu->cr3)) { 308 && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
516 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); 309 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
517 inject_gp(vcpu); 310 kvm_inject_gp(vcpu, 0);
518 return; 311 return;
519 } 312 }
520 313
521 if (cr4 & X86_CR4_VMXE) { 314 if (cr4 & X86_CR4_VMXE) {
522 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); 315 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
523 inject_gp(vcpu); 316 kvm_inject_gp(vcpu, 0);
524 return; 317 return;
525 } 318 }
526 kvm_x86_ops->set_cr4(vcpu, cr4); 319 kvm_x86_ops->set_cr4(vcpu, cr4);
527 vcpu->cr4 = cr4; 320 vcpu->arch.cr4 = cr4;
528 mutex_lock(&vcpu->kvm->lock);
529 kvm_mmu_reset_context(vcpu); 321 kvm_mmu_reset_context(vcpu);
530 mutex_unlock(&vcpu->kvm->lock);
531} 322}
532EXPORT_SYMBOL_GPL(set_cr4); 323EXPORT_SYMBOL_GPL(set_cr4);
533 324
534void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 325void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
535{ 326{
327 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
328 kvm_mmu_flush_tlb(vcpu);
329 return;
330 }
331
536 if (is_long_mode(vcpu)) { 332 if (is_long_mode(vcpu)) {
537 if (cr3 & CR3_L_MODE_RESERVED_BITS) { 333 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
538 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); 334 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
539 inject_gp(vcpu); 335 kvm_inject_gp(vcpu, 0);
540 return; 336 return;
541 } 337 }
542 } else { 338 } else {
@@ -544,26 +340,23 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
544 if (cr3 & CR3_PAE_RESERVED_BITS) { 340 if (cr3 & CR3_PAE_RESERVED_BITS) {
545 printk(KERN_DEBUG 341 printk(KERN_DEBUG
546 "set_cr3: #GP, reserved bits\n"); 342 "set_cr3: #GP, reserved bits\n");
547 inject_gp(vcpu); 343 kvm_inject_gp(vcpu, 0);
548 return; 344 return;
549 } 345 }
550 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { 346 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
551 printk(KERN_DEBUG "set_cr3: #GP, pdptrs " 347 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
552 "reserved bits\n"); 348 "reserved bits\n");
553 inject_gp(vcpu); 349 kvm_inject_gp(vcpu, 0);
554 return;
555 }
556 } else {
557 if (cr3 & CR3_NONPAE_RESERVED_BITS) {
558 printk(KERN_DEBUG
559 "set_cr3: #GP, reserved bits\n");
560 inject_gp(vcpu);
561 return; 350 return;
562 } 351 }
563 } 352 }
353 /*
354 * We don't check reserved bits in nonpae mode, because
355 * this isn't enforced, and VMware depends on this.
356 */
564 } 357 }
565 358
566 mutex_lock(&vcpu->kvm->lock); 359 down_read(&current->mm->mmap_sem);
567 /* 360 /*
568 * Does the new cr3 value map to physical memory? (Note, we 361 * Does the new cr3 value map to physical memory? (Note, we
569 * catch an invalid cr3 even in real-mode, because it would 362 * catch an invalid cr3 even in real-mode, because it would
@@ -574,12 +367,12 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
574 * to debug) behavior on the guest side. 367 * to debug) behavior on the guest side.
575 */ 368 */
576 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) 369 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
577 inject_gp(vcpu); 370 kvm_inject_gp(vcpu, 0);
578 else { 371 else {
579 vcpu->cr3 = cr3; 372 vcpu->arch.cr3 = cr3;
580 vcpu->mmu.new_cr3(vcpu); 373 vcpu->arch.mmu.new_cr3(vcpu);
581 } 374 }
582 mutex_unlock(&vcpu->kvm->lock); 375 up_read(&current->mm->mmap_sem);
583} 376}
584EXPORT_SYMBOL_GPL(set_cr3); 377EXPORT_SYMBOL_GPL(set_cr3);
585 378
@@ -587,13 +380,13 @@ void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
587{ 380{
588 if (cr8 & CR8_RESERVED_BITS) { 381 if (cr8 & CR8_RESERVED_BITS) {
589 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); 382 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
590 inject_gp(vcpu); 383 kvm_inject_gp(vcpu, 0);
591 return; 384 return;
592 } 385 }
593 if (irqchip_in_kernel(vcpu->kvm)) 386 if (irqchip_in_kernel(vcpu->kvm))
594 kvm_lapic_set_tpr(vcpu, cr8); 387 kvm_lapic_set_tpr(vcpu, cr8);
595 else 388 else
596 vcpu->cr8 = cr8; 389 vcpu->arch.cr8 = cr8;
597} 390}
598EXPORT_SYMBOL_GPL(set_cr8); 391EXPORT_SYMBOL_GPL(set_cr8);
599 392
@@ -602,210 +395,846 @@ unsigned long get_cr8(struct kvm_vcpu *vcpu)
602 if (irqchip_in_kernel(vcpu->kvm)) 395 if (irqchip_in_kernel(vcpu->kvm))
603 return kvm_lapic_get_cr8(vcpu); 396 return kvm_lapic_get_cr8(vcpu);
604 else 397 else
605 return vcpu->cr8; 398 return vcpu->arch.cr8;
606} 399}
607EXPORT_SYMBOL_GPL(get_cr8); 400EXPORT_SYMBOL_GPL(get_cr8);
608 401
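set_cr8() and get_cr8() treat CR8 as an alias of the local APIC task-priority register: with the in-kernel irqchip the value is forwarded to the emulated LAPIC, otherwise it is simply cached in vcpu->arch.cr8. Architecturally CR8 bits 3:0 correspond to TPR bits 7:4; a tiny illustration of that mapping (toy helpers, not KVM code):

#include <stdint.h>
#include <stdio.h>

static uint32_t cr8_to_tpr(uint64_t cr8) { return (uint32_t)((cr8 & 0xf) << 4); }
static uint64_t tpr_to_cr8(uint32_t tpr) { return (tpr >> 4) & 0xf; }

int main(void)
{
	uint64_t cr8 = 0x9;	/* task-priority class 9 */
	uint32_t tpr = cr8_to_tpr(cr8);

	printf("cr8=%#llx -> tpr=%#x -> cr8=%#llx\n",
	       (unsigned long long)cr8, tpr,
	       (unsigned long long)tpr_to_cr8(tpr));
	return 0;
}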
609u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) 402/*
403 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
404 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
405 *
406 * This list is modified at module load time to reflect the
407 * capabilities of the host cpu.
408 */
409static u32 msrs_to_save[] = {
410 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
411 MSR_K6_STAR,
412#ifdef CONFIG_X86_64
413 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
414#endif
415 MSR_IA32_TIME_STAMP_COUNTER,
416};
417
418static unsigned num_msrs_to_save;
419
420static u32 emulated_msrs[] = {
421 MSR_IA32_MISC_ENABLE,
422};
423
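msrs_to_save[] and emulated_msrs[] back the KVM_GET_MSR_INDEX_LIST ioctl handled later in kvm_arch_dev_ioctl(): userspace supplies a kvm_msr_list sized for nmsrs indices, the kernel rewrites nmsrs to the real count and copies the indices back, or fails with -E2BIG if the buffer is too small. A hedged userspace sketch of that call (error handling trimmed; assumes a kernel exposing this interface):

#include <fcntl.h>
#include <linux/kvm.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	if (kvm < 0) {
		perror("open /dev/kvm");
		return 1;
	}

	/* Room for 256 indices is plenty for the lists defined above. */
	struct kvm_msr_list *list =
		calloc(1, sizeof(*list) + 256 * sizeof(__u32));
	list->nmsrs = 256;

	if (ioctl(kvm, KVM_GET_MSR_INDEX_LIST, list) < 0)
		perror("KVM_GET_MSR_INDEX_LIST");
	else
		for (__u32 i = 0; i < list->nmsrs; i++)
			printf("saveable MSR %#x\n", list->indices[i]);

	free(list);
	close(kvm);
	return 0;
}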
424#ifdef CONFIG_X86_64
425
426static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
610{ 427{
611 if (irqchip_in_kernel(vcpu->kvm)) 428 if (efer & EFER_RESERVED_BITS) {
612 return vcpu->apic_base; 429 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
613 else 430 efer);
614 return vcpu->apic_base; 431 kvm_inject_gp(vcpu, 0);
432 return;
433 }
434
435 if (is_paging(vcpu)
436 && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
437 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
438 kvm_inject_gp(vcpu, 0);
439 return;
440 }
441
442 kvm_x86_ops->set_efer(vcpu, efer);
443
444 efer &= ~EFER_LMA;
445 efer |= vcpu->arch.shadow_efer & EFER_LMA;
446
447 vcpu->arch.shadow_efer = efer;
615} 448}
616EXPORT_SYMBOL_GPL(kvm_get_apic_base);
617 449
618void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) 450#endif
451
452/*
 453 * Writes msr value into the appropriate "register".
454 * Returns 0 on success, non-0 otherwise.
455 * Assumes vcpu_load() was already called.
456 */
457int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
619{ 458{
620 /* TODO: reserve bits check */ 459 return kvm_x86_ops->set_msr(vcpu, msr_index, data);
621 if (irqchip_in_kernel(vcpu->kvm))
622 kvm_lapic_set_base(vcpu, data);
623 else
624 vcpu->apic_base = data;
625} 460}
626EXPORT_SYMBOL_GPL(kvm_set_apic_base);
627 461
628void fx_init(struct kvm_vcpu *vcpu) 462/*
463 * Adapt set_msr() to msr_io()'s calling convention
464 */
465static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
629{ 466{
630 unsigned after_mxcsr_mask; 467 return kvm_set_msr(vcpu, index, *data);
468}
631 469
632 /* Initialize guest FPU by resetting ours and saving into guest's */
633 preempt_disable();
634 fx_save(&vcpu->host_fx_image);
635 fpu_init();
636 fx_save(&vcpu->guest_fx_image);
637 fx_restore(&vcpu->host_fx_image);
638 preempt_enable();
639 470
640 vcpu->cr0 |= X86_CR0_ET; 471int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
641 after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); 472{
642 vcpu->guest_fx_image.mxcsr = 0x1f80; 473 switch (msr) {
643 memset((void *)&vcpu->guest_fx_image + after_mxcsr_mask, 474#ifdef CONFIG_X86_64
644 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); 475 case MSR_EFER:
476 set_efer(vcpu, data);
477 break;
478#endif
479 case MSR_IA32_MC0_STATUS:
480 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
481 __FUNCTION__, data);
482 break;
483 case MSR_IA32_MCG_STATUS:
484 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
485 __FUNCTION__, data);
486 break;
487 case MSR_IA32_UCODE_REV:
488 case MSR_IA32_UCODE_WRITE:
489 case 0x200 ... 0x2ff: /* MTRRs */
490 break;
491 case MSR_IA32_APICBASE:
492 kvm_set_apic_base(vcpu, data);
493 break;
494 case MSR_IA32_MISC_ENABLE:
495 vcpu->arch.ia32_misc_enable_msr = data;
496 break;
497 default:
498 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
499 return 1;
500 }
501 return 0;
645} 502}
646EXPORT_SYMBOL_GPL(fx_init); 503EXPORT_SYMBOL_GPL(kvm_set_msr_common);
504
505
506/*
507 * Reads an msr value (of 'msr_index') into 'pdata'.
508 * Returns 0 on success, non-0 otherwise.
509 * Assumes vcpu_load() was already called.
510 */
511int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
512{
513 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
514}
515
516int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
517{
518 u64 data;
519
520 switch (msr) {
521 case 0xc0010010: /* SYSCFG */
522 case 0xc0010015: /* HWCR */
523 case MSR_IA32_PLATFORM_ID:
524 case MSR_IA32_P5_MC_ADDR:
525 case MSR_IA32_P5_MC_TYPE:
526 case MSR_IA32_MC0_CTL:
527 case MSR_IA32_MCG_STATUS:
528 case MSR_IA32_MCG_CAP:
529 case MSR_IA32_MC0_MISC:
530 case MSR_IA32_MC0_MISC+4:
531 case MSR_IA32_MC0_MISC+8:
532 case MSR_IA32_MC0_MISC+12:
533 case MSR_IA32_MC0_MISC+16:
534 case MSR_IA32_UCODE_REV:
535 case MSR_IA32_PERF_STATUS:
536 case MSR_IA32_EBL_CR_POWERON:
537 /* MTRR registers */
538 case 0xfe:
539 case 0x200 ... 0x2ff:
540 data = 0;
541 break;
542 case 0xcd: /* fsb frequency */
543 data = 3;
544 break;
545 case MSR_IA32_APICBASE:
546 data = kvm_get_apic_base(vcpu);
547 break;
548 case MSR_IA32_MISC_ENABLE:
549 data = vcpu->arch.ia32_misc_enable_msr;
550 break;
551#ifdef CONFIG_X86_64
552 case MSR_EFER:
553 data = vcpu->arch.shadow_efer;
554 break;
555#endif
556 default:
557 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
558 return 1;
559 }
560 *pdata = data;
561 return 0;
562}
563EXPORT_SYMBOL_GPL(kvm_get_msr_common);
647 564
648/* 565/*
649 * Allocate some memory and give it an address in the guest physical address 566 * Read or write a bunch of msrs. All parameters are kernel addresses.
650 * space.
651 * 567 *
652 * Discontiguous memory is allowed, mostly for framebuffers. 568 * @return number of msrs set successfully.
653 */ 569 */
654static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, 570static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
655 struct kvm_memory_region *mem) 571 struct kvm_msr_entry *entries,
572 int (*do_msr)(struct kvm_vcpu *vcpu,
573 unsigned index, u64 *data))
656{ 574{
657 int r; 575 int i;
658 gfn_t base_gfn;
659 unsigned long npages;
660 unsigned long i;
661 struct kvm_memory_slot *memslot;
662 struct kvm_memory_slot old, new;
663 576
664 r = -EINVAL; 577 vcpu_load(vcpu);
665 /* General sanity checks */ 578
666 if (mem->memory_size & (PAGE_SIZE - 1)) 579 for (i = 0; i < msrs->nmsrs; ++i)
667 goto out; 580 if (do_msr(vcpu, entries[i].index, &entries[i].data))
668 if (mem->guest_phys_addr & (PAGE_SIZE - 1)) 581 break;
582
583 vcpu_put(vcpu);
584
585 return i;
586}
587
588/*
589 * Read or write a bunch of msrs. Parameters are user addresses.
590 *
591 * @return number of msrs set successfully.
592 */
593static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
594 int (*do_msr)(struct kvm_vcpu *vcpu,
595 unsigned index, u64 *data),
596 int writeback)
597{
598 struct kvm_msrs msrs;
599 struct kvm_msr_entry *entries;
600 int r, n;
601 unsigned size;
602
603 r = -EFAULT;
604 if (copy_from_user(&msrs, user_msrs, sizeof msrs))
669 goto out; 605 goto out;
670 if (mem->slot >= KVM_MEMORY_SLOTS) 606
607 r = -E2BIG;
608 if (msrs.nmsrs >= MAX_IO_MSRS)
671 goto out; 609 goto out;
672 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) 610
611 r = -ENOMEM;
612 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
613 entries = vmalloc(size);
614 if (!entries)
673 goto out; 615 goto out;
674 616
675 memslot = &kvm->memslots[mem->slot]; 617 r = -EFAULT;
676 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; 618 if (copy_from_user(entries, user_msrs->entries, size))
677 npages = mem->memory_size >> PAGE_SHIFT; 619 goto out_free;
678 620
679 if (!npages) 621 r = n = __msr_io(vcpu, &msrs, entries, do_msr);
680 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; 622 if (r < 0)
623 goto out_free;
681 624
682 mutex_lock(&kvm->lock); 625 r = -EFAULT;
626 if (writeback && copy_to_user(user_msrs->entries, entries, size))
627 goto out_free;
683 628
684 new = old = *memslot; 629 r = n;
685 630
686 new.base_gfn = base_gfn; 631out_free:
687 new.npages = npages; 632 vfree(entries);
688 new.flags = mem->flags; 633out:
634 return r;
635}
689 636
690 /* Disallow changing a memory slot's size. */ 637/*
691 r = -EINVAL; 638 * Make sure that a cpu that is being hot-unplugged does not have any vcpus
692 if (npages && old.npages && npages != old.npages) 639 * cached on it.
693 goto out_unlock; 640 */
641void decache_vcpus_on_cpu(int cpu)
642{
643 struct kvm *vm;
644 struct kvm_vcpu *vcpu;
645 int i;
694 646
695 /* Check for overlaps */ 647 spin_lock(&kvm_lock);
696 r = -EEXIST; 648 list_for_each_entry(vm, &vm_list, vm_list)
697 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 649 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
698 struct kvm_memory_slot *s = &kvm->memslots[i]; 650 vcpu = vm->vcpus[i];
651 if (!vcpu)
652 continue;
653 /*
654 * If the vcpu is locked, then it is running on some
655 * other cpu and therefore it is not cached on the
656 * cpu in question.
657 *
658 * If it's not locked, check the last cpu it executed
659 * on.
660 */
661 if (mutex_trylock(&vcpu->mutex)) {
662 if (vcpu->cpu == cpu) {
663 kvm_x86_ops->vcpu_decache(vcpu);
664 vcpu->cpu = -1;
665 }
666 mutex_unlock(&vcpu->mutex);
667 }
668 }
669 spin_unlock(&kvm_lock);
670}
699 671
700 if (s == memslot) 672int kvm_dev_ioctl_check_extension(long ext)
701 continue; 673{
702 if (!((base_gfn + npages <= s->base_gfn) || 674 int r;
703 (base_gfn >= s->base_gfn + s->npages))) 675
704 goto out_unlock; 676 switch (ext) {
677 case KVM_CAP_IRQCHIP:
678 case KVM_CAP_HLT:
679 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
680 case KVM_CAP_USER_MEMORY:
681 case KVM_CAP_SET_TSS_ADDR:
682 case KVM_CAP_EXT_CPUID:
683 r = 1;
684 break;
685 case KVM_CAP_VAPIC:
686 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
687 break;
688 default:
689 r = 0;
690 break;
705 } 691 }
692 return r;
706 693
707 /* Deallocate if slot is being removed */ 694}
708 if (!npages)
709 new.phys_mem = NULL;
710 695
711 /* Free page dirty bitmap if unneeded */ 696long kvm_arch_dev_ioctl(struct file *filp,
712 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) 697 unsigned int ioctl, unsigned long arg)
713 new.dirty_bitmap = NULL; 698{
699 void __user *argp = (void __user *)arg;
700 long r;
714 701
715 r = -ENOMEM; 702 switch (ioctl) {
703 case KVM_GET_MSR_INDEX_LIST: {
704 struct kvm_msr_list __user *user_msr_list = argp;
705 struct kvm_msr_list msr_list;
706 unsigned n;
716 707
717 /* Allocate if a slot is being created */ 708 r = -EFAULT;
718 if (npages && !new.phys_mem) { 709 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
719 new.phys_mem = vmalloc(npages * sizeof(struct page *)); 710 goto out;
711 n = msr_list.nmsrs;
712 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
713 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
714 goto out;
715 r = -E2BIG;
716 if (n < num_msrs_to_save)
717 goto out;
718 r = -EFAULT;
719 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
720 num_msrs_to_save * sizeof(u32)))
721 goto out;
722 if (copy_to_user(user_msr_list->indices
723 + num_msrs_to_save * sizeof(u32),
724 &emulated_msrs,
725 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
726 goto out;
727 r = 0;
728 break;
729 }
730 default:
731 r = -EINVAL;
732 }
733out:
734 return r;
735}
736
737void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
738{
739 kvm_x86_ops->vcpu_load(vcpu, cpu);
740}
720 741
721 if (!new.phys_mem) 742void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
722 goto out_unlock; 743{
744 kvm_x86_ops->vcpu_put(vcpu);
745 kvm_put_guest_fpu(vcpu);
746}
723 747
724 memset(new.phys_mem, 0, npages * sizeof(struct page *)); 748static int is_efer_nx(void)
725 for (i = 0; i < npages; ++i) { 749{
726 new.phys_mem[i] = alloc_page(GFP_HIGHUSER 750 u64 efer;
727 | __GFP_ZERO); 751
728 if (!new.phys_mem[i]) 752 rdmsrl(MSR_EFER, efer);
729 goto out_unlock; 753 return efer & EFER_NX;
730 set_page_private(new.phys_mem[i],0); 754}
731 }
732 }
733 755
734 /* Allocate page dirty bitmap if needed */ 756static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
735 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { 757{
736 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; 758 int i;
759 struct kvm_cpuid_entry2 *e, *entry;
737 760
738 new.dirty_bitmap = vmalloc(dirty_bytes); 761 entry = NULL;
739 if (!new.dirty_bitmap) 762 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
740 goto out_unlock; 763 e = &vcpu->arch.cpuid_entries[i];
741 memset(new.dirty_bitmap, 0, dirty_bytes); 764 if (e->function == 0x80000001) {
765 entry = e;
766 break;
767 }
742 } 768 }
769 if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
770 entry->edx &= ~(1 << 20);
771 printk(KERN_INFO "kvm: guest NX capability removed\n");
772 }
773}
743 774
744 if (mem->slot >= kvm->nmemslots) 775/* when an old userspace process fills a new kernel module */
745 kvm->nmemslots = mem->slot + 1; 776static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
777 struct kvm_cpuid *cpuid,
778 struct kvm_cpuid_entry __user *entries)
779{
780 int r, i;
781 struct kvm_cpuid_entry *cpuid_entries;
746 782
747 *memslot = new; 783 r = -E2BIG;
784 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
785 goto out;
786 r = -ENOMEM;
787 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
788 if (!cpuid_entries)
789 goto out;
790 r = -EFAULT;
791 if (copy_from_user(cpuid_entries, entries,
792 cpuid->nent * sizeof(struct kvm_cpuid_entry)))
793 goto out_free;
794 for (i = 0; i < cpuid->nent; i++) {
795 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
796 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
797 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
798 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
799 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
800 vcpu->arch.cpuid_entries[i].index = 0;
801 vcpu->arch.cpuid_entries[i].flags = 0;
802 vcpu->arch.cpuid_entries[i].padding[0] = 0;
803 vcpu->arch.cpuid_entries[i].padding[1] = 0;
804 vcpu->arch.cpuid_entries[i].padding[2] = 0;
805 }
806 vcpu->arch.cpuid_nent = cpuid->nent;
807 cpuid_fix_nx_cap(vcpu);
808 r = 0;
748 809
749 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 810out_free:
750 kvm_flush_remote_tlbs(kvm); 811 vfree(cpuid_entries);
812out:
813 return r;
814}
751 815
752 mutex_unlock(&kvm->lock); 816static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
817 struct kvm_cpuid2 *cpuid,
818 struct kvm_cpuid_entry2 __user *entries)
819{
820 int r;
753 821
754 kvm_free_physmem_slot(&old, &new); 822 r = -E2BIG;
823 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
824 goto out;
825 r = -EFAULT;
826 if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
827 cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
828 goto out;
829 vcpu->arch.cpuid_nent = cpuid->nent;
755 return 0; 830 return 0;
756 831
757out_unlock:
758 mutex_unlock(&kvm->lock);
759 kvm_free_physmem_slot(&new, &old);
760out: 832out:
761 return r; 833 return r;
762} 834}
763 835
764/* 836static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
765 * Get (and clear) the dirty memory log for a memory slot. 837 struct kvm_cpuid2 *cpuid,
766 */ 838 struct kvm_cpuid_entry2 __user *entries)
767static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
768 struct kvm_dirty_log *log)
769{ 839{
770 struct kvm_memory_slot *memslot; 840 int r;
771 int r, i;
772 int n;
773 unsigned long any = 0;
774
775 mutex_lock(&kvm->lock);
776 841
777 r = -EINVAL; 842 r = -E2BIG;
778 if (log->slot >= KVM_MEMORY_SLOTS) 843 if (cpuid->nent < vcpu->arch.cpuid_nent)
779 goto out; 844 goto out;
780 845 r = -EFAULT;
781 memslot = &kvm->memslots[log->slot]; 846 if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
782 r = -ENOENT; 847 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
783 if (!memslot->dirty_bitmap)
784 goto out; 848 goto out;
849 return 0;
850
851out:
852 cpuid->nent = vcpu->arch.cpuid_nent;
853 return r;
854}
855
856static inline u32 bit(int bitno)
857{
858 return 1 << (bitno & 31);
859}
860
861static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
862 u32 index)
863{
864 entry->function = function;
865 entry->index = index;
866 cpuid_count(entry->function, entry->index,
867 &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
868 entry->flags = 0;
869}
870
871static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
872 u32 index, int *nent, int maxnent)
873{
874 const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |
875 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
876 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
877 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
878 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
879 bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
880 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
881 bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
882 bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
883 bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
884 const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
885 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
886 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
887 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
888 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
889 bit(X86_FEATURE_PGE) |
890 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
891 bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
892 bit(X86_FEATURE_SYSCALL) |
893 (bit(X86_FEATURE_NX) && is_efer_nx()) |
894#ifdef CONFIG_X86_64
895 bit(X86_FEATURE_LM) |
896#endif
897 bit(X86_FEATURE_MMXEXT) |
898 bit(X86_FEATURE_3DNOWEXT) |
899 bit(X86_FEATURE_3DNOW);
900 const u32 kvm_supported_word3_x86_features =
901 bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
902 const u32 kvm_supported_word6_x86_features =
903 bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY);
904
905 /* all func 2 cpuid_count() should be called on the same cpu */
906 get_cpu();
907 do_cpuid_1_ent(entry, function, index);
908 ++*nent;
909
910 switch (function) {
911 case 0:
912 entry->eax = min(entry->eax, (u32)0xb);
913 break;
914 case 1:
915 entry->edx &= kvm_supported_word0_x86_features;
916 entry->ecx &= kvm_supported_word3_x86_features;
917 break;
918 /* function 2 entries are STATEFUL. That is, repeated cpuid commands
919 * may return different values. This forces us to get_cpu() before
920 * issuing the first command, and also to emulate this annoying behavior
921 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
922 case 2: {
923 int t, times = entry->eax & 0xff;
924
925 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
926 for (t = 1; t < times && *nent < maxnent; ++t) {
927 do_cpuid_1_ent(&entry[t], function, 0);
928 entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
929 ++*nent;
930 }
931 break;
932 }
933 /* function 4 and 0xb have additional index. */
934 case 4: {
935 int index, cache_type;
936
937 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
938 /* read more entries until cache_type is zero */
939 for (index = 1; *nent < maxnent; ++index) {
940 cache_type = entry[index - 1].eax & 0x1f;
941 if (!cache_type)
942 break;
943 do_cpuid_1_ent(&entry[index], function, index);
944 entry[index].flags |=
945 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
946 ++*nent;
947 }
948 break;
949 }
950 case 0xb: {
951 int index, level_type;
952
953 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
954 /* read more entries until level_type is zero */
955 for (index = 1; *nent < maxnent; ++index) {
956 level_type = entry[index - 1].ecx & 0xff;
957 if (!level_type)
958 break;
959 do_cpuid_1_ent(&entry[index], function, index);
960 entry[index].flags |=
961 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
962 ++*nent;
963 }
964 break;
965 }
966 case 0x80000000:
967 entry->eax = min(entry->eax, 0x8000001a);
968 break;
969 case 0x80000001:
970 entry->edx &= kvm_supported_word1_x86_features;
971 entry->ecx &= kvm_supported_word6_x86_features;
972 break;
973 }
974 put_cpu();
975}
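do_cpuid_ent() special-cases leaf 2 because that leaf is stateful: the low byte of EAX says how many times CPUID(2) has to be executed to enumerate all descriptors, which is why the loop above re-issues it 'times' times and tags the entries with KVM_CPUID_FLAG_STATEFUL_FUNC. A small native sketch of the same iteration (assumes GCC/Clang on x86 and the <cpuid.h> helper):

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int a, b, c, d;

	if (!__get_cpuid(2, &a, &b, &c, &d))
		return 1;

	/* Low byte of EAX = number of times leaf 2 must be executed. */
	unsigned int times = a & 0xff;
	printf("leaf 2 iterations required: %u\n", times);

	for (unsigned int t = 1; t < times; t++)
		__get_cpuid(2, &a, &b, &c, &d);

	return 0;
}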
785 976
786 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 977static int kvm_vm_ioctl_get_supported_cpuid(struct kvm *kvm,
978 struct kvm_cpuid2 *cpuid,
979 struct kvm_cpuid_entry2 __user *entries)
980{
981 struct kvm_cpuid_entry2 *cpuid_entries;
982 int limit, nent = 0, r = -E2BIG;
983 u32 func;
787 984
788 for (i = 0; !any && i < n/sizeof(long); ++i) 985 if (cpuid->nent < 1)
789 any = memslot->dirty_bitmap[i]; 986 goto out;
987 r = -ENOMEM;
988 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
989 if (!cpuid_entries)
990 goto out;
790 991
992 do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
993 limit = cpuid_entries[0].eax;
994 for (func = 1; func <= limit && nent < cpuid->nent; ++func)
995 do_cpuid_ent(&cpuid_entries[nent], func, 0,
996 &nent, cpuid->nent);
997 r = -E2BIG;
998 if (nent >= cpuid->nent)
999 goto out_free;
1000
1001 do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
1002 limit = cpuid_entries[nent - 1].eax;
1003 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
1004 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1005 &nent, cpuid->nent);
791 r = -EFAULT; 1006 r = -EFAULT;
792 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) 1007 if (copy_to_user(entries, cpuid_entries,
793 goto out; 1008 nent * sizeof(struct kvm_cpuid_entry2)))
1009 goto out_free;
1010 cpuid->nent = nent;
1011 r = 0;
794 1012
795 /* If nothing is dirty, don't bother messing with page tables. */ 1013out_free:
796 if (any) { 1014 vfree(cpuid_entries);
797 kvm_mmu_slot_remove_write_access(kvm, log->slot); 1015out:
798 kvm_flush_remote_tlbs(kvm); 1016 return r;
799 memset(memslot->dirty_bitmap, 0, n); 1017}
1018
1019static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
1020 struct kvm_lapic_state *s)
1021{
1022 vcpu_load(vcpu);
1023 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
1024 vcpu_put(vcpu);
1025
1026 return 0;
1027}
1028
1029static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
1030 struct kvm_lapic_state *s)
1031{
1032 vcpu_load(vcpu);
1033 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
1034 kvm_apic_post_state_restore(vcpu);
1035 vcpu_put(vcpu);
1036
1037 return 0;
1038}
1039
1040static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1041 struct kvm_interrupt *irq)
1042{
1043 if (irq->irq < 0 || irq->irq >= 256)
1044 return -EINVAL;
1045 if (irqchip_in_kernel(vcpu->kvm))
1046 return -ENXIO;
1047 vcpu_load(vcpu);
1048
1049 set_bit(irq->irq, vcpu->arch.irq_pending);
1050 set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
1051
1052 vcpu_put(vcpu);
1053
1054 return 0;
1055}
1056
1057static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
1058 struct kvm_tpr_access_ctl *tac)
1059{
1060 if (tac->flags)
1061 return -EINVAL;
1062 vcpu->arch.tpr_access_reporting = !!tac->enabled;
1063 return 0;
1064}
1065
1066long kvm_arch_vcpu_ioctl(struct file *filp,
1067 unsigned int ioctl, unsigned long arg)
1068{
1069 struct kvm_vcpu *vcpu = filp->private_data;
1070 void __user *argp = (void __user *)arg;
1071 int r;
1072
1073 switch (ioctl) {
1074 case KVM_GET_LAPIC: {
1075 struct kvm_lapic_state lapic;
1076
1077 memset(&lapic, 0, sizeof lapic);
1078 r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
1079 if (r)
1080 goto out;
1081 r = -EFAULT;
1082 if (copy_to_user(argp, &lapic, sizeof lapic))
1083 goto out;
1084 r = 0;
1085 break;
800 } 1086 }
1087 case KVM_SET_LAPIC: {
1088 struct kvm_lapic_state lapic;
801 1089
802 r = 0; 1090 r = -EFAULT;
1091 if (copy_from_user(&lapic, argp, sizeof lapic))
1092 goto out;
 1093	r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);
1094 if (r)
1095 goto out;
1096 r = 0;
1097 break;
1098 }
1099 case KVM_INTERRUPT: {
1100 struct kvm_interrupt irq;
1101
1102 r = -EFAULT;
1103 if (copy_from_user(&irq, argp, sizeof irq))
1104 goto out;
1105 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
1106 if (r)
1107 goto out;
1108 r = 0;
1109 break;
1110 }
1111 case KVM_SET_CPUID: {
1112 struct kvm_cpuid __user *cpuid_arg = argp;
1113 struct kvm_cpuid cpuid;
803 1114
1115 r = -EFAULT;
1116 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1117 goto out;
1118 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
1119 if (r)
1120 goto out;
1121 break;
1122 }
1123 case KVM_SET_CPUID2: {
1124 struct kvm_cpuid2 __user *cpuid_arg = argp;
1125 struct kvm_cpuid2 cpuid;
1126
1127 r = -EFAULT;
1128 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1129 goto out;
1130 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
1131 cpuid_arg->entries);
1132 if (r)
1133 goto out;
1134 break;
1135 }
1136 case KVM_GET_CPUID2: {
1137 struct kvm_cpuid2 __user *cpuid_arg = argp;
1138 struct kvm_cpuid2 cpuid;
1139
1140 r = -EFAULT;
1141 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1142 goto out;
1143 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
1144 cpuid_arg->entries);
1145 if (r)
1146 goto out;
1147 r = -EFAULT;
1148 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1149 goto out;
1150 r = 0;
1151 break;
1152 }
1153 case KVM_GET_MSRS:
1154 r = msr_io(vcpu, argp, kvm_get_msr, 1);
1155 break;
1156 case KVM_SET_MSRS:
1157 r = msr_io(vcpu, argp, do_set_msr, 0);
1158 break;
1159 case KVM_TPR_ACCESS_REPORTING: {
1160 struct kvm_tpr_access_ctl tac;
1161
1162 r = -EFAULT;
1163 if (copy_from_user(&tac, argp, sizeof tac))
1164 goto out;
1165 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
1166 if (r)
1167 goto out;
1168 r = -EFAULT;
1169 if (copy_to_user(argp, &tac, sizeof tac))
1170 goto out;
1171 r = 0;
1172 break;
 1173	}
1174 case KVM_SET_VAPIC_ADDR: {
1175 struct kvm_vapic_addr va;
1176
1177 r = -EINVAL;
1178 if (!irqchip_in_kernel(vcpu->kvm))
1179 goto out;
1180 r = -EFAULT;
1181 if (copy_from_user(&va, argp, sizeof va))
1182 goto out;
1183 r = 0;
1184 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
1185 break;
1186 }
1187 default:
1188 r = -EINVAL;
1189 }
804out: 1190out:
805 mutex_unlock(&kvm->lock);
806 return r; 1191 return r;
807} 1192}
808 1193
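Of the new vcpu ioctls above, KVM_TPR_ACCESS_REPORTING is the least self-explanatory: it lets userspace ask for a KVM_EXIT_TPR_ACCESS exit whenever the guest touches the task-priority register, so the VMM can patch the guest for cheaper TPR access. A hedged userspace sketch, assuming vcpu_fd comes from a prior KVM_CREATE_VCPU:

#include <linux/kvm.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>

/* vcpu_fd is an assumption of this sketch */
static int enable_tpr_reporting(int vcpu_fd)
{
	struct kvm_tpr_access_ctl tac;

	memset(&tac, 0, sizeof(tac));
	tac.enabled = 1;	/* flags must stay zero or the ioctl fails */

	if (ioctl(vcpu_fd, KVM_TPR_ACCESS_REPORTING, &tac) < 0) {
		perror("KVM_TPR_ACCESS_REPORTING");
		return -1;
	}
	return 0;		/* the kernel copies tac back to userspace */
}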
1194static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
1195{
1196 int ret;
1197
1198 if (addr > (unsigned int)(-3 * PAGE_SIZE))
1199 return -1;
1200 ret = kvm_x86_ops->set_tss_addr(kvm, addr);
1201 return ret;
1202}
1203
1204static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
1205 u32 kvm_nr_mmu_pages)
1206{
1207 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
1208 return -EINVAL;
1209
1210 down_write(&current->mm->mmap_sem);
1211
1212 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
1213 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
1214
1215 up_write(&current->mm->mmap_sem);
1216 return 0;
1217}
1218
1219static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
1220{
1221 return kvm->arch.n_alloc_mmu_pages;
1222}
1223
1224gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
1225{
1226 int i;
1227 struct kvm_mem_alias *alias;
1228
1229 for (i = 0; i < kvm->arch.naliases; ++i) {
1230 alias = &kvm->arch.aliases[i];
1231 if (gfn >= alias->base_gfn
1232 && gfn < alias->base_gfn + alias->npages)
1233 return alias->target_gfn + gfn - alias->base_gfn;
1234 }
1235 return gfn;
1236}
1237
809/* 1238/*
810 * Set a new alias region. Aliases map a portion of physical memory into 1239 * Set a new alias region. Aliases map a portion of physical memory into
811 * another portion. This is useful for memory windows, for example the PC 1240 * another portion. This is useful for memory windows, for example the PC
@@ -832,21 +1261,21 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
832 < alias->target_phys_addr) 1261 < alias->target_phys_addr)
833 goto out; 1262 goto out;
834 1263
835 mutex_lock(&kvm->lock); 1264 down_write(&current->mm->mmap_sem);
836 1265
837 p = &kvm->aliases[alias->slot]; 1266 p = &kvm->arch.aliases[alias->slot];
838 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; 1267 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
839 p->npages = alias->memory_size >> PAGE_SHIFT; 1268 p->npages = alias->memory_size >> PAGE_SHIFT;
840 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; 1269 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
841 1270
842 for (n = KVM_ALIAS_SLOTS; n > 0; --n) 1271 for (n = KVM_ALIAS_SLOTS; n > 0; --n)
843 if (kvm->aliases[n - 1].npages) 1272 if (kvm->arch.aliases[n - 1].npages)
844 break; 1273 break;
845 kvm->naliases = n; 1274 kvm->arch.naliases = n;
846 1275
847 kvm_mmu_zap_all(kvm); 1276 kvm_mmu_zap_all(kvm);
848 1277
849 mutex_unlock(&kvm->lock); 1278 up_write(&current->mm->mmap_sem);
850 1279
851 return 0; 1280 return 0;
852 1281
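On the userspace side, an alias is programmed with a single ioctl on the VM file descriptor. A hedged sketch of how a VMM might use it, with an assumed vm_fd and illustrative addresses (the guest window and target region below are made up, only the struct layout follows <linux/kvm.h>):

#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>

/* vm_fd is assumed to come from a prior KVM_CREATE_VM call */
static int set_vga_alias(int vm_fd)
{
	struct kvm_memory_alias alias = {
		.slot             = 0,
		.guest_phys_addr  = 0xa0000,		/* window the guest accesses */
		.memory_size      = 0x20000,		/* 128 KiB */
		.target_phys_addr = 0xc0000000,		/* illustrative target region */
	};

	if (ioctl(vm_fd, KVM_SET_MEMORY_ALIAS, &alias) < 0) {
		perror("KVM_SET_MEMORY_ALIAS");
		return -1;
	}
	return 0;
}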
@@ -861,17 +1290,17 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
861 r = 0; 1290 r = 0;
862 switch (chip->chip_id) { 1291 switch (chip->chip_id) {
863 case KVM_IRQCHIP_PIC_MASTER: 1292 case KVM_IRQCHIP_PIC_MASTER:
864 memcpy (&chip->chip.pic, 1293 memcpy(&chip->chip.pic,
865 &pic_irqchip(kvm)->pics[0], 1294 &pic_irqchip(kvm)->pics[0],
866 sizeof(struct kvm_pic_state)); 1295 sizeof(struct kvm_pic_state));
867 break; 1296 break;
868 case KVM_IRQCHIP_PIC_SLAVE: 1297 case KVM_IRQCHIP_PIC_SLAVE:
869 memcpy (&chip->chip.pic, 1298 memcpy(&chip->chip.pic,
870 &pic_irqchip(kvm)->pics[1], 1299 &pic_irqchip(kvm)->pics[1],
871 sizeof(struct kvm_pic_state)); 1300 sizeof(struct kvm_pic_state));
872 break; 1301 break;
873 case KVM_IRQCHIP_IOAPIC: 1302 case KVM_IRQCHIP_IOAPIC:
874 memcpy (&chip->chip.ioapic, 1303 memcpy(&chip->chip.ioapic,
875 ioapic_irqchip(kvm), 1304 ioapic_irqchip(kvm),
876 sizeof(struct kvm_ioapic_state)); 1305 sizeof(struct kvm_ioapic_state));
877 break; 1306 break;
@@ -889,17 +1318,17 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
889 r = 0; 1318 r = 0;
890 switch (chip->chip_id) { 1319 switch (chip->chip_id) {
891 case KVM_IRQCHIP_PIC_MASTER: 1320 case KVM_IRQCHIP_PIC_MASTER:
892 memcpy (&pic_irqchip(kvm)->pics[0], 1321 memcpy(&pic_irqchip(kvm)->pics[0],
893 &chip->chip.pic, 1322 &chip->chip.pic,
894 sizeof(struct kvm_pic_state)); 1323 sizeof(struct kvm_pic_state));
895 break; 1324 break;
896 case KVM_IRQCHIP_PIC_SLAVE: 1325 case KVM_IRQCHIP_PIC_SLAVE:
897 memcpy (&pic_irqchip(kvm)->pics[1], 1326 memcpy(&pic_irqchip(kvm)->pics[1],
898 &chip->chip.pic, 1327 &chip->chip.pic,
899 sizeof(struct kvm_pic_state)); 1328 sizeof(struct kvm_pic_state));
900 break; 1329 break;
901 case KVM_IRQCHIP_IOAPIC: 1330 case KVM_IRQCHIP_IOAPIC:
902 memcpy (ioapic_irqchip(kvm), 1331 memcpy(ioapic_irqchip(kvm),
903 &chip->chip.ioapic, 1332 &chip->chip.ioapic,
904 sizeof(struct kvm_ioapic_state)); 1333 sizeof(struct kvm_ioapic_state));
905 break; 1334 break;
@@ -911,110 +1340,191 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
911 return r; 1340 return r;
912} 1341}
913 1342
914static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) 1343/*
1344 * Get (and clear) the dirty memory log for a memory slot.
1345 */
1346int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1347 struct kvm_dirty_log *log)
915{ 1348{
916 int i; 1349 int r;
917 struct kvm_mem_alias *alias; 1350 int n;
918 1351 struct kvm_memory_slot *memslot;
919 for (i = 0; i < kvm->naliases; ++i) { 1352 int is_dirty = 0;
920 alias = &kvm->aliases[i];
921 if (gfn >= alias->base_gfn
922 && gfn < alias->base_gfn + alias->npages)
923 return alias->target_gfn + gfn - alias->base_gfn;
924 }
925 return gfn;
926}
927 1353
928static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 1354 down_write(&current->mm->mmap_sem);
929{
930 int i;
931 1355
932 for (i = 0; i < kvm->nmemslots; ++i) { 1356 r = kvm_get_dirty_log(kvm, log, &is_dirty);
933 struct kvm_memory_slot *memslot = &kvm->memslots[i]; 1357 if (r)
1358 goto out;
934 1359
935 if (gfn >= memslot->base_gfn 1360 /* If nothing is dirty, don't bother messing with page tables. */
936 && gfn < memslot->base_gfn + memslot->npages) 1361 if (is_dirty) {
937 return memslot; 1362 kvm_mmu_slot_remove_write_access(kvm, log->slot);
1363 kvm_flush_remote_tlbs(kvm);
1364 memslot = &kvm->memslots[log->slot];
1365 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
1366 memset(memslot->dirty_bitmap, 0, n);
938 } 1367 }
939 return NULL; 1368 r = 0;
940} 1369out:
941 1370 up_write(&current->mm->mmap_sem);
942struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 1371 return r;
943{
944 gfn = unalias_gfn(kvm, gfn);
945 return __gfn_to_memslot(kvm, gfn);
946}
947
948struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
949{
950 struct kvm_memory_slot *slot;
951
952 gfn = unalias_gfn(kvm, gfn);
953 slot = __gfn_to_memslot(kvm, gfn);
954 if (!slot)
955 return NULL;
956 return slot->phys_mem[gfn - slot->base_gfn];
957} 1372}
958EXPORT_SYMBOL_GPL(gfn_to_page);
959 1373
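The new kvm_vm_ioctl_get_dirty_log() above is driven from userspace through KVM_GET_DIRTY_LOG. A hedged sketch of the caller's side, assuming vm_fd, the slot number and the slot's page count are already known; the bitmap sizing mirrors the kernel's ALIGN(npages, BITS_PER_LONG) / 8:

#include <linux/kvm.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>

static void *fetch_dirty_bitmap(int vm_fd, unsigned int slot,
				unsigned long npages)
{
	/* round up to a multiple of 64 bits, as the kernel side does */
	size_t bytes = ((npages + 63) & ~63UL) / 8;
	struct kvm_dirty_log log;
	void *bitmap = calloc(1, bytes);

	if (!bitmap)
		return NULL;

	memset(&log, 0, sizeof(log));
	log.slot = slot;
	log.dirty_bitmap = bitmap;

	if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log) < 0) {
		free(bitmap);
		return NULL;
	}
	return bitmap;		/* bit n set => page n was written */
}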
960/* WARNING: Does not work on aliased pages. */ 1374long kvm_arch_vm_ioctl(struct file *filp,
961void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 1375 unsigned int ioctl, unsigned long arg)
962{ 1376{
963 struct kvm_memory_slot *memslot; 1377 struct kvm *kvm = filp->private_data;
1378 void __user *argp = (void __user *)arg;
1379 int r = -EINVAL;
964 1380
965 memslot = __gfn_to_memslot(kvm, gfn); 1381 switch (ioctl) {
966 if (memslot && memslot->dirty_bitmap) { 1382 case KVM_SET_TSS_ADDR:
967 unsigned long rel_gfn = gfn - memslot->base_gfn; 1383 r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
1384 if (r < 0)
1385 goto out;
1386 break;
1387 case KVM_SET_MEMORY_REGION: {
1388 struct kvm_memory_region kvm_mem;
1389 struct kvm_userspace_memory_region kvm_userspace_mem;
968 1390
969 /* avoid RMW */ 1391 r = -EFAULT;
970 if (!test_bit(rel_gfn, memslot->dirty_bitmap)) 1392 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
971 set_bit(rel_gfn, memslot->dirty_bitmap); 1393 goto out;
1394 kvm_userspace_mem.slot = kvm_mem.slot;
1395 kvm_userspace_mem.flags = kvm_mem.flags;
1396 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
1397 kvm_userspace_mem.memory_size = kvm_mem.memory_size;
1398 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
1399 if (r)
1400 goto out;
1401 break;
972 } 1402 }
973} 1403 case KVM_SET_NR_MMU_PAGES:
1404 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
1405 if (r)
1406 goto out;
1407 break;
1408 case KVM_GET_NR_MMU_PAGES:
1409 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
1410 break;
1411 case KVM_SET_MEMORY_ALIAS: {
1412 struct kvm_memory_alias alias;
974 1413
975int emulator_read_std(unsigned long addr, 1414 r = -EFAULT;
976 void *val, 1415 if (copy_from_user(&alias, argp, sizeof alias))
977 unsigned int bytes, 1416 goto out;
978 struct kvm_vcpu *vcpu) 1417 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
979{ 1418 if (r)
980 void *data = val; 1419 goto out;
1420 break;
1421 }
1422 case KVM_CREATE_IRQCHIP:
1423 r = -ENOMEM;
1424 kvm->arch.vpic = kvm_create_pic(kvm);
1425 if (kvm->arch.vpic) {
1426 r = kvm_ioapic_init(kvm);
1427 if (r) {
1428 kfree(kvm->arch.vpic);
1429 kvm->arch.vpic = NULL;
1430 goto out;
1431 }
1432 } else
1433 goto out;
1434 break;
1435 case KVM_IRQ_LINE: {
1436 struct kvm_irq_level irq_event;
981 1437
982 while (bytes) { 1438 r = -EFAULT;
983 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); 1439 if (copy_from_user(&irq_event, argp, sizeof irq_event))
984 unsigned offset = addr & (PAGE_SIZE-1); 1440 goto out;
985 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); 1441 if (irqchip_in_kernel(kvm)) {
986 unsigned long pfn; 1442 mutex_lock(&kvm->lock);
987 struct page *page; 1443 if (irq_event.irq < 16)
988 void *page_virt; 1444 kvm_pic_set_irq(pic_irqchip(kvm),
1445 irq_event.irq,
1446 irq_event.level);
1447 kvm_ioapic_set_irq(kvm->arch.vioapic,
1448 irq_event.irq,
1449 irq_event.level);
1450 mutex_unlock(&kvm->lock);
1451 r = 0;
1452 }
1453 break;
1454 }
1455 case KVM_GET_IRQCHIP: {
1456 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1457 struct kvm_irqchip chip;
989 1458
990 if (gpa == UNMAPPED_GVA) 1459 r = -EFAULT;
991 return X86EMUL_PROPAGATE_FAULT; 1460 if (copy_from_user(&chip, argp, sizeof chip))
992 pfn = gpa >> PAGE_SHIFT; 1461 goto out;
993 page = gfn_to_page(vcpu->kvm, pfn); 1462 r = -ENXIO;
994 if (!page) 1463 if (!irqchip_in_kernel(kvm))
995 return X86EMUL_UNHANDLEABLE; 1464 goto out;
996 page_virt = kmap_atomic(page, KM_USER0); 1465 r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
1466 if (r)
1467 goto out;
1468 r = -EFAULT;
1469 if (copy_to_user(argp, &chip, sizeof chip))
1470 goto out;
1471 r = 0;
1472 break;
1473 }
1474 case KVM_SET_IRQCHIP: {
1475 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1476 struct kvm_irqchip chip;
997 1477
998 memcpy(data, page_virt + offset, tocopy); 1478 r = -EFAULT;
1479 if (copy_from_user(&chip, argp, sizeof chip))
1480 goto out;
1481 r = -ENXIO;
1482 if (!irqchip_in_kernel(kvm))
1483 goto out;
1484 r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
1485 if (r)
1486 goto out;
1487 r = 0;
1488 break;
1489 }
1490 case KVM_GET_SUPPORTED_CPUID: {
1491 struct kvm_cpuid2 __user *cpuid_arg = argp;
1492 struct kvm_cpuid2 cpuid;
999 1493
1000 kunmap_atomic(page_virt, KM_USER0); 1494 r = -EFAULT;
1495 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1496 goto out;
1497 r = kvm_vm_ioctl_get_supported_cpuid(kvm, &cpuid,
1498 cpuid_arg->entries);
1499 if (r)
1500 goto out;
1001 1501
1002 bytes -= tocopy; 1502 r = -EFAULT;
1003 data += tocopy; 1503 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1004 addr += tocopy; 1504 goto out;
1505 r = 0;
1506 break;
1005 } 1507 }
1006 1508 default:
1007 return X86EMUL_CONTINUE; 1509 ;
1510 }
1511out:
1512 return r;
1008} 1513}
1009EXPORT_SYMBOL_GPL(emulator_read_std);
1010 1514
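kvm_arch_vm_ioctl() above wires KVM_GET_SUPPORTED_CPUID up as a VM ioctl (later kernels moved it to the /dev/kvm system fd). A hedged userspace sketch of the intended flow, with an arbitrary entry count and vm_fd/vcpu_fd assumed to come from KVM_CREATE_VM and KVM_CREATE_VCPU: ask the kernel which CPUID entries it can expose, optionally trim them, then install the result with KVM_SET_CPUID2:

#include <linux/kvm.h>
#include <stdlib.h>
#include <sys/ioctl.h>

#define MAX_CPUID_ENTRIES 100	/* arbitrary upper bound for this sketch */

static int install_guest_cpuid(int vm_fd, int vcpu_fd)
{
	struct kvm_cpuid2 *cpuid;
	int r = -1;

	cpuid = calloc(1, sizeof(*cpuid) +
			  MAX_CPUID_ENTRIES * sizeof(cpuid->entries[0]));
	if (!cpuid)
		return -1;
	cpuid->nent = MAX_CPUID_ENTRIES;

	if (ioctl(vm_fd, KVM_GET_SUPPORTED_CPUID, cpuid) < 0)
		goto out;
	/* cpuid->nent now holds the number of entries actually filled in */
	if (ioctl(vcpu_fd, KVM_SET_CPUID2, cpuid) < 0)
		goto out;
	r = 0;
out:
	free(cpuid);
	return r;
}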
1011static int emulator_write_std(unsigned long addr, 1515static void kvm_init_msr_list(void)
1012 const void *val,
1013 unsigned int bytes,
1014 struct kvm_vcpu *vcpu)
1015{ 1516{
1016 pr_unimpl(vcpu, "emulator_write_std: addr %lx n %d\n", addr, bytes); 1517 u32 dummy[2];
1017 return X86EMUL_UNHANDLEABLE; 1518 unsigned i, j;
1519
1520 for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
1521 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
1522 continue;
1523 if (j < i)
1524 msrs_to_save[j] = msrs_to_save[i];
1525 j++;
1526 }
1527 num_msrs_to_save = j;
1018} 1528}
1019 1529
1020/* 1530/*
@@ -1025,14 +1535,15 @@ static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
1025{ 1535{
1026 struct kvm_io_device *dev; 1536 struct kvm_io_device *dev;
1027 1537
1028 if (vcpu->apic) { 1538 if (vcpu->arch.apic) {
1029 dev = &vcpu->apic->dev; 1539 dev = &vcpu->arch.apic->dev;
1030 if (dev->in_range(dev, addr)) 1540 if (dev->in_range(dev, addr))
1031 return dev; 1541 return dev;
1032 } 1542 }
1033 return NULL; 1543 return NULL;
1034} 1544}
1035 1545
1546
1036static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, 1547static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1037 gpa_t addr) 1548 gpa_t addr)
1038{ 1549{
@@ -1044,11 +1555,40 @@ static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1044 return dev; 1555 return dev;
1045} 1556}
1046 1557
1047static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, 1558int emulator_read_std(unsigned long addr,
1048 gpa_t addr) 1559 void *val,
1560 unsigned int bytes,
1561 struct kvm_vcpu *vcpu)
1049{ 1562{
1050 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr); 1563 void *data = val;
1564 int r = X86EMUL_CONTINUE;
1565
1566 down_read(&current->mm->mmap_sem);
1567 while (bytes) {
1568 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1569 unsigned offset = addr & (PAGE_SIZE-1);
1570 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
1571 int ret;
1572
1573 if (gpa == UNMAPPED_GVA) {
1574 r = X86EMUL_PROPAGATE_FAULT;
1575 goto out;
1576 }
1577 ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);
1578 if (ret < 0) {
1579 r = X86EMUL_UNHANDLEABLE;
1580 goto out;
1581 }
1582
1583 bytes -= tocopy;
1584 data += tocopy;
1585 addr += tocopy;
1586 }
1587out:
1588 up_read(&current->mm->mmap_sem);
1589 return r;
1051} 1590}
1591EXPORT_SYMBOL_GPL(emulator_read_std);
1052 1592
1053static int emulator_read_emulated(unsigned long addr, 1593static int emulator_read_emulated(unsigned long addr,
1054 void *val, 1594 void *val,
@@ -1062,22 +1602,34 @@ static int emulator_read_emulated(unsigned long addr,
1062 memcpy(val, vcpu->mmio_data, bytes); 1602 memcpy(val, vcpu->mmio_data, bytes);
1063 vcpu->mmio_read_completed = 0; 1603 vcpu->mmio_read_completed = 0;
1064 return X86EMUL_CONTINUE; 1604 return X86EMUL_CONTINUE;
1065 } else if (emulator_read_std(addr, val, bytes, vcpu) 1605 }
1066 == X86EMUL_CONTINUE) 1606
1067 return X86EMUL_CONTINUE; 1607 down_read(&current->mm->mmap_sem);
1608 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1609 up_read(&current->mm->mmap_sem);
1610
1611 /* For APIC access vmexit */
1612 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1613 goto mmio;
1068 1614
1069 gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); 1615 if (emulator_read_std(addr, val, bytes, vcpu)
1616 == X86EMUL_CONTINUE)
1617 return X86EMUL_CONTINUE;
1070 if (gpa == UNMAPPED_GVA) 1618 if (gpa == UNMAPPED_GVA)
1071 return X86EMUL_PROPAGATE_FAULT; 1619 return X86EMUL_PROPAGATE_FAULT;
1072 1620
1621mmio:
1073 /* 1622 /*
1074 * Is this MMIO handled locally? 1623 * Is this MMIO handled locally?
1075 */ 1624 */
1625 mutex_lock(&vcpu->kvm->lock);
1076 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); 1626 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1077 if (mmio_dev) { 1627 if (mmio_dev) {
1078 kvm_iodevice_read(mmio_dev, gpa, bytes, val); 1628 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
1629 mutex_unlock(&vcpu->kvm->lock);
1079 return X86EMUL_CONTINUE; 1630 return X86EMUL_CONTINUE;
1080 } 1631 }
1632 mutex_unlock(&vcpu->kvm->lock);
1081 1633
1082 vcpu->mmio_needed = 1; 1634 vcpu->mmio_needed = 1;
1083 vcpu->mmio_phys_addr = gpa; 1635 vcpu->mmio_phys_addr = gpa;
@@ -1090,19 +1642,16 @@ static int emulator_read_emulated(unsigned long addr,
1090static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 1642static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1091 const void *val, int bytes) 1643 const void *val, int bytes)
1092{ 1644{
1093 struct page *page; 1645 int ret;
1094 void *virt;
1095 1646
1096 if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT)) 1647 down_read(&current->mm->mmap_sem);
1648 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
1649 if (ret < 0) {
1650 up_read(&current->mm->mmap_sem);
1097 return 0; 1651 return 0;
1098 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 1652 }
1099 if (!page)
1100 return 0;
1101 mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
1102 virt = kmap_atomic(page, KM_USER0);
1103 kvm_mmu_pte_write(vcpu, gpa, val, bytes); 1653 kvm_mmu_pte_write(vcpu, gpa, val, bytes);
1104 memcpy(virt + offset_in_page(gpa), val, bytes); 1654 up_read(&current->mm->mmap_sem);
1105 kunmap_atomic(virt, KM_USER0);
1106 return 1; 1655 return 1;
1107} 1656}
1108 1657
@@ -1112,24 +1661,36 @@ static int emulator_write_emulated_onepage(unsigned long addr,
1112 struct kvm_vcpu *vcpu) 1661 struct kvm_vcpu *vcpu)
1113{ 1662{
1114 struct kvm_io_device *mmio_dev; 1663 struct kvm_io_device *mmio_dev;
1115 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); 1664 gpa_t gpa;
1665
1666 down_read(&current->mm->mmap_sem);
1667 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1668 up_read(&current->mm->mmap_sem);
1116 1669
1117 if (gpa == UNMAPPED_GVA) { 1670 if (gpa == UNMAPPED_GVA) {
1118 kvm_x86_ops->inject_page_fault(vcpu, addr, 2); 1671 kvm_inject_page_fault(vcpu, addr, 2);
1119 return X86EMUL_PROPAGATE_FAULT; 1672 return X86EMUL_PROPAGATE_FAULT;
1120 } 1673 }
1121 1674
1675 /* For APIC access vmexit */
1676 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1677 goto mmio;
1678
1122 if (emulator_write_phys(vcpu, gpa, val, bytes)) 1679 if (emulator_write_phys(vcpu, gpa, val, bytes))
1123 return X86EMUL_CONTINUE; 1680 return X86EMUL_CONTINUE;
1124 1681
1682mmio:
1125 /* 1683 /*
1126 * Is this MMIO handled locally? 1684 * Is this MMIO handled locally?
1127 */ 1685 */
1686 mutex_lock(&vcpu->kvm->lock);
1128 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); 1687 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1129 if (mmio_dev) { 1688 if (mmio_dev) {
1130 kvm_iodevice_write(mmio_dev, gpa, bytes, val); 1689 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
1690 mutex_unlock(&vcpu->kvm->lock);
1131 return X86EMUL_CONTINUE; 1691 return X86EMUL_CONTINUE;
1132 } 1692 }
1693 mutex_unlock(&vcpu->kvm->lock);
1133 1694
1134 vcpu->mmio_needed = 1; 1695 vcpu->mmio_needed = 1;
1135 vcpu->mmio_phys_addr = gpa; 1696 vcpu->mmio_phys_addr = gpa;
@@ -1173,6 +1734,35 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
1173 reported = 1; 1734 reported = 1;
1174 printk(KERN_WARNING "kvm: emulating exchange as write\n"); 1735 printk(KERN_WARNING "kvm: emulating exchange as write\n");
1175 } 1736 }
1737#ifndef CONFIG_X86_64
1738 /* guests cmpxchg8b have to be emulated atomically */
1739 if (bytes == 8) {
1740 gpa_t gpa;
1741 struct page *page;
1742 char *addr;
1743 u64 val;
1744
1745 down_read(&current->mm->mmap_sem);
1746 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1747
1748 if (gpa == UNMAPPED_GVA ||
1749 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1750 goto emul_write;
1751
1752 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
1753 goto emul_write;
1754
1755 val = *(u64 *)new;
1756 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1757 addr = kmap_atomic(page, KM_USER0);
1758 set_64bit((u64 *)(addr + offset_in_page(gpa)), val);
1759 kunmap_atomic(addr, KM_USER0);
1760 kvm_release_page_dirty(page);
1761 emul_write:
1762 up_read(&current->mm->mmap_sem);
1763 }
1764#endif
1765
1176 return emulator_write_emulated(addr, new, bytes, vcpu); 1766 return emulator_write_emulated(addr, new, bytes, vcpu);
1177} 1767}
1178 1768
@@ -1188,11 +1778,11 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
1188 1778
1189int emulate_clts(struct kvm_vcpu *vcpu) 1779int emulate_clts(struct kvm_vcpu *vcpu)
1190{ 1780{
1191 kvm_x86_ops->set_cr0(vcpu, vcpu->cr0 & ~X86_CR0_TS); 1781 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
1192 return X86EMUL_CONTINUE; 1782 return X86EMUL_CONTINUE;
1193} 1783}
1194 1784
1195int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned long *dest) 1785int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
1196{ 1786{
1197 struct kvm_vcpu *vcpu = ctxt->vcpu; 1787 struct kvm_vcpu *vcpu = ctxt->vcpu;
1198 1788
@@ -1223,7 +1813,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
1223{ 1813{
1224 static int reported; 1814 static int reported;
1225 u8 opcodes[4]; 1815 u8 opcodes[4];
1226 unsigned long rip = vcpu->rip; 1816 unsigned long rip = vcpu->arch.rip;
1227 unsigned long rip_linear; 1817 unsigned long rip_linear;
1228 1818
1229 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); 1819 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
@@ -1241,7 +1831,6 @@ EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
1241 1831
1242struct x86_emulate_ops emulate_ops = { 1832struct x86_emulate_ops emulate_ops = {
1243 .read_std = emulator_read_std, 1833 .read_std = emulator_read_std,
1244 .write_std = emulator_write_std,
1245 .read_emulated = emulator_read_emulated, 1834 .read_emulated = emulator_read_emulated,
1246 .write_emulated = emulator_write_emulated, 1835 .write_emulated = emulator_write_emulated,
1247 .cmpxchg_emulated = emulator_cmpxchg_emulated, 1836 .cmpxchg_emulated = emulator_cmpxchg_emulated,
@@ -1250,44 +1839,74 @@ struct x86_emulate_ops emulate_ops = {
1250int emulate_instruction(struct kvm_vcpu *vcpu, 1839int emulate_instruction(struct kvm_vcpu *vcpu,
1251 struct kvm_run *run, 1840 struct kvm_run *run,
1252 unsigned long cr2, 1841 unsigned long cr2,
1253 u16 error_code) 1842 u16 error_code,
1843 int emulation_type)
1254{ 1844{
1255 struct x86_emulate_ctxt emulate_ctxt;
1256 int r; 1845 int r;
1257 int cs_db, cs_l; 1846 struct decode_cache *c;
1258 1847
1259 vcpu->mmio_fault_cr2 = cr2; 1848 vcpu->arch.mmio_fault_cr2 = cr2;
1260 kvm_x86_ops->cache_regs(vcpu); 1849 kvm_x86_ops->cache_regs(vcpu);
1261 1850
1262 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 1851 vcpu->mmio_is_write = 0;
1263 1852 vcpu->arch.pio.string = 0;
1264 emulate_ctxt.vcpu = vcpu; 1853
1265 emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); 1854 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
1266 emulate_ctxt.cr2 = cr2; 1855 int cs_db, cs_l;
1267 emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM) 1856 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
1268 ? X86EMUL_MODE_REAL : cs_l 1857
1269 ? X86EMUL_MODE_PROT64 : cs_db 1858 vcpu->arch.emulate_ctxt.vcpu = vcpu;
1270 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 1859 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
1271 1860 vcpu->arch.emulate_ctxt.mode =
1272 if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) { 1861 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
1273 emulate_ctxt.cs_base = 0; 1862 ? X86EMUL_MODE_REAL : cs_l
1274 emulate_ctxt.ds_base = 0; 1863 ? X86EMUL_MODE_PROT64 : cs_db
1275 emulate_ctxt.es_base = 0; 1864 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
1276 emulate_ctxt.ss_base = 0; 1865
1277 } else { 1866 if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
1278 emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS); 1867 vcpu->arch.emulate_ctxt.cs_base = 0;
1279 emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS); 1868 vcpu->arch.emulate_ctxt.ds_base = 0;
1280 emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES); 1869 vcpu->arch.emulate_ctxt.es_base = 0;
1281 emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS); 1870 vcpu->arch.emulate_ctxt.ss_base = 0;
1871 } else {
1872 vcpu->arch.emulate_ctxt.cs_base =
1873 get_segment_base(vcpu, VCPU_SREG_CS);
1874 vcpu->arch.emulate_ctxt.ds_base =
1875 get_segment_base(vcpu, VCPU_SREG_DS);
1876 vcpu->arch.emulate_ctxt.es_base =
1877 get_segment_base(vcpu, VCPU_SREG_ES);
1878 vcpu->arch.emulate_ctxt.ss_base =
1879 get_segment_base(vcpu, VCPU_SREG_SS);
1880 }
1881
1882 vcpu->arch.emulate_ctxt.gs_base =
1883 get_segment_base(vcpu, VCPU_SREG_GS);
1884 vcpu->arch.emulate_ctxt.fs_base =
1885 get_segment_base(vcpu, VCPU_SREG_FS);
1886
1887 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
1888
 1889	/* Reject instructions other than VMCALL/VMMCALL when trying
 1890	 * to emulate an invalid opcode */
1891 c = &vcpu->arch.emulate_ctxt.decode;
1892 if ((emulation_type & EMULTYPE_TRAP_UD) &&
1893 (!(c->twobyte && c->b == 0x01 &&
1894 (c->modrm_reg == 0 || c->modrm_reg == 3) &&
1895 c->modrm_mod == 3 && c->modrm_rm == 1)))
1896 return EMULATE_FAIL;
1897
1898 ++vcpu->stat.insn_emulation;
1899 if (r) {
1900 ++vcpu->stat.insn_emulation_fail;
1901 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1902 return EMULATE_DONE;
1903 return EMULATE_FAIL;
1904 }
1282 } 1905 }
1283 1906
1284 emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS); 1907 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
1285 emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS);
1286 1908
1287 vcpu->mmio_is_write = 0; 1909 if (vcpu->arch.pio.string)
1288 vcpu->pio.string = 0;
1289 r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
1290 if (vcpu->pio.string)
1291 return EMULATE_DO_MMIO; 1910 return EMULATE_DO_MMIO;
1292 1911
1293 if ((r || vcpu->mmio_is_write) && run) { 1912 if ((r || vcpu->mmio_is_write) && run) {
@@ -1309,7 +1928,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
1309 } 1928 }
1310 1929
1311 kvm_x86_ops->decache_regs(vcpu); 1930 kvm_x86_ops->decache_regs(vcpu);
1312 kvm_x86_ops->set_rflags(vcpu, emulate_ctxt.eflags); 1931 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
1313 1932
1314 if (vcpu->mmio_is_write) { 1933 if (vcpu->mmio_is_write) {
1315 vcpu->mmio_needed = 0; 1934 vcpu->mmio_needed = 0;
@@ -1320,439 +1939,45 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
1320} 1939}
1321EXPORT_SYMBOL_GPL(emulate_instruction); 1940EXPORT_SYMBOL_GPL(emulate_instruction);
1322 1941
1323/* 1942static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
1324 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
1325 */
1326static void kvm_vcpu_block(struct kvm_vcpu *vcpu)
1327{
1328 DECLARE_WAITQUEUE(wait, current);
1329
1330 add_wait_queue(&vcpu->wq, &wait);
1331
1332 /*
1333 * We will block until either an interrupt or a signal wakes us up
1334 */
1335 while (!kvm_cpu_has_interrupt(vcpu)
1336 && !signal_pending(current)
1337 && vcpu->mp_state != VCPU_MP_STATE_RUNNABLE
1338 && vcpu->mp_state != VCPU_MP_STATE_SIPI_RECEIVED) {
1339 set_current_state(TASK_INTERRUPTIBLE);
1340 vcpu_put(vcpu);
1341 schedule();
1342 vcpu_load(vcpu);
1343 }
1344
1345 __set_current_state(TASK_RUNNING);
1346 remove_wait_queue(&vcpu->wq, &wait);
1347}
1348
1349int kvm_emulate_halt(struct kvm_vcpu *vcpu)
1350{
1351 ++vcpu->stat.halt_exits;
1352 if (irqchip_in_kernel(vcpu->kvm)) {
1353 vcpu->mp_state = VCPU_MP_STATE_HALTED;
1354 kvm_vcpu_block(vcpu);
1355 if (vcpu->mp_state != VCPU_MP_STATE_RUNNABLE)
1356 return -EINTR;
1357 return 1;
1358 } else {
1359 vcpu->run->exit_reason = KVM_EXIT_HLT;
1360 return 0;
1361 }
1362}
1363EXPORT_SYMBOL_GPL(kvm_emulate_halt);
1364
1365int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run)
1366{
1367 unsigned long nr, a0, a1, a2, a3, a4, a5, ret;
1368
1369 kvm_x86_ops->cache_regs(vcpu);
1370 ret = -KVM_EINVAL;
1371#ifdef CONFIG_X86_64
1372 if (is_long_mode(vcpu)) {
1373 nr = vcpu->regs[VCPU_REGS_RAX];
1374 a0 = vcpu->regs[VCPU_REGS_RDI];
1375 a1 = vcpu->regs[VCPU_REGS_RSI];
1376 a2 = vcpu->regs[VCPU_REGS_RDX];
1377 a3 = vcpu->regs[VCPU_REGS_RCX];
1378 a4 = vcpu->regs[VCPU_REGS_R8];
1379 a5 = vcpu->regs[VCPU_REGS_R9];
1380 } else
1381#endif
1382 {
1383 nr = vcpu->regs[VCPU_REGS_RBX] & -1u;
1384 a0 = vcpu->regs[VCPU_REGS_RAX] & -1u;
1385 a1 = vcpu->regs[VCPU_REGS_RCX] & -1u;
1386 a2 = vcpu->regs[VCPU_REGS_RDX] & -1u;
1387 a3 = vcpu->regs[VCPU_REGS_RSI] & -1u;
1388 a4 = vcpu->regs[VCPU_REGS_RDI] & -1u;
1389 a5 = vcpu->regs[VCPU_REGS_RBP] & -1u;
1390 }
1391 switch (nr) {
1392 default:
1393 run->hypercall.nr = nr;
1394 run->hypercall.args[0] = a0;
1395 run->hypercall.args[1] = a1;
1396 run->hypercall.args[2] = a2;
1397 run->hypercall.args[3] = a3;
1398 run->hypercall.args[4] = a4;
1399 run->hypercall.args[5] = a5;
1400 run->hypercall.ret = ret;
1401 run->hypercall.longmode = is_long_mode(vcpu);
1402 kvm_x86_ops->decache_regs(vcpu);
1403 return 0;
1404 }
1405 vcpu->regs[VCPU_REGS_RAX] = ret;
1406 kvm_x86_ops->decache_regs(vcpu);
1407 return 1;
1408}
1409EXPORT_SYMBOL_GPL(kvm_hypercall);
1410
1411static u64 mk_cr_64(u64 curr_cr, u32 new_val)
1412{
1413 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
1414}
1415
1416void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1417{
1418 struct descriptor_table dt = { limit, base };
1419
1420 kvm_x86_ops->set_gdt(vcpu, &dt);
1421}
1422
1423void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1424{
1425 struct descriptor_table dt = { limit, base };
1426
1427 kvm_x86_ops->set_idt(vcpu, &dt);
1428}
1429
1430void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
1431 unsigned long *rflags)
1432{
1433 lmsw(vcpu, msw);
1434 *rflags = kvm_x86_ops->get_rflags(vcpu);
1435}
1436
1437unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
1438{
1439 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
1440 switch (cr) {
1441 case 0:
1442 return vcpu->cr0;
1443 case 2:
1444 return vcpu->cr2;
1445 case 3:
1446 return vcpu->cr3;
1447 case 4:
1448 return vcpu->cr4;
1449 default:
1450 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1451 return 0;
1452 }
1453}
1454
1455void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
1456 unsigned long *rflags)
1457{
1458 switch (cr) {
1459 case 0:
1460 set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
1461 *rflags = kvm_x86_ops->get_rflags(vcpu);
1462 break;
1463 case 2:
1464 vcpu->cr2 = val;
1465 break;
1466 case 3:
1467 set_cr3(vcpu, val);
1468 break;
1469 case 4:
1470 set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
1471 break;
1472 default:
1473 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1474 }
1475}
1476
1477/*
1478 * Register the para guest with the host:
1479 */
1480static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa)
1481{
1482 struct kvm_vcpu_para_state *para_state;
1483 hpa_t para_state_hpa, hypercall_hpa;
1484 struct page *para_state_page;
1485 unsigned char *hypercall;
1486 gpa_t hypercall_gpa;
1487
1488 printk(KERN_DEBUG "kvm: guest trying to enter paravirtual mode\n");
1489 printk(KERN_DEBUG ".... para_state_gpa: %08Lx\n", para_state_gpa);
1490
1491 /*
1492 * Needs to be page aligned:
1493 */
1494 if (para_state_gpa != PAGE_ALIGN(para_state_gpa))
1495 goto err_gp;
1496
1497 para_state_hpa = gpa_to_hpa(vcpu, para_state_gpa);
1498 printk(KERN_DEBUG ".... para_state_hpa: %08Lx\n", para_state_hpa);
1499 if (is_error_hpa(para_state_hpa))
1500 goto err_gp;
1501
1502 mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT);
1503 para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT);
1504 para_state = kmap(para_state_page);
1505
1506 printk(KERN_DEBUG ".... guest version: %d\n", para_state->guest_version);
1507 printk(KERN_DEBUG ".... size: %d\n", para_state->size);
1508
1509 para_state->host_version = KVM_PARA_API_VERSION;
1510 /*
1511 * We cannot support guests that try to register themselves
1512 * with a newer API version than the host supports:
1513 */
1514 if (para_state->guest_version > KVM_PARA_API_VERSION) {
1515 para_state->ret = -KVM_EINVAL;
1516 goto err_kunmap_skip;
1517 }
1518
1519 hypercall_gpa = para_state->hypercall_gpa;
1520 hypercall_hpa = gpa_to_hpa(vcpu, hypercall_gpa);
1521 printk(KERN_DEBUG ".... hypercall_hpa: %08Lx\n", hypercall_hpa);
1522 if (is_error_hpa(hypercall_hpa)) {
1523 para_state->ret = -KVM_EINVAL;
1524 goto err_kunmap_skip;
1525 }
1526
1527 printk(KERN_DEBUG "kvm: para guest successfully registered.\n");
1528 vcpu->para_state_page = para_state_page;
1529 vcpu->para_state_gpa = para_state_gpa;
1530 vcpu->hypercall_gpa = hypercall_gpa;
1531
1532 mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT);
1533 hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT),
1534 KM_USER1) + (hypercall_hpa & ~PAGE_MASK);
1535 kvm_x86_ops->patch_hypercall(vcpu, hypercall);
1536 kunmap_atomic(hypercall, KM_USER1);
1537
1538 para_state->ret = 0;
1539err_kunmap_skip:
1540 kunmap(para_state_page);
1541 return 0;
1542err_gp:
1543 return 1;
1544}
1545
1546int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1547{
1548 u64 data;
1549
1550 switch (msr) {
1551 case 0xc0010010: /* SYSCFG */
1552 case 0xc0010015: /* HWCR */
1553 case MSR_IA32_PLATFORM_ID:
1554 case MSR_IA32_P5_MC_ADDR:
1555 case MSR_IA32_P5_MC_TYPE:
1556 case MSR_IA32_MC0_CTL:
1557 case MSR_IA32_MCG_STATUS:
1558 case MSR_IA32_MCG_CAP:
1559 case MSR_IA32_MC0_MISC:
1560 case MSR_IA32_MC0_MISC+4:
1561 case MSR_IA32_MC0_MISC+8:
1562 case MSR_IA32_MC0_MISC+12:
1563 case MSR_IA32_MC0_MISC+16:
1564 case MSR_IA32_UCODE_REV:
1565 case MSR_IA32_PERF_STATUS:
1566 case MSR_IA32_EBL_CR_POWERON:
1567 /* MTRR registers */
1568 case 0xfe:
1569 case 0x200 ... 0x2ff:
1570 data = 0;
1571 break;
1572 case 0xcd: /* fsb frequency */
1573 data = 3;
1574 break;
1575 case MSR_IA32_APICBASE:
1576 data = kvm_get_apic_base(vcpu);
1577 break;
1578 case MSR_IA32_MISC_ENABLE:
1579 data = vcpu->ia32_misc_enable_msr;
1580 break;
1581#ifdef CONFIG_X86_64
1582 case MSR_EFER:
1583 data = vcpu->shadow_efer;
1584 break;
1585#endif
1586 default:
1587 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
1588 return 1;
1589 }
1590 *pdata = data;
1591 return 0;
1592}
1593EXPORT_SYMBOL_GPL(kvm_get_msr_common);
1594
1595/*
1596 * Reads an msr value (of 'msr_index') into 'pdata'.
1597 * Returns 0 on success, non-0 otherwise.
1598 * Assumes vcpu_load() was already called.
1599 */
1600int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1601{
1602 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
1603}
1604
1605#ifdef CONFIG_X86_64
1606
1607static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
1608{
1609 if (efer & EFER_RESERVED_BITS) {
1610 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
1611 efer);
1612 inject_gp(vcpu);
1613 return;
1614 }
1615
1616 if (is_paging(vcpu)
1617 && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
1618 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
1619 inject_gp(vcpu);
1620 return;
1621 }
1622
1623 kvm_x86_ops->set_efer(vcpu, efer);
1624
1625 efer &= ~EFER_LMA;
1626 efer |= vcpu->shadow_efer & EFER_LMA;
1627
1628 vcpu->shadow_efer = efer;
1629}
1630
1631#endif
1632
1633int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1634{
1635 switch (msr) {
1636#ifdef CONFIG_X86_64
1637 case MSR_EFER:
1638 set_efer(vcpu, data);
1639 break;
1640#endif
1641 case MSR_IA32_MC0_STATUS:
1642 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
1643 __FUNCTION__, data);
1644 break;
1645 case MSR_IA32_MCG_STATUS:
1646 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
1647 __FUNCTION__, data);
1648 break;
1649 case MSR_IA32_UCODE_REV:
1650 case MSR_IA32_UCODE_WRITE:
1651 case 0x200 ... 0x2ff: /* MTRRs */
1652 break;
1653 case MSR_IA32_APICBASE:
1654 kvm_set_apic_base(vcpu, data);
1655 break;
1656 case MSR_IA32_MISC_ENABLE:
1657 vcpu->ia32_misc_enable_msr = data;
1658 break;
1659 /*
1660 * This is the 'probe whether the host is KVM' logic:
1661 */
1662 case MSR_KVM_API_MAGIC:
1663 return vcpu_register_para(vcpu, data);
1664
1665 default:
1666 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr);
1667 return 1;
1668 }
1669 return 0;
1670}
1671EXPORT_SYMBOL_GPL(kvm_set_msr_common);
1672
1673/*
1674 * Writes msr value into the appropriate "register".
1675 * Returns 0 on success, non-0 otherwise.
1676 * Assumes vcpu_load() was already called.
1677 */
1678int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1679{
1680 return kvm_x86_ops->set_msr(vcpu, msr_index, data);
1681}
1682
1683void kvm_resched(struct kvm_vcpu *vcpu)
1684{
1685 if (!need_resched())
1686 return;
1687 cond_resched();
1688}
1689EXPORT_SYMBOL_GPL(kvm_resched);
1690
1691void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
1692{ 1943{
1693 int i; 1944 int i;
1694 u32 function;
1695 struct kvm_cpuid_entry *e, *best;
1696 1945
1697 kvm_x86_ops->cache_regs(vcpu); 1946 for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i)
1698 function = vcpu->regs[VCPU_REGS_RAX]; 1947 if (vcpu->arch.pio.guest_pages[i]) {
1699 vcpu->regs[VCPU_REGS_RAX] = 0; 1948 kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]);
1700 vcpu->regs[VCPU_REGS_RBX] = 0; 1949 vcpu->arch.pio.guest_pages[i] = NULL;
1701 vcpu->regs[VCPU_REGS_RCX] = 0;
1702 vcpu->regs[VCPU_REGS_RDX] = 0;
1703 best = NULL;
1704 for (i = 0; i < vcpu->cpuid_nent; ++i) {
1705 e = &vcpu->cpuid_entries[i];
1706 if (e->function == function) {
1707 best = e;
1708 break;
1709 } 1950 }
1710 /*
1711 * Both basic or both extended?
1712 */
1713 if (((e->function ^ function) & 0x80000000) == 0)
1714 if (!best || e->function > best->function)
1715 best = e;
1716 }
1717 if (best) {
1718 vcpu->regs[VCPU_REGS_RAX] = best->eax;
1719 vcpu->regs[VCPU_REGS_RBX] = best->ebx;
1720 vcpu->regs[VCPU_REGS_RCX] = best->ecx;
1721 vcpu->regs[VCPU_REGS_RDX] = best->edx;
1722 }
1723 kvm_x86_ops->decache_regs(vcpu);
1724 kvm_x86_ops->skip_emulated_instruction(vcpu);
1725} 1951}
1726EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
1727 1952
1728static int pio_copy_data(struct kvm_vcpu *vcpu) 1953static int pio_copy_data(struct kvm_vcpu *vcpu)
1729{ 1954{
1730 void *p = vcpu->pio_data; 1955 void *p = vcpu->arch.pio_data;
1731 void *q; 1956 void *q;
1732 unsigned bytes; 1957 unsigned bytes;
1733 int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1; 1958 int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1;
1734 1959
1735 q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE, 1960 q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
1736 PAGE_KERNEL); 1961 PAGE_KERNEL);
1737 if (!q) { 1962 if (!q) {
1738 free_pio_guest_pages(vcpu); 1963 free_pio_guest_pages(vcpu);
1739 return -ENOMEM; 1964 return -ENOMEM;
1740 } 1965 }
1741 q += vcpu->pio.guest_page_offset; 1966 q += vcpu->arch.pio.guest_page_offset;
1742 bytes = vcpu->pio.size * vcpu->pio.cur_count; 1967 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
1743 if (vcpu->pio.in) 1968 if (vcpu->arch.pio.in)
1744 memcpy(q, p, bytes); 1969 memcpy(q, p, bytes);
1745 else 1970 else
1746 memcpy(p, q, bytes); 1971 memcpy(p, q, bytes);
1747 q -= vcpu->pio.guest_page_offset; 1972 q -= vcpu->arch.pio.guest_page_offset;
1748 vunmap(q); 1973 vunmap(q);
1749 free_pio_guest_pages(vcpu); 1974 free_pio_guest_pages(vcpu);
1750 return 0; 1975 return 0;
1751} 1976}
1752 1977
1753static int complete_pio(struct kvm_vcpu *vcpu) 1978int complete_pio(struct kvm_vcpu *vcpu)
1754{ 1979{
1755 struct kvm_pio_request *io = &vcpu->pio; 1980 struct kvm_pio_request *io = &vcpu->arch.pio;
1756 long delta; 1981 long delta;
1757 int r; 1982 int r;
1758 1983
@@ -1760,7 +1985,7 @@ static int complete_pio(struct kvm_vcpu *vcpu)
1760 1985
1761 if (!io->string) { 1986 if (!io->string) {
1762 if (io->in) 1987 if (io->in)
1763 memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data, 1988 memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data,
1764 io->size); 1989 io->size);
1765 } else { 1990 } else {
1766 if (io->in) { 1991 if (io->in) {
@@ -1778,15 +2003,15 @@ static int complete_pio(struct kvm_vcpu *vcpu)
1778 * The size of the register should really depend on 2003 * The size of the register should really depend on
1779 * current address size. 2004 * current address size.
1780 */ 2005 */
1781 vcpu->regs[VCPU_REGS_RCX] -= delta; 2006 vcpu->arch.regs[VCPU_REGS_RCX] -= delta;
1782 } 2007 }
1783 if (io->down) 2008 if (io->down)
1784 delta = -delta; 2009 delta = -delta;
1785 delta *= io->size; 2010 delta *= io->size;
1786 if (io->in) 2011 if (io->in)
1787 vcpu->regs[VCPU_REGS_RDI] += delta; 2012 vcpu->arch.regs[VCPU_REGS_RDI] += delta;
1788 else 2013 else
1789 vcpu->regs[VCPU_REGS_RSI] += delta; 2014 vcpu->arch.regs[VCPU_REGS_RSI] += delta;
1790 } 2015 }
1791 2016
1792 kvm_x86_ops->decache_regs(vcpu); 2017 kvm_x86_ops->decache_regs(vcpu);
@@ -1804,13 +2029,13 @@ static void kernel_pio(struct kvm_io_device *pio_dev,
1804 /* TODO: String I/O for in kernel device */ 2029 /* TODO: String I/O for in kernel device */
1805 2030
1806 mutex_lock(&vcpu->kvm->lock); 2031 mutex_lock(&vcpu->kvm->lock);
1807 if (vcpu->pio.in) 2032 if (vcpu->arch.pio.in)
1808 kvm_iodevice_read(pio_dev, vcpu->pio.port, 2033 kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
1809 vcpu->pio.size, 2034 vcpu->arch.pio.size,
1810 pd); 2035 pd);
1811 else 2036 else
1812 kvm_iodevice_write(pio_dev, vcpu->pio.port, 2037 kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
1813 vcpu->pio.size, 2038 vcpu->arch.pio.size,
1814 pd); 2039 pd);
1815 mutex_unlock(&vcpu->kvm->lock); 2040 mutex_unlock(&vcpu->kvm->lock);
1816} 2041}
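Ports with no in-kernel device fall through to userspace via KVM_EXIT_IO, as set up in kvm_emulate_pio() and kvm_emulate_pio_string() below; the data sits in the vcpu run page at io.data_offset (KVM_PIO_PAGE_OFFSET * PAGE_SIZE). A hedged sketch of the userspace half; port_in()/port_out() are hypothetical device-model helpers, not part of the patch:

#include <linux/kvm.h>

/* hypothetical device-model entry points, assumed by this sketch */
void port_out(unsigned short port, const void *data, unsigned int size);
void port_in(unsigned short port, void *data, unsigned int size);

static void handle_io_exit(struct kvm_run *run)
{
	unsigned char *data = (unsigned char *)run + run->io.data_offset;
	unsigned int i;

	for (i = 0; i < run->io.count; i++, data += run->io.size) {
		if (run->io.direction == KVM_EXIT_IO_OUT)
			port_out(run->io.port, data, run->io.size);
		else
			port_in(run->io.port, data, run->io.size);
	}
}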
@@ -1818,8 +2043,8 @@ static void kernel_pio(struct kvm_io_device *pio_dev,
1818static void pio_string_write(struct kvm_io_device *pio_dev, 2043static void pio_string_write(struct kvm_io_device *pio_dev,
1819 struct kvm_vcpu *vcpu) 2044 struct kvm_vcpu *vcpu)
1820{ 2045{
1821 struct kvm_pio_request *io = &vcpu->pio; 2046 struct kvm_pio_request *io = &vcpu->arch.pio;
1822 void *pd = vcpu->pio_data; 2047 void *pd = vcpu->arch.pio_data;
1823 int i; 2048 int i;
1824 2049
1825 mutex_lock(&vcpu->kvm->lock); 2050 mutex_lock(&vcpu->kvm->lock);
@@ -1832,32 +2057,38 @@ static void pio_string_write(struct kvm_io_device *pio_dev,
1832 mutex_unlock(&vcpu->kvm->lock); 2057 mutex_unlock(&vcpu->kvm->lock);
1833} 2058}
1834 2059
1835int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 2060static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
2061 gpa_t addr)
2062{
2063 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
2064}
2065
2066int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1836 int size, unsigned port) 2067 int size, unsigned port)
1837{ 2068{
1838 struct kvm_io_device *pio_dev; 2069 struct kvm_io_device *pio_dev;
1839 2070
1840 vcpu->run->exit_reason = KVM_EXIT_IO; 2071 vcpu->run->exit_reason = KVM_EXIT_IO;
1841 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 2072 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
1842 vcpu->run->io.size = vcpu->pio.size = size; 2073 vcpu->run->io.size = vcpu->arch.pio.size = size;
1843 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 2074 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
1844 vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = 1; 2075 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
1845 vcpu->run->io.port = vcpu->pio.port = port; 2076 vcpu->run->io.port = vcpu->arch.pio.port = port;
1846 vcpu->pio.in = in; 2077 vcpu->arch.pio.in = in;
1847 vcpu->pio.string = 0; 2078 vcpu->arch.pio.string = 0;
1848 vcpu->pio.down = 0; 2079 vcpu->arch.pio.down = 0;
1849 vcpu->pio.guest_page_offset = 0; 2080 vcpu->arch.pio.guest_page_offset = 0;
1850 vcpu->pio.rep = 0; 2081 vcpu->arch.pio.rep = 0;
1851 2082
1852 kvm_x86_ops->cache_regs(vcpu); 2083 kvm_x86_ops->cache_regs(vcpu);
1853 memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4); 2084 memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);
1854 kvm_x86_ops->decache_regs(vcpu); 2085 kvm_x86_ops->decache_regs(vcpu);
1855 2086
1856 kvm_x86_ops->skip_emulated_instruction(vcpu); 2087 kvm_x86_ops->skip_emulated_instruction(vcpu);
1857 2088
1858 pio_dev = vcpu_find_pio_dev(vcpu, port); 2089 pio_dev = vcpu_find_pio_dev(vcpu, port);
1859 if (pio_dev) { 2090 if (pio_dev) {
1860 kernel_pio(pio_dev, vcpu, vcpu->pio_data); 2091 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
1861 complete_pio(vcpu); 2092 complete_pio(vcpu);
1862 return 1; 2093 return 1;
1863 } 2094 }
@@ -1877,15 +2108,15 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1877 2108
1878 vcpu->run->exit_reason = KVM_EXIT_IO; 2109 vcpu->run->exit_reason = KVM_EXIT_IO;
1879 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 2110 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
1880 vcpu->run->io.size = vcpu->pio.size = size; 2111 vcpu->run->io.size = vcpu->arch.pio.size = size;
1881 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 2112 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
1882 vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = count; 2113 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
1883 vcpu->run->io.port = vcpu->pio.port = port; 2114 vcpu->run->io.port = vcpu->arch.pio.port = port;
1884 vcpu->pio.in = in; 2115 vcpu->arch.pio.in = in;
1885 vcpu->pio.string = 1; 2116 vcpu->arch.pio.string = 1;
1886 vcpu->pio.down = down; 2117 vcpu->arch.pio.down = down;
1887 vcpu->pio.guest_page_offset = offset_in_page(address); 2118 vcpu->arch.pio.guest_page_offset = offset_in_page(address);
1888 vcpu->pio.rep = rep; 2119 vcpu->arch.pio.rep = rep;
1889 2120
1890 if (!count) { 2121 if (!count) {
1891 kvm_x86_ops->skip_emulated_instruction(vcpu); 2122 kvm_x86_ops->skip_emulated_instruction(vcpu);
@@ -1911,37 +2142,35 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1911 * String I/O in reverse. Yuck. Kill the guest, fix later. 2142 * String I/O in reverse. Yuck. Kill the guest, fix later.
1912 */ 2143 */
1913 pr_unimpl(vcpu, "guest string pio down\n"); 2144 pr_unimpl(vcpu, "guest string pio down\n");
1914 inject_gp(vcpu); 2145 kvm_inject_gp(vcpu, 0);
1915 return 1; 2146 return 1;
1916 } 2147 }
1917 vcpu->run->io.count = now; 2148 vcpu->run->io.count = now;
1918 vcpu->pio.cur_count = now; 2149 vcpu->arch.pio.cur_count = now;
1919 2150
1920 if (vcpu->pio.cur_count == vcpu->pio.count) 2151 if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
1921 kvm_x86_ops->skip_emulated_instruction(vcpu); 2152 kvm_x86_ops->skip_emulated_instruction(vcpu);
1922 2153
1923 for (i = 0; i < nr_pages; ++i) { 2154 for (i = 0; i < nr_pages; ++i) {
1924 mutex_lock(&vcpu->kvm->lock); 2155 down_read(&current->mm->mmap_sem);
1925 page = gva_to_page(vcpu, address + i * PAGE_SIZE); 2156 page = gva_to_page(vcpu, address + i * PAGE_SIZE);
1926 if (page) 2157 vcpu->arch.pio.guest_pages[i] = page;
1927 get_page(page); 2158 up_read(&current->mm->mmap_sem);
1928 vcpu->pio.guest_pages[i] = page;
1929 mutex_unlock(&vcpu->kvm->lock);
1930 if (!page) { 2159 if (!page) {
1931 inject_gp(vcpu); 2160 kvm_inject_gp(vcpu, 0);
1932 free_pio_guest_pages(vcpu); 2161 free_pio_guest_pages(vcpu);
1933 return 1; 2162 return 1;
1934 } 2163 }
1935 } 2164 }
1936 2165
1937 pio_dev = vcpu_find_pio_dev(vcpu, port); 2166 pio_dev = vcpu_find_pio_dev(vcpu, port);
1938 if (!vcpu->pio.in) { 2167 if (!vcpu->arch.pio.in) {
1939 /* string PIO write */ 2168 /* string PIO write */
1940 ret = pio_copy_data(vcpu); 2169 ret = pio_copy_data(vcpu);
1941 if (ret >= 0 && pio_dev) { 2170 if (ret >= 0 && pio_dev) {
1942 pio_string_write(pio_dev, vcpu); 2171 pio_string_write(pio_dev, vcpu);
1943 complete_pio(vcpu); 2172 complete_pio(vcpu);
1944 if (vcpu->pio.count == 0) 2173 if (vcpu->arch.pio.count == 0)
1945 ret = 1; 2174 ret = 1;
1946 } 2175 }
1947 } else if (pio_dev) 2176 } else if (pio_dev)
@@ -1953,6 +2182,263 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1953} 2182}
1954EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); 2183EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
1955 2184
2185int kvm_arch_init(void *opaque)
2186{
2187 int r;
2188 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
2189
2190 if (kvm_x86_ops) {
2191 printk(KERN_ERR "kvm: already loaded the other module\n");
2192 r = -EEXIST;
2193 goto out;
2194 }
2195
2196 if (!ops->cpu_has_kvm_support()) {
2197 printk(KERN_ERR "kvm: no hardware support\n");
2198 r = -EOPNOTSUPP;
2199 goto out;
2200 }
2201 if (ops->disabled_by_bios()) {
2202 printk(KERN_ERR "kvm: disabled by bios\n");
2203 r = -EOPNOTSUPP;
2204 goto out;
2205 }
2206
2207 r = kvm_mmu_module_init();
2208 if (r)
2209 goto out;
2210
2211 kvm_init_msr_list();
2212
2213 kvm_x86_ops = ops;
2214 kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
2215 return 0;
2216
2217out:
2218 return r;
2219}
2220
2221void kvm_arch_exit(void)
2222{
2223 kvm_x86_ops = NULL;
2224 kvm_mmu_module_exit();
2225}
2226
2227int kvm_emulate_halt(struct kvm_vcpu *vcpu)
2228{
2229 ++vcpu->stat.halt_exits;
2230 if (irqchip_in_kernel(vcpu->kvm)) {
2231 vcpu->arch.mp_state = VCPU_MP_STATE_HALTED;
2232 kvm_vcpu_block(vcpu);
2233 if (vcpu->arch.mp_state != VCPU_MP_STATE_RUNNABLE)
2234 return -EINTR;
2235 return 1;
2236 } else {
2237 vcpu->run->exit_reason = KVM_EXIT_HLT;
2238 return 0;
2239 }
2240}
2241EXPORT_SYMBOL_GPL(kvm_emulate_halt);
2242
2243int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2244{
2245 unsigned long nr, a0, a1, a2, a3, ret;
2246
2247 kvm_x86_ops->cache_regs(vcpu);
2248
2249 nr = vcpu->arch.regs[VCPU_REGS_RAX];
2250 a0 = vcpu->arch.regs[VCPU_REGS_RBX];
2251 a1 = vcpu->arch.regs[VCPU_REGS_RCX];
2252 a2 = vcpu->arch.regs[VCPU_REGS_RDX];
2253 a3 = vcpu->arch.regs[VCPU_REGS_RSI];
2254
2255 if (!is_long_mode(vcpu)) {
2256 nr &= 0xFFFFFFFF;
2257 a0 &= 0xFFFFFFFF;
2258 a1 &= 0xFFFFFFFF;
2259 a2 &= 0xFFFFFFFF;
2260 a3 &= 0xFFFFFFFF;
2261 }
2262
2263 switch (nr) {
2264 case KVM_HC_VAPIC_POLL_IRQ:
2265 ret = 0;
2266 break;
2267 default:
2268 ret = -KVM_ENOSYS;
2269 break;
2270 }
2271 vcpu->arch.regs[VCPU_REGS_RAX] = ret;
2272 kvm_x86_ops->decache_regs(vcpu);
2273 return 0;
2274}
2275EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
2276
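kvm_emulate_hypercall() above fixes the hypercall ABI: the number arrives in RAX/EAX, up to four arguments in RBX, RCX, RDX and RSI, and the return value goes back in RAX/EAX. A hedged guest-side sketch of a zero-argument call such as KVM_HC_VAPIC_POLL_IRQ; the vmcall below is what kvm_x86_ops->patch_hypercall() rewrites to the instruction native to the host CPU:

/* Guest-side sketch, not part of the patch. */
static inline long kvm_hypercall0_sketch(unsigned int nr)
{
	long ret;

	asm volatile("vmcall"
		     : "=a"(ret)
		     : "a"(nr)
		     : "memory");
	return ret;
}

/* e.g.: kvm_hypercall0_sketch(KVM_HC_VAPIC_POLL_IRQ); */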
2277int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
2278{
2279 char instruction[3];
2280 int ret = 0;
2281
2282
2283 /*
 2284	 * Blow out the MMU so that no other VCPU has an active mapping; this
 2285	 * ensures that the updated hypercall appears atomically across all
 2286	 * VCPUs.
2287 */
2288 kvm_mmu_zap_all(vcpu->kvm);
2289
2290 kvm_x86_ops->cache_regs(vcpu);
2291 kvm_x86_ops->patch_hypercall(vcpu, instruction);
2292 if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu)
2293 != X86EMUL_CONTINUE)
2294 ret = -EFAULT;
2295
2296 return ret;
2297}
2298
2299static u64 mk_cr_64(u64 curr_cr, u32 new_val)
2300{
2301 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
2302}
2303
2304void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2305{
2306 struct descriptor_table dt = { limit, base };
2307
2308 kvm_x86_ops->set_gdt(vcpu, &dt);
2309}
2310
2311void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2312{
2313 struct descriptor_table dt = { limit, base };
2314
2315 kvm_x86_ops->set_idt(vcpu, &dt);
2316}
2317
2318void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
2319 unsigned long *rflags)
2320{
2321 lmsw(vcpu, msw);
2322 *rflags = kvm_x86_ops->get_rflags(vcpu);
2323}
2324
2325unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
2326{
2327 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2328 switch (cr) {
2329 case 0:
2330 return vcpu->arch.cr0;
2331 case 2:
2332 return vcpu->arch.cr2;
2333 case 3:
2334 return vcpu->arch.cr3;
2335 case 4:
2336 return vcpu->arch.cr4;
2337 case 8:
2338 return get_cr8(vcpu);
2339 default:
2340 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2341 return 0;
2342 }
2343}
2344
2345void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
2346 unsigned long *rflags)
2347{
2348 switch (cr) {
2349 case 0:
2350 set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
2351 *rflags = kvm_x86_ops->get_rflags(vcpu);
2352 break;
2353 case 2:
2354 vcpu->arch.cr2 = val;
2355 break;
2356 case 3:
2357 set_cr3(vcpu, val);
2358 break;
2359 case 4:
2360 set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
2361 break;
2362 case 8:
2363 set_cr8(vcpu, val & 0xfUL);
2364 break;
2365 default:
2366 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2367 }
2368}
2369
2370static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
2371{
2372 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
2373 int j, nent = vcpu->arch.cpuid_nent;
2374
2375 e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
2376 /* when no next entry is found, the current entry[i] is reselected */
2377 for (j = i + 1; j == i; j = (j + 1) % nent) {
2378 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
2379 if (ej->function == e->function) {
2380 ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
2381 return j;
2382 }
2383 }
2384 return 0; /* silence gcc, even though control never reaches here */
2385}
2386
2387/* find an entry with matching function, matching index (if needed), and that
2388 * should be read next (if it's stateful) */
2389static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
2390 u32 function, u32 index)
2391{
2392 if (e->function != function)
2393 return 0;
2394 if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
2395 return 0;
2396 if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
2397 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
2398 return 0;
2399 return 1;
2400}
2401
2402void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
2403{
2404 int i;
2405 u32 function, index;
2406 struct kvm_cpuid_entry2 *e, *best;
2407
2408 kvm_x86_ops->cache_regs(vcpu);
2409 function = vcpu->arch.regs[VCPU_REGS_RAX];
2410 index = vcpu->arch.regs[VCPU_REGS_RCX];
2411 vcpu->arch.regs[VCPU_REGS_RAX] = 0;
2412 vcpu->arch.regs[VCPU_REGS_RBX] = 0;
2413 vcpu->arch.regs[VCPU_REGS_RCX] = 0;
2414 vcpu->arch.regs[VCPU_REGS_RDX] = 0;
2415 best = NULL;
2416 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
2417 e = &vcpu->arch.cpuid_entries[i];
2418 if (is_matching_cpuid_entry(e, function, index)) {
2419 if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
2420 move_to_next_stateful_cpuid_entry(vcpu, i);
2421 best = e;
2422 break;
2423 }
2424 /*
2425 * Both basic or both extended?
2426 */
2427 if (((e->function ^ function) & 0x80000000) == 0)
2428 if (!best || e->function > best->function)
2429 best = e;
2430 }
2431 if (best) {
2432 vcpu->arch.regs[VCPU_REGS_RAX] = best->eax;
2433 vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx;
2434 vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx;
2435 vcpu->arch.regs[VCPU_REGS_RDX] = best->edx;
2436 }
2437 kvm_x86_ops->decache_regs(vcpu);
2438 kvm_x86_ops->skip_emulated_instruction(vcpu);
2439}
2440EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
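
[Illustrative aside, not part of the patch] When kvm_emulate_cpuid() finds no exact match it falls back to the highest registered entry in the same range as the requested function, where "same range" means both basic (0x0000xxxx) or both extended (0x8000xxxx); this loosely mirrors how real CPUs answer out-of-range leaves. The range test is the XOR of bit 31 used in the loop above:

#include <stdio.h>

/* Two function numbers are in the same CPUID range (basic vs. extended)
 * iff XOR-ing them leaves bit 31 clear. */
static int same_range(unsigned int a, unsigned int b)
{
	return ((a ^ b) & 0x80000000) == 0;
}

int main(void)
{
	printf("%d %d\n",
	       same_range(0x4, 0x7),		/* 1: both basic */
	       same_range(0x4, 0x80000008));	/* 0: basic vs. extended */
	return 0;
}
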
2441
1956/* 2442/*
1957 * Check if userspace requested an interrupt window, and that the 2443 * Check if userspace requested an interrupt window, and that the
1958 * interrupt window is open. 2444 * interrupt window is open.
@@ -1962,9 +2448,9 @@ EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
1962static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, 2448static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
1963 struct kvm_run *kvm_run) 2449 struct kvm_run *kvm_run)
1964{ 2450{
1965 return (!vcpu->irq_summary && 2451 return (!vcpu->arch.irq_summary &&
1966 kvm_run->request_interrupt_window && 2452 kvm_run->request_interrupt_window &&
1967 vcpu->interrupt_window_open && 2453 vcpu->arch.interrupt_window_open &&
1968 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF)); 2454 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
1969} 2455}
1970 2456
@@ -1978,22 +2464,51 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu,
1978 kvm_run->ready_for_interrupt_injection = 1; 2464 kvm_run->ready_for_interrupt_injection = 1;
1979 else 2465 else
1980 kvm_run->ready_for_interrupt_injection = 2466 kvm_run->ready_for_interrupt_injection =
1981 (vcpu->interrupt_window_open && 2467 (vcpu->arch.interrupt_window_open &&
1982 vcpu->irq_summary == 0); 2468 vcpu->arch.irq_summary == 0);
2469}
2470
2471static void vapic_enter(struct kvm_vcpu *vcpu)
2472{
2473 struct kvm_lapic *apic = vcpu->arch.apic;
2474 struct page *page;
2475
2476 if (!apic || !apic->vapic_addr)
2477 return;
2478
2479 down_read(&current->mm->mmap_sem);
2480 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
2481 vcpu->arch.apic->vapic_page = page;
2482 up_read(&current->mm->mmap_sem);
2483}
2484
2485static void vapic_exit(struct kvm_vcpu *vcpu)
2486{
2487 struct kvm_lapic *apic = vcpu->arch.apic;
2488
2489 if (!apic || !apic->vapic_addr)
2490 return;
2491
2492 kvm_release_page_dirty(apic->vapic_page);
2493 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
1983} 2494}
1984 2495
1985static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2496static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1986{ 2497{
1987 int r; 2498 int r;
1988 2499
1989 if (unlikely(vcpu->mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) { 2500 if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
1990 printk("vcpu %d received sipi with vector # %x\n", 2501 pr_debug("vcpu %d received sipi with vector # %x\n",
1991 vcpu->vcpu_id, vcpu->sipi_vector); 2502 vcpu->vcpu_id, vcpu->arch.sipi_vector);
1992 kvm_lapic_reset(vcpu); 2503 kvm_lapic_reset(vcpu);
1993 kvm_x86_ops->vcpu_reset(vcpu); 2504 r = kvm_x86_ops->vcpu_reset(vcpu);
1994 vcpu->mp_state = VCPU_MP_STATE_RUNNABLE; 2505 if (r)
2506 return r;
2507 vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
1995 } 2508 }
1996 2509
2510 vapic_enter(vcpu);
2511
1997preempted: 2512preempted:
1998 if (vcpu->guest_debug.enabled) 2513 if (vcpu->guest_debug.enabled)
1999 kvm_x86_ops->guest_debug_pre(vcpu); 2514 kvm_x86_ops->guest_debug_pre(vcpu);
@@ -2003,6 +2518,19 @@ again:
2003 if (unlikely(r)) 2518 if (unlikely(r))
2004 goto out; 2519 goto out;
2005 2520
2521 if (vcpu->requests) {
2522 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
2523 __kvm_migrate_apic_timer(vcpu);
2524 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
2525 &vcpu->requests)) {
2526 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
2527 r = 0;
2528 goto out;
2529 }
2530 }
2531
2532 kvm_inject_pending_timer_irqs(vcpu);
2533
2006 preempt_disable(); 2534 preempt_disable();
2007 2535
2008 kvm_x86_ops->prepare_guest_switch(vcpu); 2536 kvm_x86_ops->prepare_guest_switch(vcpu);
@@ -2010,6 +2538,13 @@ again:
2010 2538
2011 local_irq_disable(); 2539 local_irq_disable();
2012 2540
2541 if (need_resched()) {
2542 local_irq_enable();
2543 preempt_enable();
2544 r = 1;
2545 goto out;
2546 }
2547
2013 if (signal_pending(current)) { 2548 if (signal_pending(current)) {
2014 local_irq_enable(); 2549 local_irq_enable();
2015 preempt_enable(); 2550 preempt_enable();
@@ -2019,16 +2554,20 @@ again:
2019 goto out; 2554 goto out;
2020 } 2555 }
2021 2556
2022 if (irqchip_in_kernel(vcpu->kvm)) 2557 if (vcpu->arch.exception.pending)
2558 __queue_exception(vcpu);
2559 else if (irqchip_in_kernel(vcpu->kvm))
2023 kvm_x86_ops->inject_pending_irq(vcpu); 2560 kvm_x86_ops->inject_pending_irq(vcpu);
2024 else if (!vcpu->mmio_read_completed) 2561 else
2025 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); 2562 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
2026 2563
2564 kvm_lapic_sync_to_vapic(vcpu);
2565
2027 vcpu->guest_mode = 1; 2566 vcpu->guest_mode = 1;
2028 kvm_guest_enter(); 2567 kvm_guest_enter();
2029 2568
2030 if (vcpu->requests) 2569 if (vcpu->requests)
2031 if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests)) 2570 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
2032 kvm_x86_ops->tlb_flush(vcpu); 2571 kvm_x86_ops->tlb_flush(vcpu);
2033 2572
2034 kvm_x86_ops->run(vcpu, kvm_run); 2573 kvm_x86_ops->run(vcpu, kvm_run);
@@ -2055,9 +2594,14 @@ again:
2055 */ 2594 */
2056 if (unlikely(prof_on == KVM_PROFILING)) { 2595 if (unlikely(prof_on == KVM_PROFILING)) {
2057 kvm_x86_ops->cache_regs(vcpu); 2596 kvm_x86_ops->cache_regs(vcpu);
2058 profile_hit(KVM_PROFILING, (void *)vcpu->rip); 2597 profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip);
2059 } 2598 }
2060 2599
2600 if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
2601 vcpu->arch.exception.pending = false;
2602
2603 kvm_lapic_sync_from_vapic(vcpu);
2604
2061 r = kvm_x86_ops->handle_exit(kvm_run, vcpu); 2605 r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
2062 2606
2063 if (r > 0) { 2607 if (r > 0) {
@@ -2067,10 +2611,8 @@ again:
2067 ++vcpu->stat.request_irq_exits; 2611 ++vcpu->stat.request_irq_exits;
2068 goto out; 2612 goto out;
2069 } 2613 }
2070 if (!need_resched()) { 2614 if (!need_resched())
2071 ++vcpu->stat.light_exits;
2072 goto again; 2615 goto again;
2073 }
2074 } 2616 }
2075 2617
2076out: 2618out:
@@ -2081,18 +2623,19 @@ out:
2081 2623
2082 post_kvm_run_save(vcpu, kvm_run); 2624 post_kvm_run_save(vcpu, kvm_run);
2083 2625
2626 vapic_exit(vcpu);
2627
2084 return r; 2628 return r;
2085} 2629}
2086 2630
2087 2631int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2088static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2089{ 2632{
2090 int r; 2633 int r;
2091 sigset_t sigsaved; 2634 sigset_t sigsaved;
2092 2635
2093 vcpu_load(vcpu); 2636 vcpu_load(vcpu);
2094 2637
2095 if (unlikely(vcpu->mp_state == VCPU_MP_STATE_UNINITIALIZED)) { 2638 if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
2096 kvm_vcpu_block(vcpu); 2639 kvm_vcpu_block(vcpu);
2097 vcpu_put(vcpu); 2640 vcpu_put(vcpu);
2098 return -EAGAIN; 2641 return -EAGAIN;
@@ -2105,18 +2648,19 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2105 if (!irqchip_in_kernel(vcpu->kvm)) 2648 if (!irqchip_in_kernel(vcpu->kvm))
2106 set_cr8(vcpu, kvm_run->cr8); 2649 set_cr8(vcpu, kvm_run->cr8);
2107 2650
2108 if (vcpu->pio.cur_count) { 2651 if (vcpu->arch.pio.cur_count) {
2109 r = complete_pio(vcpu); 2652 r = complete_pio(vcpu);
2110 if (r) 2653 if (r)
2111 goto out; 2654 goto out;
2112 } 2655 }
2113 2656#if CONFIG_HAS_IOMEM
2114 if (vcpu->mmio_needed) { 2657 if (vcpu->mmio_needed) {
2115 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 2658 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
2116 vcpu->mmio_read_completed = 1; 2659 vcpu->mmio_read_completed = 1;
2117 vcpu->mmio_needed = 0; 2660 vcpu->mmio_needed = 0;
2118 r = emulate_instruction(vcpu, kvm_run, 2661 r = emulate_instruction(vcpu, kvm_run,
2119 vcpu->mmio_fault_cr2, 0); 2662 vcpu->arch.mmio_fault_cr2, 0,
2663 EMULTYPE_NO_DECODE);
2120 if (r == EMULATE_DO_MMIO) { 2664 if (r == EMULATE_DO_MMIO) {
2121 /* 2665 /*
2122 * Read-modify-write. Back to userspace. 2666 * Read-modify-write. Back to userspace.
@@ -2125,10 +2669,10 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2125 goto out; 2669 goto out;
2126 } 2670 }
2127 } 2671 }
2128 2672#endif
2129 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { 2673 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
2130 kvm_x86_ops->cache_regs(vcpu); 2674 kvm_x86_ops->cache_regs(vcpu);
2131 vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; 2675 vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
2132 kvm_x86_ops->decache_regs(vcpu); 2676 kvm_x86_ops->decache_regs(vcpu);
2133 } 2677 }
2134 2678
@@ -2142,33 +2686,32 @@ out:
2142 return r; 2686 return r;
2143} 2687}
2144 2688
2145static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, 2689int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
2146 struct kvm_regs *regs)
2147{ 2690{
2148 vcpu_load(vcpu); 2691 vcpu_load(vcpu);
2149 2692
2150 kvm_x86_ops->cache_regs(vcpu); 2693 kvm_x86_ops->cache_regs(vcpu);
2151 2694
2152 regs->rax = vcpu->regs[VCPU_REGS_RAX]; 2695 regs->rax = vcpu->arch.regs[VCPU_REGS_RAX];
2153 regs->rbx = vcpu->regs[VCPU_REGS_RBX]; 2696 regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX];
2154 regs->rcx = vcpu->regs[VCPU_REGS_RCX]; 2697 regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX];
2155 regs->rdx = vcpu->regs[VCPU_REGS_RDX]; 2698 regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX];
2156 regs->rsi = vcpu->regs[VCPU_REGS_RSI]; 2699 regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI];
2157 regs->rdi = vcpu->regs[VCPU_REGS_RDI]; 2700 regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI];
2158 regs->rsp = vcpu->regs[VCPU_REGS_RSP]; 2701 regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP];
2159 regs->rbp = vcpu->regs[VCPU_REGS_RBP]; 2702 regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP];
2160#ifdef CONFIG_X86_64 2703#ifdef CONFIG_X86_64
2161 regs->r8 = vcpu->regs[VCPU_REGS_R8]; 2704 regs->r8 = vcpu->arch.regs[VCPU_REGS_R8];
2162 regs->r9 = vcpu->regs[VCPU_REGS_R9]; 2705 regs->r9 = vcpu->arch.regs[VCPU_REGS_R9];
2163 regs->r10 = vcpu->regs[VCPU_REGS_R10]; 2706 regs->r10 = vcpu->arch.regs[VCPU_REGS_R10];
2164 regs->r11 = vcpu->regs[VCPU_REGS_R11]; 2707 regs->r11 = vcpu->arch.regs[VCPU_REGS_R11];
2165 regs->r12 = vcpu->regs[VCPU_REGS_R12]; 2708 regs->r12 = vcpu->arch.regs[VCPU_REGS_R12];
2166 regs->r13 = vcpu->regs[VCPU_REGS_R13]; 2709 regs->r13 = vcpu->arch.regs[VCPU_REGS_R13];
2167 regs->r14 = vcpu->regs[VCPU_REGS_R14]; 2710 regs->r14 = vcpu->arch.regs[VCPU_REGS_R14];
2168 regs->r15 = vcpu->regs[VCPU_REGS_R15]; 2711 regs->r15 = vcpu->arch.regs[VCPU_REGS_R15];
2169#endif 2712#endif
2170 2713
2171 regs->rip = vcpu->rip; 2714 regs->rip = vcpu->arch.rip;
2172 regs->rflags = kvm_x86_ops->get_rflags(vcpu); 2715 regs->rflags = kvm_x86_ops->get_rflags(vcpu);
2173 2716
2174 /* 2717 /*
@@ -2182,31 +2725,30 @@ static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu,
2182 return 0; 2725 return 0;
2183} 2726}
2184 2727
2185static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, 2728int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
2186 struct kvm_regs *regs)
2187{ 2729{
2188 vcpu_load(vcpu); 2730 vcpu_load(vcpu);
2189 2731
2190 vcpu->regs[VCPU_REGS_RAX] = regs->rax; 2732 vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax;
2191 vcpu->regs[VCPU_REGS_RBX] = regs->rbx; 2733 vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx;
2192 vcpu->regs[VCPU_REGS_RCX] = regs->rcx; 2734 vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx;
2193 vcpu->regs[VCPU_REGS_RDX] = regs->rdx; 2735 vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx;
2194 vcpu->regs[VCPU_REGS_RSI] = regs->rsi; 2736 vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi;
2195 vcpu->regs[VCPU_REGS_RDI] = regs->rdi; 2737 vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi;
2196 vcpu->regs[VCPU_REGS_RSP] = regs->rsp; 2738 vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp;
2197 vcpu->regs[VCPU_REGS_RBP] = regs->rbp; 2739 vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp;
2198#ifdef CONFIG_X86_64 2740#ifdef CONFIG_X86_64
2199 vcpu->regs[VCPU_REGS_R8] = regs->r8; 2741 vcpu->arch.regs[VCPU_REGS_R8] = regs->r8;
2200 vcpu->regs[VCPU_REGS_R9] = regs->r9; 2742 vcpu->arch.regs[VCPU_REGS_R9] = regs->r9;
2201 vcpu->regs[VCPU_REGS_R10] = regs->r10; 2743 vcpu->arch.regs[VCPU_REGS_R10] = regs->r10;
2202 vcpu->regs[VCPU_REGS_R11] = regs->r11; 2744 vcpu->arch.regs[VCPU_REGS_R11] = regs->r11;
2203 vcpu->regs[VCPU_REGS_R12] = regs->r12; 2745 vcpu->arch.regs[VCPU_REGS_R12] = regs->r12;
2204 vcpu->regs[VCPU_REGS_R13] = regs->r13; 2746 vcpu->arch.regs[VCPU_REGS_R13] = regs->r13;
2205 vcpu->regs[VCPU_REGS_R14] = regs->r14; 2747 vcpu->arch.regs[VCPU_REGS_R14] = regs->r14;
2206 vcpu->regs[VCPU_REGS_R15] = regs->r15; 2748 vcpu->arch.regs[VCPU_REGS_R15] = regs->r15;
2207#endif 2749#endif
2208 2750
2209 vcpu->rip = regs->rip; 2751 vcpu->arch.rip = regs->rip;
2210 kvm_x86_ops->set_rflags(vcpu, regs->rflags); 2752 kvm_x86_ops->set_rflags(vcpu, regs->rflags);
2211 2753
2212 kvm_x86_ops->decache_regs(vcpu); 2754 kvm_x86_ops->decache_regs(vcpu);
@@ -2222,8 +2764,18 @@ static void get_segment(struct kvm_vcpu *vcpu,
2222 return kvm_x86_ops->get_segment(vcpu, var, seg); 2764 return kvm_x86_ops->get_segment(vcpu, var, seg);
2223} 2765}
2224 2766
2225static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 2767void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
2226 struct kvm_sregs *sregs) 2768{
2769 struct kvm_segment cs;
2770
2771 get_segment(vcpu, &cs, VCPU_SREG_CS);
2772 *db = cs.db;
2773 *l = cs.l;
2774}
2775EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
2776
2777int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2778 struct kvm_sregs *sregs)
2227{ 2779{
2228 struct descriptor_table dt; 2780 struct descriptor_table dt;
2229 int pending_vec; 2781 int pending_vec;
@@ -2248,12 +2800,12 @@ static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2248 sregs->gdt.base = dt.base; 2800 sregs->gdt.base = dt.base;
2249 2801
2250 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 2802 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2251 sregs->cr0 = vcpu->cr0; 2803 sregs->cr0 = vcpu->arch.cr0;
2252 sregs->cr2 = vcpu->cr2; 2804 sregs->cr2 = vcpu->arch.cr2;
2253 sregs->cr3 = vcpu->cr3; 2805 sregs->cr3 = vcpu->arch.cr3;
2254 sregs->cr4 = vcpu->cr4; 2806 sregs->cr4 = vcpu->arch.cr4;
2255 sregs->cr8 = get_cr8(vcpu); 2807 sregs->cr8 = get_cr8(vcpu);
2256 sregs->efer = vcpu->shadow_efer; 2808 sregs->efer = vcpu->arch.shadow_efer;
2257 sregs->apic_base = kvm_get_apic_base(vcpu); 2809 sregs->apic_base = kvm_get_apic_base(vcpu);
2258 2810
2259 if (irqchip_in_kernel(vcpu->kvm)) { 2811 if (irqchip_in_kernel(vcpu->kvm)) {
@@ -2261,9 +2813,10 @@ static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2261 sizeof sregs->interrupt_bitmap); 2813 sizeof sregs->interrupt_bitmap);
2262 pending_vec = kvm_x86_ops->get_irq(vcpu); 2814 pending_vec = kvm_x86_ops->get_irq(vcpu);
2263 if (pending_vec >= 0) 2815 if (pending_vec >= 0)
2264 set_bit(pending_vec, (unsigned long *)sregs->interrupt_bitmap); 2816 set_bit(pending_vec,
2817 (unsigned long *)sregs->interrupt_bitmap);
2265 } else 2818 } else
2266 memcpy(sregs->interrupt_bitmap, vcpu->irq_pending, 2819 memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
2267 sizeof sregs->interrupt_bitmap); 2820 sizeof sregs->interrupt_bitmap);
2268 2821
2269 vcpu_put(vcpu); 2822 vcpu_put(vcpu);
@@ -2277,8 +2830,8 @@ static void set_segment(struct kvm_vcpu *vcpu,
2277 return kvm_x86_ops->set_segment(vcpu, var, seg); 2830 return kvm_x86_ops->set_segment(vcpu, var, seg);
2278} 2831}
2279 2832
2280static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, 2833int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2281 struct kvm_sregs *sregs) 2834 struct kvm_sregs *sregs)
2282{ 2835{
2283 int mmu_reset_needed = 0; 2836 int mmu_reset_needed = 0;
2284 int i, pending_vec, max_bits; 2837 int i, pending_vec, max_bits;
@@ -2293,13 +2846,13 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2293 dt.base = sregs->gdt.base; 2846 dt.base = sregs->gdt.base;
2294 kvm_x86_ops->set_gdt(vcpu, &dt); 2847 kvm_x86_ops->set_gdt(vcpu, &dt);
2295 2848
2296 vcpu->cr2 = sregs->cr2; 2849 vcpu->arch.cr2 = sregs->cr2;
2297 mmu_reset_needed |= vcpu->cr3 != sregs->cr3; 2850 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
2298 vcpu->cr3 = sregs->cr3; 2851 vcpu->arch.cr3 = sregs->cr3;
2299 2852
2300 set_cr8(vcpu, sregs->cr8); 2853 set_cr8(vcpu, sregs->cr8);
2301 2854
2302 mmu_reset_needed |= vcpu->shadow_efer != sregs->efer; 2855 mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
2303#ifdef CONFIG_X86_64 2856#ifdef CONFIG_X86_64
2304 kvm_x86_ops->set_efer(vcpu, sregs->efer); 2857 kvm_x86_ops->set_efer(vcpu, sregs->efer);
2305#endif 2858#endif
@@ -2307,25 +2860,25 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2307 2860
2308 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 2861 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2309 2862
2310 mmu_reset_needed |= vcpu->cr0 != sregs->cr0; 2863 mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
2311 vcpu->cr0 = sregs->cr0; 2864 vcpu->arch.cr0 = sregs->cr0;
2312 kvm_x86_ops->set_cr0(vcpu, sregs->cr0); 2865 kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
2313 2866
2314 mmu_reset_needed |= vcpu->cr4 != sregs->cr4; 2867 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
2315 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 2868 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
2316 if (!is_long_mode(vcpu) && is_pae(vcpu)) 2869 if (!is_long_mode(vcpu) && is_pae(vcpu))
2317 load_pdptrs(vcpu, vcpu->cr3); 2870 load_pdptrs(vcpu, vcpu->arch.cr3);
2318 2871
2319 if (mmu_reset_needed) 2872 if (mmu_reset_needed)
2320 kvm_mmu_reset_context(vcpu); 2873 kvm_mmu_reset_context(vcpu);
2321 2874
2322 if (!irqchip_in_kernel(vcpu->kvm)) { 2875 if (!irqchip_in_kernel(vcpu->kvm)) {
2323 memcpy(vcpu->irq_pending, sregs->interrupt_bitmap, 2876 memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap,
2324 sizeof vcpu->irq_pending); 2877 sizeof vcpu->arch.irq_pending);
2325 vcpu->irq_summary = 0; 2878 vcpu->arch.irq_summary = 0;
2326 for (i = 0; i < ARRAY_SIZE(vcpu->irq_pending); ++i) 2879 for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i)
2327 if (vcpu->irq_pending[i]) 2880 if (vcpu->arch.irq_pending[i])
2328 __set_bit(i, &vcpu->irq_summary); 2881 __set_bit(i, &vcpu->arch.irq_summary);
2329 } else { 2882 } else {
2330 max_bits = (sizeof sregs->interrupt_bitmap) << 3; 2883 max_bits = (sizeof sregs->interrupt_bitmap) << 3;
2331 pending_vec = find_first_bit( 2884 pending_vec = find_first_bit(
@@ -2334,7 +2887,8 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2334 /* Only pending external irq is handled here */ 2887 /* Only pending external irq is handled here */
2335 if (pending_vec < max_bits) { 2888 if (pending_vec < max_bits) {
2336 kvm_x86_ops->set_irq(vcpu, pending_vec); 2889 kvm_x86_ops->set_irq(vcpu, pending_vec);
2337 printk("Set back pending irq %d\n", pending_vec); 2890 pr_debug("Set back pending irq %d\n",
2891 pending_vec);
2338 } 2892 }
2339 } 2893 }
2340 2894
@@ -2353,174 +2907,8 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2353 return 0; 2907 return 0;
2354} 2908}
2355 2909
2356void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 2910int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
2357{ 2911 struct kvm_debug_guest *dbg)
2358 struct kvm_segment cs;
2359
2360 get_segment(vcpu, &cs, VCPU_SREG_CS);
2361 *db = cs.db;
2362 *l = cs.l;
2363}
2364EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
2365
2366/*
2367 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
2368 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
2369 *
2370 * This list is modified at module load time to reflect the
2371 * capabilities of the host cpu.
2372 */
2373static u32 msrs_to_save[] = {
2374 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
2375 MSR_K6_STAR,
2376#ifdef CONFIG_X86_64
2377 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
2378#endif
2379 MSR_IA32_TIME_STAMP_COUNTER,
2380};
2381
2382static unsigned num_msrs_to_save;
2383
2384static u32 emulated_msrs[] = {
2385 MSR_IA32_MISC_ENABLE,
2386};
2387
2388static __init void kvm_init_msr_list(void)
2389{
2390 u32 dummy[2];
2391 unsigned i, j;
2392
2393 for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
2394 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
2395 continue;
2396 if (j < i)
2397 msrs_to_save[j] = msrs_to_save[i];
2398 j++;
2399 }
2400 num_msrs_to_save = j;
2401}
2402
2403/*
2404 * Adapt set_msr() to msr_io()'s calling convention
2405 */
2406static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
2407{
2408 return kvm_set_msr(vcpu, index, *data);
2409}
2410
2411/*
2412 * Read or write a bunch of msrs. All parameters are kernel addresses.
2413 *
2414 * @return number of msrs set successfully.
2415 */
2416static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
2417 struct kvm_msr_entry *entries,
2418 int (*do_msr)(struct kvm_vcpu *vcpu,
2419 unsigned index, u64 *data))
2420{
2421 int i;
2422
2423 vcpu_load(vcpu);
2424
2425 for (i = 0; i < msrs->nmsrs; ++i)
2426 if (do_msr(vcpu, entries[i].index, &entries[i].data))
2427 break;
2428
2429 vcpu_put(vcpu);
2430
2431 return i;
2432}
2433
2434/*
2435 * Read or write a bunch of msrs. Parameters are user addresses.
2436 *
2437 * @return number of msrs set successfully.
2438 */
2439static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
2440 int (*do_msr)(struct kvm_vcpu *vcpu,
2441 unsigned index, u64 *data),
2442 int writeback)
2443{
2444 struct kvm_msrs msrs;
2445 struct kvm_msr_entry *entries;
2446 int r, n;
2447 unsigned size;
2448
2449 r = -EFAULT;
2450 if (copy_from_user(&msrs, user_msrs, sizeof msrs))
2451 goto out;
2452
2453 r = -E2BIG;
2454 if (msrs.nmsrs >= MAX_IO_MSRS)
2455 goto out;
2456
2457 r = -ENOMEM;
2458 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
2459 entries = vmalloc(size);
2460 if (!entries)
2461 goto out;
2462
2463 r = -EFAULT;
2464 if (copy_from_user(entries, user_msrs->entries, size))
2465 goto out_free;
2466
2467 r = n = __msr_io(vcpu, &msrs, entries, do_msr);
2468 if (r < 0)
2469 goto out_free;
2470
2471 r = -EFAULT;
2472 if (writeback && copy_to_user(user_msrs->entries, entries, size))
2473 goto out_free;
2474
2475 r = n;
2476
2477out_free:
2478 vfree(entries);
2479out:
2480 return r;
2481}
2482
2483/*
2484 * Translate a guest virtual address to a guest physical address.
2485 */
2486static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
2487 struct kvm_translation *tr)
2488{
2489 unsigned long vaddr = tr->linear_address;
2490 gpa_t gpa;
2491
2492 vcpu_load(vcpu);
2493 mutex_lock(&vcpu->kvm->lock);
2494 gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
2495 tr->physical_address = gpa;
2496 tr->valid = gpa != UNMAPPED_GVA;
2497 tr->writeable = 1;
2498 tr->usermode = 0;
2499 mutex_unlock(&vcpu->kvm->lock);
2500 vcpu_put(vcpu);
2501
2502 return 0;
2503}
2504
2505static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2506 struct kvm_interrupt *irq)
2507{
2508 if (irq->irq < 0 || irq->irq >= 256)
2509 return -EINVAL;
2510 if (irqchip_in_kernel(vcpu->kvm))
2511 return -ENXIO;
2512 vcpu_load(vcpu);
2513
2514 set_bit(irq->irq, vcpu->irq_pending);
2515 set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
2516
2517 vcpu_put(vcpu);
2518
2519 return 0;
2520}
2521
2522static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
2523 struct kvm_debug_guest *dbg)
2524{ 2912{
2525 int r; 2913 int r;
2526 2914
@@ -2533,179 +2921,6 @@ static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
2533 return r; 2921 return r;
2534} 2922}
2535 2923
2536static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma,
2537 unsigned long address,
2538 int *type)
2539{
2540 struct kvm_vcpu *vcpu = vma->vm_file->private_data;
2541 unsigned long pgoff;
2542 struct page *page;
2543
2544 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2545 if (pgoff == 0)
2546 page = virt_to_page(vcpu->run);
2547 else if (pgoff == KVM_PIO_PAGE_OFFSET)
2548 page = virt_to_page(vcpu->pio_data);
2549 else
2550 return NOPAGE_SIGBUS;
2551 get_page(page);
2552 if (type != NULL)
2553 *type = VM_FAULT_MINOR;
2554
2555 return page;
2556}
2557
2558static struct vm_operations_struct kvm_vcpu_vm_ops = {
2559 .nopage = kvm_vcpu_nopage,
2560};
2561
2562static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
2563{
2564 vma->vm_ops = &kvm_vcpu_vm_ops;
2565 return 0;
2566}
2567
2568static int kvm_vcpu_release(struct inode *inode, struct file *filp)
2569{
2570 struct kvm_vcpu *vcpu = filp->private_data;
2571
2572 fput(vcpu->kvm->filp);
2573 return 0;
2574}
2575
2576static struct file_operations kvm_vcpu_fops = {
2577 .release = kvm_vcpu_release,
2578 .unlocked_ioctl = kvm_vcpu_ioctl,
2579 .compat_ioctl = kvm_vcpu_ioctl,
2580 .mmap = kvm_vcpu_mmap,
2581};
2582
2583/*
2584 * Allocates an inode for the vcpu.
2585 */
2586static int create_vcpu_fd(struct kvm_vcpu *vcpu)
2587{
2588 int fd, r;
2589 struct inode *inode;
2590 struct file *file;
2591
2592 r = anon_inode_getfd(&fd, &inode, &file,
2593 "kvm-vcpu", &kvm_vcpu_fops, vcpu);
2594 if (r)
2595 return r;
2596 atomic_inc(&vcpu->kvm->filp->f_count);
2597 return fd;
2598}
2599
2600/*
2601 * Creates some virtual cpus. Good luck creating more than one.
2602 */
2603static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
2604{
2605 int r;
2606 struct kvm_vcpu *vcpu;
2607
2608 if (!valid_vcpu(n))
2609 return -EINVAL;
2610
2611 vcpu = kvm_x86_ops->vcpu_create(kvm, n);
2612 if (IS_ERR(vcpu))
2613 return PTR_ERR(vcpu);
2614
2615 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
2616
2617 /* We do fxsave: this must be aligned. */
2618 BUG_ON((unsigned long)&vcpu->host_fx_image & 0xF);
2619
2620 vcpu_load(vcpu);
2621 r = kvm_mmu_setup(vcpu);
2622 vcpu_put(vcpu);
2623 if (r < 0)
2624 goto free_vcpu;
2625
2626 mutex_lock(&kvm->lock);
2627 if (kvm->vcpus[n]) {
2628 r = -EEXIST;
2629 mutex_unlock(&kvm->lock);
2630 goto mmu_unload;
2631 }
2632 kvm->vcpus[n] = vcpu;
2633 mutex_unlock(&kvm->lock);
2634
2635 /* Now it's all set up, let userspace reach it */
2636 r = create_vcpu_fd(vcpu);
2637 if (r < 0)
2638 goto unlink;
2639 return r;
2640
2641unlink:
2642 mutex_lock(&kvm->lock);
2643 kvm->vcpus[n] = NULL;
2644 mutex_unlock(&kvm->lock);
2645
2646mmu_unload:
2647 vcpu_load(vcpu);
2648 kvm_mmu_unload(vcpu);
2649 vcpu_put(vcpu);
2650
2651free_vcpu:
2652 kvm_x86_ops->vcpu_free(vcpu);
2653 return r;
2654}
2655
2656static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
2657{
2658 u64 efer;
2659 int i;
2660 struct kvm_cpuid_entry *e, *entry;
2661
2662 rdmsrl(MSR_EFER, efer);
2663 entry = NULL;
2664 for (i = 0; i < vcpu->cpuid_nent; ++i) {
2665 e = &vcpu->cpuid_entries[i];
2666 if (e->function == 0x80000001) {
2667 entry = e;
2668 break;
2669 }
2670 }
2671 if (entry && (entry->edx & (1 << 20)) && !(efer & EFER_NX)) {
2672 entry->edx &= ~(1 << 20);
2673 printk(KERN_INFO "kvm: guest NX capability removed\n");
2674 }
2675}
2676
2677static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
2678 struct kvm_cpuid *cpuid,
2679 struct kvm_cpuid_entry __user *entries)
2680{
2681 int r;
2682
2683 r = -E2BIG;
2684 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
2685 goto out;
2686 r = -EFAULT;
2687 if (copy_from_user(&vcpu->cpuid_entries, entries,
2688 cpuid->nent * sizeof(struct kvm_cpuid_entry)))
2689 goto out;
2690 vcpu->cpuid_nent = cpuid->nent;
2691 cpuid_fix_nx_cap(vcpu);
2692 return 0;
2693
2694out:
2695 return r;
2696}
2697
2698static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
2699{
2700 if (sigset) {
2701 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
2702 vcpu->sigset_active = 1;
2703 vcpu->sigset = *sigset;
2704 } else
2705 vcpu->sigset_active = 0;
2706 return 0;
2707}
2708
2709/* 2924/*
2710 * fxsave fpu state. Taken from x86_64/processor.h. To be killed when 2925 * fxsave fpu state. Taken from x86_64/processor.h. To be killed when
2711 * we have asm/x86/processor.h 2926 * we have asm/x86/processor.h
@@ -2727,9 +2942,31 @@ struct fxsave {
2727#endif 2942#endif
2728}; 2943};
2729 2944
2730static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 2945/*
2946 * Translate a guest virtual address to a guest physical address.
2947 */
2948int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
2949 struct kvm_translation *tr)
2731{ 2950{
2732 struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image; 2951 unsigned long vaddr = tr->linear_address;
2952 gpa_t gpa;
2953
2954 vcpu_load(vcpu);
2955 down_read(&current->mm->mmap_sem);
2956 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
2957 up_read(&current->mm->mmap_sem);
2958 tr->physical_address = gpa;
2959 tr->valid = gpa != UNMAPPED_GVA;
2960 tr->writeable = 1;
2961 tr->usermode = 0;
2962 vcpu_put(vcpu);
2963
2964 return 0;
2965}
2966
2967int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2968{
2969 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
2733 2970
2734 vcpu_load(vcpu); 2971 vcpu_load(vcpu);
2735 2972
@@ -2747,9 +2984,9 @@ static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2747 return 0; 2984 return 0;
2748} 2985}
2749 2986
2750static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 2987int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2751{ 2988{
2752 struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image; 2989 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
2753 2990
2754 vcpu_load(vcpu); 2991 vcpu_load(vcpu);
2755 2992
@@ -2767,862 +3004,284 @@ static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2767 return 0; 3004 return 0;
2768} 3005}
2769 3006
2770static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 3007void fx_init(struct kvm_vcpu *vcpu)
2771 struct kvm_lapic_state *s)
2772{ 3008{
2773 vcpu_load(vcpu); 3009 unsigned after_mxcsr_mask;
2774 memcpy(s->regs, vcpu->apic->regs, sizeof *s);
2775 vcpu_put(vcpu);
2776
2777 return 0;
2778}
2779 3010
2780static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, 3011 /* Initialize guest FPU by resetting ours and saving into guest's */
2781 struct kvm_lapic_state *s) 3012 preempt_disable();
2782{ 3013 fx_save(&vcpu->arch.host_fx_image);
2783 vcpu_load(vcpu); 3014 fpu_init();
2784 memcpy(vcpu->apic->regs, s->regs, sizeof *s); 3015 fx_save(&vcpu->arch.guest_fx_image);
2785 kvm_apic_post_state_restore(vcpu); 3016 fx_restore(&vcpu->arch.host_fx_image);
2786 vcpu_put(vcpu); 3017 preempt_enable();
2787 3018
2788 return 0; 3019 vcpu->arch.cr0 |= X86_CR0_ET;
3020 after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
3021 vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
3022 memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
3023 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
2789} 3024}
3025EXPORT_SYMBOL_GPL(fx_init);
2790 3026
2791static long kvm_vcpu_ioctl(struct file *filp, 3027void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
2792 unsigned int ioctl, unsigned long arg)
2793{ 3028{
2794 struct kvm_vcpu *vcpu = filp->private_data; 3029 if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
2795 void __user *argp = (void __user *)arg; 3030 return;
2796 int r = -EINVAL;
2797
2798 switch (ioctl) {
2799 case KVM_RUN:
2800 r = -EINVAL;
2801 if (arg)
2802 goto out;
2803 r = kvm_vcpu_ioctl_run(vcpu, vcpu->run);
2804 break;
2805 case KVM_GET_REGS: {
2806 struct kvm_regs kvm_regs;
2807
2808 memset(&kvm_regs, 0, sizeof kvm_regs);
2809 r = kvm_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
2810 if (r)
2811 goto out;
2812 r = -EFAULT;
2813 if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
2814 goto out;
2815 r = 0;
2816 break;
2817 }
2818 case KVM_SET_REGS: {
2819 struct kvm_regs kvm_regs;
2820
2821 r = -EFAULT;
2822 if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
2823 goto out;
2824 r = kvm_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
2825 if (r)
2826 goto out;
2827 r = 0;
2828 break;
2829 }
2830 case KVM_GET_SREGS: {
2831 struct kvm_sregs kvm_sregs;
2832
2833 memset(&kvm_sregs, 0, sizeof kvm_sregs);
2834 r = kvm_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
2835 if (r)
2836 goto out;
2837 r = -EFAULT;
2838 if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
2839 goto out;
2840 r = 0;
2841 break;
2842 }
2843 case KVM_SET_SREGS: {
2844 struct kvm_sregs kvm_sregs;
2845
2846 r = -EFAULT;
2847 if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
2848 goto out;
2849 r = kvm_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
2850 if (r)
2851 goto out;
2852 r = 0;
2853 break;
2854 }
2855 case KVM_TRANSLATE: {
2856 struct kvm_translation tr;
2857
2858 r = -EFAULT;
2859 if (copy_from_user(&tr, argp, sizeof tr))
2860 goto out;
2861 r = kvm_vcpu_ioctl_translate(vcpu, &tr);
2862 if (r)
2863 goto out;
2864 r = -EFAULT;
2865 if (copy_to_user(argp, &tr, sizeof tr))
2866 goto out;
2867 r = 0;
2868 break;
2869 }
2870 case KVM_INTERRUPT: {
2871 struct kvm_interrupt irq;
2872
2873 r = -EFAULT;
2874 if (copy_from_user(&irq, argp, sizeof irq))
2875 goto out;
2876 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
2877 if (r)
2878 goto out;
2879 r = 0;
2880 break;
2881 }
2882 case KVM_DEBUG_GUEST: {
2883 struct kvm_debug_guest dbg;
2884
2885 r = -EFAULT;
2886 if (copy_from_user(&dbg, argp, sizeof dbg))
2887 goto out;
2888 r = kvm_vcpu_ioctl_debug_guest(vcpu, &dbg);
2889 if (r)
2890 goto out;
2891 r = 0;
2892 break;
2893 }
2894 case KVM_GET_MSRS:
2895 r = msr_io(vcpu, argp, kvm_get_msr, 1);
2896 break;
2897 case KVM_SET_MSRS:
2898 r = msr_io(vcpu, argp, do_set_msr, 0);
2899 break;
2900 case KVM_SET_CPUID: {
2901 struct kvm_cpuid __user *cpuid_arg = argp;
2902 struct kvm_cpuid cpuid;
2903
2904 r = -EFAULT;
2905 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
2906 goto out;
2907 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
2908 if (r)
2909 goto out;
2910 break;
2911 }
2912 case KVM_SET_SIGNAL_MASK: {
2913 struct kvm_signal_mask __user *sigmask_arg = argp;
2914 struct kvm_signal_mask kvm_sigmask;
2915 sigset_t sigset, *p;
2916
2917 p = NULL;
2918 if (argp) {
2919 r = -EFAULT;
2920 if (copy_from_user(&kvm_sigmask, argp,
2921 sizeof kvm_sigmask))
2922 goto out;
2923 r = -EINVAL;
2924 if (kvm_sigmask.len != sizeof sigset)
2925 goto out;
2926 r = -EFAULT;
2927 if (copy_from_user(&sigset, sigmask_arg->sigset,
2928 sizeof sigset))
2929 goto out;
2930 p = &sigset;
2931 }
2932 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
2933 break;
2934 }
2935 case KVM_GET_FPU: {
2936 struct kvm_fpu fpu;
2937
2938 memset(&fpu, 0, sizeof fpu);
2939 r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu);
2940 if (r)
2941 goto out;
2942 r = -EFAULT;
2943 if (copy_to_user(argp, &fpu, sizeof fpu))
2944 goto out;
2945 r = 0;
2946 break;
2947 }
2948 case KVM_SET_FPU: {
2949 struct kvm_fpu fpu;
2950
2951 r = -EFAULT;
2952 if (copy_from_user(&fpu, argp, sizeof fpu))
2953 goto out;
2954 r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu);
2955 if (r)
2956 goto out;
2957 r = 0;
2958 break;
2959 }
2960 case KVM_GET_LAPIC: {
2961 struct kvm_lapic_state lapic;
2962
2963 memset(&lapic, 0, sizeof lapic);
2964 r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
2965 if (r)
2966 goto out;
2967 r = -EFAULT;
2968 if (copy_to_user(argp, &lapic, sizeof lapic))
2969 goto out;
2970 r = 0;
2971 break;
2972 }
2973 case KVM_SET_LAPIC: {
2974 struct kvm_lapic_state lapic;
2975 3031
2976 r = -EFAULT; 3032 vcpu->guest_fpu_loaded = 1;
2977 if (copy_from_user(&lapic, argp, sizeof lapic)) 3033 fx_save(&vcpu->arch.host_fx_image);
2978 goto out; 3034 fx_restore(&vcpu->arch.guest_fx_image);
2979 r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);;
2980 if (r)
2981 goto out;
2982 r = 0;
2983 break;
2984 }
2985 default:
2986 ;
2987 }
2988out:
2989 return r;
2990} 3035}
3036EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
2991 3037
2992static long kvm_vm_ioctl(struct file *filp, 3038void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
2993 unsigned int ioctl, unsigned long arg)
2994{ 3039{
2995 struct kvm *kvm = filp->private_data; 3040 if (!vcpu->guest_fpu_loaded)
2996 void __user *argp = (void __user *)arg; 3041 return;
2997 int r = -EINVAL;
2998
2999 switch (ioctl) {
3000 case KVM_CREATE_VCPU:
3001 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
3002 if (r < 0)
3003 goto out;
3004 break;
3005 case KVM_SET_MEMORY_REGION: {
3006 struct kvm_memory_region kvm_mem;
3007
3008 r = -EFAULT;
3009 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
3010 goto out;
3011 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_mem);
3012 if (r)
3013 goto out;
3014 break;
3015 }
3016 case KVM_GET_DIRTY_LOG: {
3017 struct kvm_dirty_log log;
3018
3019 r = -EFAULT;
3020 if (copy_from_user(&log, argp, sizeof log))
3021 goto out;
3022 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
3023 if (r)
3024 goto out;
3025 break;
3026 }
3027 case KVM_SET_MEMORY_ALIAS: {
3028 struct kvm_memory_alias alias;
3029
3030 r = -EFAULT;
3031 if (copy_from_user(&alias, argp, sizeof alias))
3032 goto out;
3033 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
3034 if (r)
3035 goto out;
3036 break;
3037 }
3038 case KVM_CREATE_IRQCHIP:
3039 r = -ENOMEM;
3040 kvm->vpic = kvm_create_pic(kvm);
3041 if (kvm->vpic) {
3042 r = kvm_ioapic_init(kvm);
3043 if (r) {
3044 kfree(kvm->vpic);
3045 kvm->vpic = NULL;
3046 goto out;
3047 }
3048 }
3049 else
3050 goto out;
3051 break;
3052 case KVM_IRQ_LINE: {
3053 struct kvm_irq_level irq_event;
3054
3055 r = -EFAULT;
3056 if (copy_from_user(&irq_event, argp, sizeof irq_event))
3057 goto out;
3058 if (irqchip_in_kernel(kvm)) {
3059 mutex_lock(&kvm->lock);
3060 if (irq_event.irq < 16)
3061 kvm_pic_set_irq(pic_irqchip(kvm),
3062 irq_event.irq,
3063 irq_event.level);
3064 kvm_ioapic_set_irq(kvm->vioapic,
3065 irq_event.irq,
3066 irq_event.level);
3067 mutex_unlock(&kvm->lock);
3068 r = 0;
3069 }
3070 break;
3071 }
3072 case KVM_GET_IRQCHIP: {
3073 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3074 struct kvm_irqchip chip;
3075
3076 r = -EFAULT;
3077 if (copy_from_user(&chip, argp, sizeof chip))
3078 goto out;
3079 r = -ENXIO;
3080 if (!irqchip_in_kernel(kvm))
3081 goto out;
3082 r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
3083 if (r)
3084 goto out;
3085 r = -EFAULT;
3086 if (copy_to_user(argp, &chip, sizeof chip))
3087 goto out;
3088 r = 0;
3089 break;
3090 }
3091 case KVM_SET_IRQCHIP: {
3092 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3093 struct kvm_irqchip chip;
3094 3042
3095 r = -EFAULT; 3043 vcpu->guest_fpu_loaded = 0;
3096 if (copy_from_user(&chip, argp, sizeof chip)) 3044 fx_save(&vcpu->arch.guest_fx_image);
3097 goto out; 3045 fx_restore(&vcpu->arch.host_fx_image);
3098 r = -ENXIO; 3046 ++vcpu->stat.fpu_reload;
3099 if (!irqchip_in_kernel(kvm))
3100 goto out;
3101 r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
3102 if (r)
3103 goto out;
3104 r = 0;
3105 break;
3106 }
3107 default:
3108 ;
3109 }
3110out:
3111 return r;
3112} 3047}
3048EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
3113 3049
3114static struct page *kvm_vm_nopage(struct vm_area_struct *vma, 3050void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
3115 unsigned long address,
3116 int *type)
3117{ 3051{
3118 struct kvm *kvm = vma->vm_file->private_data; 3052 kvm_x86_ops->vcpu_free(vcpu);
3119 unsigned long pgoff;
3120 struct page *page;
3121
3122 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
3123 page = gfn_to_page(kvm, pgoff);
3124 if (!page)
3125 return NOPAGE_SIGBUS;
3126 get_page(page);
3127 if (type != NULL)
3128 *type = VM_FAULT_MINOR;
3129
3130 return page;
3131} 3053}
3132 3054
3133static struct vm_operations_struct kvm_vm_vm_ops = { 3055struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
3134 .nopage = kvm_vm_nopage, 3056 unsigned int id)
3135};
3136
3137static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
3138{ 3057{
3139 vma->vm_ops = &kvm_vm_vm_ops; 3058 return kvm_x86_ops->vcpu_create(kvm, id);
3140 return 0;
3141} 3059}
3142 3060
3143static struct file_operations kvm_vm_fops = { 3061int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
3144 .release = kvm_vm_release,
3145 .unlocked_ioctl = kvm_vm_ioctl,
3146 .compat_ioctl = kvm_vm_ioctl,
3147 .mmap = kvm_vm_mmap,
3148};
3149
3150static int kvm_dev_ioctl_create_vm(void)
3151{ 3062{
3152 int fd, r; 3063 int r;
3153 struct inode *inode;
3154 struct file *file;
3155 struct kvm *kvm;
3156 3064
3157 kvm = kvm_create_vm(); 3065 /* We do fxsave: this must be aligned. */
3158 if (IS_ERR(kvm)) 3066 BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
3159 return PTR_ERR(kvm);
3160 r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
3161 if (r) {
3162 kvm_destroy_vm(kvm);
3163 return r;
3164 }
3165 3067
3166 kvm->filp = file; 3068 vcpu_load(vcpu);
3069 r = kvm_arch_vcpu_reset(vcpu);
3070 if (r == 0)
3071 r = kvm_mmu_setup(vcpu);
3072 vcpu_put(vcpu);
3073 if (r < 0)
3074 goto free_vcpu;
3167 3075
3168 return fd; 3076 return 0;
3077free_vcpu:
3078 kvm_x86_ops->vcpu_free(vcpu);
3079 return r;
3169} 3080}
3170 3081
3171static long kvm_dev_ioctl(struct file *filp, 3082void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
3172 unsigned int ioctl, unsigned long arg)
3173{ 3083{
3174 void __user *argp = (void __user *)arg; 3084 vcpu_load(vcpu);
3175 long r = -EINVAL; 3085 kvm_mmu_unload(vcpu);
3176 3086 vcpu_put(vcpu);
3177 switch (ioctl) {
3178 case KVM_GET_API_VERSION:
3179 r = -EINVAL;
3180 if (arg)
3181 goto out;
3182 r = KVM_API_VERSION;
3183 break;
3184 case KVM_CREATE_VM:
3185 r = -EINVAL;
3186 if (arg)
3187 goto out;
3188 r = kvm_dev_ioctl_create_vm();
3189 break;
3190 case KVM_GET_MSR_INDEX_LIST: {
3191 struct kvm_msr_list __user *user_msr_list = argp;
3192 struct kvm_msr_list msr_list;
3193 unsigned n;
3194
3195 r = -EFAULT;
3196 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
3197 goto out;
3198 n = msr_list.nmsrs;
3199 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
3200 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
3201 goto out;
3202 r = -E2BIG;
3203 if (n < num_msrs_to_save)
3204 goto out;
3205 r = -EFAULT;
3206 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
3207 num_msrs_to_save * sizeof(u32)))
3208 goto out;
3209 if (copy_to_user(user_msr_list->indices
3210 + num_msrs_to_save * sizeof(u32),
3211 &emulated_msrs,
3212 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
3213 goto out;
3214 r = 0;
3215 break;
3216 }
3217 case KVM_CHECK_EXTENSION: {
3218 int ext = (long)argp;
3219 3087
3220 switch (ext) { 3088 kvm_x86_ops->vcpu_free(vcpu);
3221 case KVM_CAP_IRQCHIP:
3222 case KVM_CAP_HLT:
3223 r = 1;
3224 break;
3225 default:
3226 r = 0;
3227 break;
3228 }
3229 break;
3230 }
3231 case KVM_GET_VCPU_MMAP_SIZE:
3232 r = -EINVAL;
3233 if (arg)
3234 goto out;
3235 r = 2 * PAGE_SIZE;
3236 break;
3237 default:
3238 ;
3239 }
3240out:
3241 return r;
3242} 3089}
3243 3090
3244static struct file_operations kvm_chardev_ops = { 3091int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
3245 .unlocked_ioctl = kvm_dev_ioctl,
3246 .compat_ioctl = kvm_dev_ioctl,
3247};
3248
3249static struct miscdevice kvm_dev = {
3250 KVM_MINOR,
3251 "kvm",
3252 &kvm_chardev_ops,
3253};
3254
3255/*
3256 * Make sure that a cpu that is being hot-unplugged does not have any vcpus
3257 * cached on it.
3258 */
3259static void decache_vcpus_on_cpu(int cpu)
3260{ 3092{
3261 struct kvm *vm; 3093 return kvm_x86_ops->vcpu_reset(vcpu);
3262 struct kvm_vcpu *vcpu;
3263 int i;
3264
3265 spin_lock(&kvm_lock);
3266 list_for_each_entry(vm, &vm_list, vm_list)
3267 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3268 vcpu = vm->vcpus[i];
3269 if (!vcpu)
3270 continue;
3271 /*
3272 * If the vcpu is locked, then it is running on some
3273 * other cpu and therefore it is not cached on the
3274 * cpu in question.
3275 *
3276 * If it's not locked, check the last cpu it executed
3277 * on.
3278 */
3279 if (mutex_trylock(&vcpu->mutex)) {
3280 if (vcpu->cpu == cpu) {
3281 kvm_x86_ops->vcpu_decache(vcpu);
3282 vcpu->cpu = -1;
3283 }
3284 mutex_unlock(&vcpu->mutex);
3285 }
3286 }
3287 spin_unlock(&kvm_lock);
3288} 3094}
3289 3095
3290static void hardware_enable(void *junk) 3096void kvm_arch_hardware_enable(void *garbage)
3291{ 3097{
3292 int cpu = raw_smp_processor_id(); 3098 kvm_x86_ops->hardware_enable(garbage);
3293
3294 if (cpu_isset(cpu, cpus_hardware_enabled))
3295 return;
3296 cpu_set(cpu, cpus_hardware_enabled);
3297 kvm_x86_ops->hardware_enable(NULL);
3298} 3099}
3299 3100
3300static void hardware_disable(void *junk) 3101void kvm_arch_hardware_disable(void *garbage)
3301{ 3102{
3302 int cpu = raw_smp_processor_id(); 3103 kvm_x86_ops->hardware_disable(garbage);
3303
3304 if (!cpu_isset(cpu, cpus_hardware_enabled))
3305 return;
3306 cpu_clear(cpu, cpus_hardware_enabled);
3307 decache_vcpus_on_cpu(cpu);
3308 kvm_x86_ops->hardware_disable(NULL);
3309} 3104}
3310 3105
3311static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, 3106int kvm_arch_hardware_setup(void)
3312 void *v)
3313{ 3107{
3314 int cpu = (long)v; 3108 return kvm_x86_ops->hardware_setup();
3315
3316 switch (val) {
3317 case CPU_DYING:
3318 case CPU_DYING_FROZEN:
3319 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
3320 cpu);
3321 hardware_disable(NULL);
3322 break;
3323 case CPU_UP_CANCELED:
3324 case CPU_UP_CANCELED_FROZEN:
3325 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
3326 cpu);
3327 smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
3328 break;
3329 case CPU_ONLINE:
3330 case CPU_ONLINE_FROZEN:
3331 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
3332 cpu);
3333 smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
3334 break;
3335 }
3336 return NOTIFY_OK;
3337} 3109}
3338 3110
3339static int kvm_reboot(struct notifier_block *notifier, unsigned long val, 3111void kvm_arch_hardware_unsetup(void)
3340 void *v)
3341{ 3112{
3342 if (val == SYS_RESTART) { 3113 kvm_x86_ops->hardware_unsetup();
3343 /*
3344 * Some (well, at least mine) BIOSes hang on reboot if
3345 * in vmx root mode.
3346 */
3347 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
3348 on_each_cpu(hardware_disable, NULL, 0, 1);
3349 }
3350 return NOTIFY_OK;
3351} 3114}
3352 3115
3353static struct notifier_block kvm_reboot_notifier = { 3116void kvm_arch_check_processor_compat(void *rtn)
3354 .notifier_call = kvm_reboot,
3355 .priority = 0,
3356};
3357
3358void kvm_io_bus_init(struct kvm_io_bus *bus)
3359{ 3117{
3360 memset(bus, 0, sizeof(*bus)); 3118 kvm_x86_ops->check_processor_compatibility(rtn);
3361} 3119}
3362 3120
3363void kvm_io_bus_destroy(struct kvm_io_bus *bus) 3121int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
3364{ 3122{
3365 int i; 3123 struct page *page;
3124 struct kvm *kvm;
3125 int r;
3366 3126
3367 for (i = 0; i < bus->dev_count; i++) { 3127 BUG_ON(vcpu->kvm == NULL);
3368 struct kvm_io_device *pos = bus->devs[i]; 3128 kvm = vcpu->kvm;
3369 3129
3370 kvm_iodevice_destructor(pos); 3130 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
3371 } 3131 if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
3372} 3132 vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
3133 else
3134 vcpu->arch.mp_state = VCPU_MP_STATE_UNINITIALIZED;
3373 3135
3374struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr) 3136 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
3375{ 3137 if (!page) {
3376 int i; 3138 r = -ENOMEM;
3139 goto fail;
3140 }
3141 vcpu->arch.pio_data = page_address(page);
3377 3142
3378 for (i = 0; i < bus->dev_count; i++) { 3143 r = kvm_mmu_create(vcpu);
3379 struct kvm_io_device *pos = bus->devs[i]; 3144 if (r < 0)
3145 goto fail_free_pio_data;
3380 3146
3381 if (pos->in_range(pos, addr)) 3147 if (irqchip_in_kernel(kvm)) {
3382 return pos; 3148 r = kvm_create_lapic(vcpu);
3149 if (r < 0)
3150 goto fail_mmu_destroy;
3383 } 3151 }
3384 3152
3385 return NULL; 3153 return 0;
3386}
3387
3388void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
3389{
3390 BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
3391 3154
3392 bus->devs[bus->dev_count++] = dev; 3155fail_mmu_destroy:
3156 kvm_mmu_destroy(vcpu);
3157fail_free_pio_data:
3158 free_page((unsigned long)vcpu->arch.pio_data);
3159fail:
3160 return r;
3393} 3161}
3394 3162
3395static struct notifier_block kvm_cpu_notifier = { 3163void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
3396 .notifier_call = kvm_cpu_hotplug,
3397 .priority = 20, /* must be > scheduler priority */
3398};
3399
3400static u64 stat_get(void *_offset)
3401{ 3164{
3402 unsigned offset = (long)_offset; 3165 kvm_free_lapic(vcpu);
3403 u64 total = 0; 3166 kvm_mmu_destroy(vcpu);
3404 struct kvm *kvm; 3167 free_page((unsigned long)vcpu->arch.pio_data);
3405 struct kvm_vcpu *vcpu;
3406 int i;
3407
3408 spin_lock(&kvm_lock);
3409 list_for_each_entry(kvm, &vm_list, vm_list)
3410 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3411 vcpu = kvm->vcpus[i];
3412 if (vcpu)
3413 total += *(u32 *)((void *)vcpu + offset);
3414 }
3415 spin_unlock(&kvm_lock);
3416 return total;
3417} 3168}
3418 3169
3419DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, NULL, "%llu\n"); 3170struct kvm *kvm_arch_create_vm(void)
3420
3421static __init void kvm_init_debug(void)
3422{ 3171{
3423 struct kvm_stats_debugfs_item *p; 3172 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
3424
3425 debugfs_dir = debugfs_create_dir("kvm", NULL);
3426 for (p = debugfs_entries; p->name; ++p)
3427 p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
3428 (void *)(long)p->offset,
3429 &stat_fops);
3430}
3431 3173
3432static void kvm_exit_debug(void) 3174 if (!kvm)
3433{ 3175 return ERR_PTR(-ENOMEM);
3434 struct kvm_stats_debugfs_item *p;
3435 3176
3436 for (p = debugfs_entries; p->name; ++p) 3177 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
3437 debugfs_remove(p->dentry);
3438 debugfs_remove(debugfs_dir);
3439}
3440 3178
3441static int kvm_suspend(struct sys_device *dev, pm_message_t state) 3179 return kvm;
3442{
3443 hardware_disable(NULL);
3444 return 0;
3445} 3180}
3446 3181
3447static int kvm_resume(struct sys_device *dev) 3182static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
3448{ 3183{
3449 hardware_enable(NULL); 3184 vcpu_load(vcpu);
3450 return 0; 3185 kvm_mmu_unload(vcpu);
3186 vcpu_put(vcpu);
3451} 3187}
3452 3188
3453static struct sysdev_class kvm_sysdev_class = { 3189static void kvm_free_vcpus(struct kvm *kvm)
3454 .name = "kvm",
3455 .suspend = kvm_suspend,
3456 .resume = kvm_resume,
3457};
3458
3459static struct sys_device kvm_sysdev = {
3460 .id = 0,
3461 .cls = &kvm_sysdev_class,
3462};
3463
3464hpa_t bad_page_address;
3465
3466static inline
3467struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
3468{ 3190{
3469 return container_of(pn, struct kvm_vcpu, preempt_notifier); 3191 unsigned int i;
3470}
3471 3192
3472static void kvm_sched_in(struct preempt_notifier *pn, int cpu) 3193 /*
3473{ 3194 * Unpin any mmu pages first.
3474 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 3195 */
3196 for (i = 0; i < KVM_MAX_VCPUS; ++i)
3197 if (kvm->vcpus[i])
3198 kvm_unload_vcpu_mmu(kvm->vcpus[i]);
3199 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3200 if (kvm->vcpus[i]) {
3201 kvm_arch_vcpu_free(kvm->vcpus[i]);
3202 kvm->vcpus[i] = NULL;
3203 }
3204 }
3475 3205
3476 kvm_x86_ops->vcpu_load(vcpu, cpu);
3477} 3206}
3478 3207
3479static void kvm_sched_out(struct preempt_notifier *pn, 3208void kvm_arch_destroy_vm(struct kvm *kvm)
3480 struct task_struct *next)
3481{ 3209{
3482 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 3210 kfree(kvm->arch.vpic);
3483 3211 kfree(kvm->arch.vioapic);
3484 kvm_x86_ops->vcpu_put(vcpu); 3212 kvm_free_vcpus(kvm);
3213 kvm_free_physmem(kvm);
3214 kfree(kvm);
3485} 3215}
3486 3216
3487int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size, 3217int kvm_arch_set_memory_region(struct kvm *kvm,
3488 struct module *module) 3218 struct kvm_userspace_memory_region *mem,
3219 struct kvm_memory_slot old,
3220 int user_alloc)
3489{ 3221{
3490 int r; 3222 int npages = mem->memory_size >> PAGE_SHIFT;
3491 int cpu; 3223 struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
3492
3493 if (kvm_x86_ops) {
3494 printk(KERN_ERR "kvm: already loaded the other module\n");
3495 return -EEXIST;
3496 }
3497 3224
3498 if (!ops->cpu_has_kvm_support()) { 3225 /*To keep backward compatibility with older userspace,
3499 printk(KERN_ERR "kvm: no hardware support\n"); 3226 *x86 needs to handle !user_alloc case.
3500 return -EOPNOTSUPP; 3227 */
3501 } 3228 if (!user_alloc) {
(removed lines, old file:)
3502	if (ops->disabled_by_bios()) {
3503		printk(KERN_ERR "kvm: disabled by bios\n");
3504		return -EOPNOTSUPP;
3505	}
3506
3507	kvm_x86_ops = ops;
3508
3509	r = kvm_x86_ops->hardware_setup();
3510	if (r < 0)
3511		goto out;
3512
3513	for_each_online_cpu(cpu) {
3514		smp_call_function_single(cpu,
3515				kvm_x86_ops->check_processor_compatibility,
3516				&r, 0, 1);
3517		if (r < 0)
3518			goto out_free_0;
3519	}
3520
3521	on_each_cpu(hardware_enable, NULL, 0, 1);
3522	r = register_cpu_notifier(&kvm_cpu_notifier);
3523	if (r)
3524		goto out_free_1;
3525	register_reboot_notifier(&kvm_reboot_notifier);
3526
3527	r = sysdev_class_register(&kvm_sysdev_class);
3528	if (r)
3529		goto out_free_2;
3530
3531	r = sysdev_register(&kvm_sysdev);
3532	if (r)
3533		goto out_free_3;
3534
3535	/* A kmem cache lets us meet the alignment requirements of fx_save. */
3536	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
3537					   __alignof__(struct kvm_vcpu), 0, 0);
3538	if (!kvm_vcpu_cache) {
3539		r = -ENOMEM;
3540		goto out_free_4;
3541	}
3542
3543	kvm_chardev_ops.owner = module;
3544
3545	r = misc_register(&kvm_dev);
3546	if (r) {
3547		printk (KERN_ERR "kvm: misc device register failed\n");
3548		goto out_free;
3549	}
3550
3551	kvm_preempt_ops.sched_in = kvm_sched_in;
3552	kvm_preempt_ops.sched_out = kvm_sched_out;
3553
3554	return r;
3555
3556out_free:
3557	kmem_cache_destroy(kvm_vcpu_cache);
3558out_free_4:
3559	sysdev_unregister(&kvm_sysdev);
3560out_free_3:
3561	sysdev_class_unregister(&kvm_sysdev_class);
3562out_free_2:
3563	unregister_reboot_notifier(&kvm_reboot_notifier);
3564	unregister_cpu_notifier(&kvm_cpu_notifier);
3565out_free_1:
3566	on_each_cpu(hardware_disable, NULL, 0, 1);
3567out_free_0:
3568	kvm_x86_ops->hardware_unsetup();
3569out:
3570	kvm_x86_ops = NULL;
3571	return r;
3572}
3573
3574void kvm_exit_x86(void)
3575{
3576	misc_deregister(&kvm_dev);
3577	kmem_cache_destroy(kvm_vcpu_cache);
3578	sysdev_unregister(&kvm_sysdev);
3579	sysdev_class_unregister(&kvm_sysdev_class);
3580	unregister_reboot_notifier(&kvm_reboot_notifier);
3581	unregister_cpu_notifier(&kvm_cpu_notifier);
3582	on_each_cpu(hardware_disable, NULL, 0, 1);
3583	kvm_x86_ops->hardware_unsetup();
3584	kvm_x86_ops = NULL;
3585}
3586
3587static __init int kvm_init(void)
3588{
3589	static struct page *bad_page;
3590	int r;
3591
3592	r = kvm_mmu_module_init();
3593	if (r)
3594		goto out4;
3595
3596	kvm_init_debug();
3597
3598	kvm_init_msr_list();
3599
3600	if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) {
3601		r = -ENOMEM;
3602		goto out;
3603	}
3604
3605	bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT;
3606	memset(__va(bad_page_address), 0, PAGE_SIZE);
3607
3608	return 0;
3609
3610out:
3611	kvm_exit_debug();
3612	kvm_mmu_module_exit();
3613out4:
3614	return r;
3615}
3616
3617static __exit void kvm_exit(void)
3618{
3619	kvm_exit_debug();
3620	__free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
3621	kvm_mmu_module_exit();
3622}
3623
3624module_init(kvm_init)
3625module_exit(kvm_exit)
3626
3627EXPORT_SYMBOL_GPL(kvm_init_x86);
3628EXPORT_SYMBOL_GPL(kvm_exit_x86);

(added lines, new file:)
3229	if (npages && !old.rmap) {
3230		memslot->userspace_addr = do_mmap(NULL, 0,
3231						 npages * PAGE_SIZE,
3232						 PROT_READ | PROT_WRITE,
3233						 MAP_SHARED | MAP_ANONYMOUS,
3234						 0);
3235
3236		if (IS_ERR((void *)memslot->userspace_addr))
3237			return PTR_ERR((void *)memslot->userspace_addr);
3238	} else {
3239		if (!old.user_alloc && old.rmap) {
3240			int ret;
3241
3242			ret = do_munmap(current->mm, old.userspace_addr,
3243					old.npages * PAGE_SIZE);
3244			if (ret < 0)
3245				printk(KERN_WARNING
3246				       "kvm_vm_ioctl_set_memory_region: "
3247				       "failed to munmap memory\n");
3248		}
3249	}
3250	}
3251
3252	if (!kvm->arch.n_requested_mmu_pages) {
3253		unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
3254		kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
3255	}
3256
3257	kvm_mmu_slot_remove_write_access(kvm, mem->slot);
3258	kvm_flush_remote_tlbs(kvm);
3259
3260	return 0;
3261}
3262
3263int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
3264{
3265	return vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE
3266	       || vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED;
3267}
3268
3269static void vcpu_kick_intr(void *info)
3270{
3271#ifdef DEBUG
3272	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
3273	printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
3274#endif
3275}
3276
3277void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
3278{
3279	int ipi_pcpu = vcpu->cpu;
3280
3281	if (waitqueue_active(&vcpu->wq)) {
3282		wake_up_interruptible(&vcpu->wq);
3283		++vcpu->stat.halt_wakeup;
3284	}
3285	if (vcpu->guest_mode)
3286		smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
3287}
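Aside, not part of the patch: the removed kvm_init_x86() above uses the kernel's usual label-per-step error unwinding, where each setup step that fails jumps to a label that undoes only the steps already completed, in reverse order. A minimal standalone C sketch of the same pattern, with all function names invented for illustration:

#include <stdio.h>

/* Hypothetical setup/teardown pairs standing in for the real
 * hardware_setup()/register_*_notifier() steps above. */
static int setup_a(void) { return 0; }
static void undo_a(void) { }
static int setup_b(void) { return 0; }
static void undo_b(void) { }
static int setup_c(void) { return -1; }	/* pretend this step fails */

static int init_everything(void)
{
	int r;

	r = setup_a();
	if (r)
		goto out;
	r = setup_b();
	if (r)
		goto out_undo_a;
	r = setup_c();
	if (r)
		goto out_undo_b;
	return 0;

out_undo_b:
	undo_b();		/* undo only what succeeded, newest first */
out_undo_a:
	undo_a();
out:
	return r;
}

int main(void)
{
	printf("init_everything() = %d\n", init_everything());
	return 0;
}

Having one label per completed step is what lets the out_free_0 .. out_free_4 chain above release exactly the resources acquired before the failing call, and nothing more.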
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
new file mode 100644
index 000000000000..79586003397a
--- /dev/null
+++ b/arch/x86/kvm/x86_emulate.c
@@ -0,0 +1,1912 @@
1/******************************************************************************
2 * x86_emulate.c
3 *
4 * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
5 *
6 * Copyright (c) 2005 Keir Fraser
7 *
8 * Linux coding style, mod r/m decoder, segment base fixes, real-mode
9 * privileged instructions:
10 *
11 * Copyright (C) 2006 Qumranet
12 *
13 * Avi Kivity <avi@qumranet.com>
14 * Yaniv Kamay <yaniv@qumranet.com>
15 *
16 * This work is licensed under the terms of the GNU GPL, version 2. See
17 * the COPYING file in the top-level directory.
18 *
19 * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
20 */
21
22#ifndef __KERNEL__
23#include <stdio.h>
24#include <stdint.h>
25#include <public/xen.h>
26#define DPRINTF(_f, _a ...) printf(_f , ## _a)
27#else
28#include <linux/kvm_host.h>
29#define DPRINTF(x...) do {} while (0)
30#endif
31#include <linux/module.h>
32#include <asm/kvm_x86_emulate.h>
33
34/*
35 * Opcode effective-address decode tables.
36 * Note that we only emulate instructions that have at least one memory
37 * operand (excluding implicit stack references). We assume that stack
38 * references and instruction fetches will never occur in special memory
39 * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
40 * not be handled.
41 */
42
43/* Operand sizes: 8-bit operands or specified/overridden size. */
44#define ByteOp (1<<0) /* 8-bit operands. */
45/* Destination operand type. */
46#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
47#define DstReg (2<<1) /* Register operand. */
48#define DstMem (3<<1) /* Memory operand. */
49#define DstMask (3<<1)
50/* Source operand type. */
51#define SrcNone (0<<3) /* No source operand. */
52#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */
53#define SrcReg (1<<3) /* Register operand. */
54#define SrcMem (2<<3) /* Memory operand. */
55#define SrcMem16 (3<<3) /* Memory operand (16-bit). */
56#define SrcMem32 (4<<3) /* Memory operand (32-bit). */
57#define SrcImm (5<<3) /* Immediate operand. */
58#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */
59#define SrcMask (7<<3)
60/* Generic ModRM decode. */
61#define ModRM (1<<6)
62/* Destination is only written; never read. */
63#define Mov (1<<7)
64#define BitOp (1<<8)
65#define MemAbs (1<<9) /* Memory operand is absolute displacement */
66#define String (1<<10) /* String instruction (rep capable) */
67#define Stack (1<<11) /* Stack instruction (push/pop) */
68
69static u16 opcode_table[256] = {
70 /* 0x00 - 0x07 */
71 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
72 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
73 0, 0, 0, 0,
74 /* 0x08 - 0x0F */
75 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
76 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
77 0, 0, 0, 0,
78 /* 0x10 - 0x17 */
79 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
80 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
81 0, 0, 0, 0,
82 /* 0x18 - 0x1F */
83 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
84 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
85 0, 0, 0, 0,
86 /* 0x20 - 0x27 */
87 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
88 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
89 SrcImmByte, SrcImm, 0, 0,
90 /* 0x28 - 0x2F */
91 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
92 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
93 0, 0, 0, 0,
94 /* 0x30 - 0x37 */
95 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
96 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
97 0, 0, 0, 0,
98 /* 0x38 - 0x3F */
99 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
100 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
101 0, 0, 0, 0,
102 /* 0x40 - 0x47 */
103 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
104 /* 0x48 - 0x4F */
105 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
106 /* 0x50 - 0x57 */
107 SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
108 SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
109 /* 0x58 - 0x5F */
110 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
111 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
112 /* 0x60 - 0x67 */
113 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
114 0, 0, 0, 0,
115 /* 0x68 - 0x6F */
116 0, 0, ImplicitOps | Mov | Stack, 0,
117 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
118 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
119 /* 0x70 - 0x77 */
120 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
121 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
122 /* 0x78 - 0x7F */
123 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
124 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
125 /* 0x80 - 0x87 */
126 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
127 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
128 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
129 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
130 /* 0x88 - 0x8F */
131 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
132 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
133 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov | Stack,
134 /* 0x90 - 0x9F */
135 0, 0, 0, 0, 0, 0, 0, 0,
136 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
137 /* 0xA0 - 0xA7 */
138 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
139 ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
140 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
141 ByteOp | ImplicitOps | String, ImplicitOps | String,
142 /* 0xA8 - 0xAF */
143 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
144 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
145 ByteOp | ImplicitOps | String, ImplicitOps | String,
146 /* 0xB0 - 0xBF */
147 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
148 /* 0xC0 - 0xC7 */
149 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
150 0, ImplicitOps | Stack, 0, 0,
151 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
152 /* 0xC8 - 0xCF */
153 0, 0, 0, 0, 0, 0, 0, 0,
154 /* 0xD0 - 0xD7 */
155 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
156 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
157 0, 0, 0, 0,
158 /* 0xD8 - 0xDF */
159 0, 0, 0, 0, 0, 0, 0, 0,
160 /* 0xE0 - 0xE7 */
161 0, 0, 0, 0, 0, 0, 0, 0,
162 /* 0xE8 - 0xEF */
163 ImplicitOps | Stack, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps,
164 0, 0, 0, 0,
165 /* 0xF0 - 0xF7 */
166 0, 0, 0, 0,
167 ImplicitOps, ImplicitOps,
168 ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
169 /* 0xF8 - 0xFF */
170 ImplicitOps, 0, ImplicitOps, ImplicitOps,
171 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
172};
173
174static u16 twobyte_table[256] = {
175 /* 0x00 - 0x0F */
176 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
177 ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
178 /* 0x10 - 0x1F */
179 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
180 /* 0x20 - 0x2F */
181 ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
182 0, 0, 0, 0, 0, 0, 0, 0,
183 /* 0x30 - 0x3F */
184 ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
185 /* 0x40 - 0x47 */
186 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
187 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
188 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
189 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
190 /* 0x48 - 0x4F */
191 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
192 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
193 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
194 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
195 /* 0x50 - 0x5F */
196 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
197 /* 0x60 - 0x6F */
198 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
199 /* 0x70 - 0x7F */
200 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201 /* 0x80 - 0x8F */
202 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
203 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
204 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
205 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
206 /* 0x90 - 0x9F */
207 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
208 /* 0xA0 - 0xA7 */
209 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
210 /* 0xA8 - 0xAF */
211 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
212 /* 0xB0 - 0xB7 */
213 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
214 DstMem | SrcReg | ModRM | BitOp,
215 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
216 DstReg | SrcMem16 | ModRM | Mov,
217 /* 0xB8 - 0xBF */
218 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp,
219 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
220 DstReg | SrcMem16 | ModRM | Mov,
221 /* 0xC0 - 0xCF */
222 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM,
223 0, 0, 0, 0, 0, 0, 0, 0,
224 /* 0xD0 - 0xDF */
225 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
226 /* 0xE0 - 0xEF */
227 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
228 /* 0xF0 - 0xFF */
229 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
230};
231
232/* EFLAGS bit definitions. */
233#define EFLG_OF (1<<11)
234#define EFLG_DF (1<<10)
235#define EFLG_SF (1<<7)
236#define EFLG_ZF (1<<6)
237#define EFLG_AF (1<<4)
238#define EFLG_PF (1<<2)
239#define EFLG_CF (1<<0)
240
241/*
242 * Instruction emulation:
243 * Most instructions are emulated directly via a fragment of inline assembly
244 * code. This allows us to save/restore EFLAGS and thus very easily pick up
245 * any modified flags.
246 */
247
248#if defined(CONFIG_X86_64)
249#define _LO32 "k" /* force 32-bit operand */
250#define _STK "%%rsp" /* stack pointer */
251#elif defined(__i386__)
252#define _LO32 "" /* force 32-bit operand */
253#define _STK "%%esp" /* stack pointer */
254#endif
255
256/*
257 * These EFLAGS bits are restored from saved value during emulation, and
258 * any changes are written back to the saved value after emulation.
259 */
260#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
261
262/* Before executing instruction: restore necessary bits in EFLAGS. */
263#define _PRE_EFLAGS(_sav, _msk, _tmp) \
264 /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \
265 "movl %"_sav",%"_LO32 _tmp"; " \
266 "push %"_tmp"; " \
267 "push %"_tmp"; " \
268 "movl %"_msk",%"_LO32 _tmp"; " \
269 "andl %"_LO32 _tmp",("_STK"); " \
270 "pushf; " \
271 "notl %"_LO32 _tmp"; " \
272 "andl %"_LO32 _tmp",("_STK"); " \
273 "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \
274 "pop %"_tmp"; " \
275 "orl %"_LO32 _tmp",("_STK"); " \
276 "popf; " \
277 "pop %"_sav"; "
278
279/* After executing instruction: write-back necessary bits in EFLAGS. */
280#define _POST_EFLAGS(_sav, _msk, _tmp) \
281 /* _sav |= EFLAGS & _msk; */ \
282 "pushf; " \
283 "pop %"_tmp"; " \
284 "andl %"_msk",%"_LO32 _tmp"; " \
285 "orl %"_LO32 _tmp",%"_sav"; "
286
287/* Raw emulation: instruction has two explicit operands. */
288#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
289 do { \
290 unsigned long _tmp; \
291 \
292 switch ((_dst).bytes) { \
293 case 2: \
294 __asm__ __volatile__ ( \
295 _PRE_EFLAGS("0", "4", "2") \
296 _op"w %"_wx"3,%1; " \
297 _POST_EFLAGS("0", "4", "2") \
298 : "=m" (_eflags), "=m" ((_dst).val), \
299 "=&r" (_tmp) \
300 : _wy ((_src).val), "i" (EFLAGS_MASK)); \
301 break; \
302 case 4: \
303 __asm__ __volatile__ ( \
304 _PRE_EFLAGS("0", "4", "2") \
305 _op"l %"_lx"3,%1; " \
306 _POST_EFLAGS("0", "4", "2") \
307 : "=m" (_eflags), "=m" ((_dst).val), \
308 "=&r" (_tmp) \
309 : _ly ((_src).val), "i" (EFLAGS_MASK)); \
310 break; \
311 case 8: \
312 __emulate_2op_8byte(_op, _src, _dst, \
313 _eflags, _qx, _qy); \
314 break; \
315 } \
316 } while (0)
317
318#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
319 do { \
320 unsigned long _tmp; \
321 switch ((_dst).bytes) { \
322 case 1: \
323 __asm__ __volatile__ ( \
324 _PRE_EFLAGS("0", "4", "2") \
325 _op"b %"_bx"3,%1; " \
326 _POST_EFLAGS("0", "4", "2") \
327 : "=m" (_eflags), "=m" ((_dst).val), \
328 "=&r" (_tmp) \
329 : _by ((_src).val), "i" (EFLAGS_MASK)); \
330 break; \
331 default: \
332 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
333 _wx, _wy, _lx, _ly, _qx, _qy); \
334 break; \
335 } \
336 } while (0)
337
338/* Source operand is byte-sized and may be restricted to just %cl. */
339#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \
340 __emulate_2op(_op, _src, _dst, _eflags, \
341 "b", "c", "b", "c", "b", "c", "b", "c")
342
343/* Source operand is byte, word, long or quad sized. */
344#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \
345 __emulate_2op(_op, _src, _dst, _eflags, \
346 "b", "q", "w", "r", _LO32, "r", "", "r")
347
348/* Source operand is word, long or quad sized. */
349#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \
350 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
351 "w", "r", _LO32, "r", "", "r")
352
353/* Instruction has only one explicit operand (no source operand). */
354#define emulate_1op(_op, _dst, _eflags) \
355 do { \
356 unsigned long _tmp; \
357 \
358 switch ((_dst).bytes) { \
359 case 1: \
360 __asm__ __volatile__ ( \
361 _PRE_EFLAGS("0", "3", "2") \
362 _op"b %1; " \
363 _POST_EFLAGS("0", "3", "2") \
364 : "=m" (_eflags), "=m" ((_dst).val), \
365 "=&r" (_tmp) \
366 : "i" (EFLAGS_MASK)); \
367 break; \
368 case 2: \
369 __asm__ __volatile__ ( \
370 _PRE_EFLAGS("0", "3", "2") \
371 _op"w %1; " \
372 _POST_EFLAGS("0", "3", "2") \
373 : "=m" (_eflags), "=m" ((_dst).val), \
374 "=&r" (_tmp) \
375 : "i" (EFLAGS_MASK)); \
376 break; \
377 case 4: \
378 __asm__ __volatile__ ( \
379 _PRE_EFLAGS("0", "3", "2") \
380 _op"l %1; " \
381 _POST_EFLAGS("0", "3", "2") \
382 : "=m" (_eflags), "=m" ((_dst).val), \
383 "=&r" (_tmp) \
384 : "i" (EFLAGS_MASK)); \
385 break; \
386 case 8: \
387 __emulate_1op_8byte(_op, _dst, _eflags); \
388 break; \
389 } \
390 } while (0)
391
392/* Emulate an instruction with quadword operands (x86/64 only). */
393#if defined(CONFIG_X86_64)
394#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \
395 do { \
396 __asm__ __volatile__ ( \
397 _PRE_EFLAGS("0", "4", "2") \
398 _op"q %"_qx"3,%1; " \
399 _POST_EFLAGS("0", "4", "2") \
400 : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
401 : _qy ((_src).val), "i" (EFLAGS_MASK)); \
402 } while (0)
403
404#define __emulate_1op_8byte(_op, _dst, _eflags) \
405 do { \
406 __asm__ __volatile__ ( \
407 _PRE_EFLAGS("0", "3", "2") \
408 _op"q %1; " \
409 _POST_EFLAGS("0", "3", "2") \
410 : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
411 : "i" (EFLAGS_MASK)); \
412 } while (0)
413
414#elif defined(__i386__)
415#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)
416#define __emulate_1op_8byte(_op, _dst, _eflags)
417#endif /* __i386__ */
418
419/* Fetch next part of the instruction being emulated. */
420#define insn_fetch(_type, _size, _eip) \
421({ unsigned long _x; \
422 rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \
423 if (rc != 0) \
424 goto done; \
425 (_eip) += (_size); \
426 (_type)_x; \
427})
428
429/* Access/update address held in a register, based on addressing mode. */
430#define address_mask(reg) \
431 ((c->ad_bytes == sizeof(unsigned long)) ? \
432 (reg) : ((reg) & ((1UL << (c->ad_bytes << 3)) - 1)))
433#define register_address(base, reg) \
434 ((base) + address_mask(reg))
435#define register_address_increment(reg, inc) \
436 do { \
437 /* signed type ensures sign extension to long */ \
438 int _inc = (inc); \
439 if (c->ad_bytes == sizeof(unsigned long)) \
440 (reg) += _inc; \
441 else \
442 (reg) = ((reg) & \
443 ~((1UL << (c->ad_bytes << 3)) - 1)) | \
444 (((reg) + _inc) & \
445 ((1UL << (c->ad_bytes << 3)) - 1)); \
446 } while (0)
447
448#define JMP_REL(rel) \
449 do { \
450 register_address_increment(c->eip, rel); \
451 } while (0)
452
453static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
454 struct x86_emulate_ops *ops,
455 unsigned long linear, u8 *dest)
456{
457 struct fetch_cache *fc = &ctxt->decode.fetch;
458 int rc;
459 int size;
460
461 if (linear < fc->start || linear >= fc->end) {
462 size = min(15UL, PAGE_SIZE - offset_in_page(linear));
463 rc = ops->read_std(linear, fc->data, size, ctxt->vcpu);
464 if (rc)
465 return rc;
466 fc->start = linear;
467 fc->end = linear + size;
468 }
469 *dest = fc->data[linear - fc->start];
470 return 0;
471}
472
473static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
474 struct x86_emulate_ops *ops,
475 unsigned long eip, void *dest, unsigned size)
476{
477 int rc = 0;
478
479 eip += ctxt->cs_base;
480 while (size--) {
481 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
482 if (rc)
483 return rc;
484 }
485 return 0;
486}
487
488/*
489 * Given the 'reg' portion of a ModRM byte, and a register block, return a
490 * pointer into the block that addresses the relevant register.
491 * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
492 */
493static void *decode_register(u8 modrm_reg, unsigned long *regs,
494 int highbyte_regs)
495{
496 void *p;
497
498 p = &regs[modrm_reg];
499 if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
500 p = (unsigned char *)&regs[modrm_reg & 3] + 1;
501 return p;
502}
503
504static int read_descriptor(struct x86_emulate_ctxt *ctxt,
505 struct x86_emulate_ops *ops,
506 void *ptr,
507 u16 *size, unsigned long *address, int op_bytes)
508{
509 int rc;
510
511 if (op_bytes == 2)
512 op_bytes = 3;
513 *address = 0;
514 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
515 ctxt->vcpu);
516 if (rc)
517 return rc;
518 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
519 ctxt->vcpu);
520 return rc;
521}
522
523static int test_cc(unsigned int condition, unsigned int flags)
524{
525 int rc = 0;
526
527 switch ((condition & 15) >> 1) {
528 case 0: /* o */
529 rc |= (flags & EFLG_OF);
530 break;
531 case 1: /* b/c/nae */
532 rc |= (flags & EFLG_CF);
533 break;
534 case 2: /* z/e */
535 rc |= (flags & EFLG_ZF);
536 break;
537 case 3: /* be/na */
538 rc |= (flags & (EFLG_CF|EFLG_ZF));
539 break;
540 case 4: /* s */
541 rc |= (flags & EFLG_SF);
542 break;
543 case 5: /* p/pe */
544 rc |= (flags & EFLG_PF);
545 break;
546 case 7: /* le/ng */
547 rc |= (flags & EFLG_ZF);
548 /* fall through */
549 case 6: /* l/nge */
550 rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
551 break;
552 }
553
554 /* Odd condition identifiers (lsb == 1) have inverted sense. */
555 return (!!rc ^ (condition & 1));
556}
557
558static void decode_register_operand(struct operand *op,
559 struct decode_cache *c,
560 int inhibit_bytereg)
561{
562 unsigned reg = c->modrm_reg;
563 int highbyte_regs = c->rex_prefix == 0;
564
565 if (!(c->d & ModRM))
566 reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
567 op->type = OP_REG;
568 if ((c->d & ByteOp) && !inhibit_bytereg) {
569 op->ptr = decode_register(reg, c->regs, highbyte_regs);
570 op->val = *(u8 *)op->ptr;
571 op->bytes = 1;
572 } else {
573 op->ptr = decode_register(reg, c->regs, 0);
574 op->bytes = c->op_bytes;
575 switch (op->bytes) {
576 case 2:
577 op->val = *(u16 *)op->ptr;
578 break;
579 case 4:
580 op->val = *(u32 *)op->ptr;
581 break;
582 case 8:
583 op->val = *(u64 *) op->ptr;
584 break;
585 }
586 }
587 op->orig_val = op->val;
588}
589
590static int decode_modrm(struct x86_emulate_ctxt *ctxt,
591 struct x86_emulate_ops *ops)
592{
593 struct decode_cache *c = &ctxt->decode;
594 u8 sib;
595 int index_reg = 0, base_reg = 0, scale, rip_relative = 0;
596 int rc = 0;
597
598 if (c->rex_prefix) {
599 c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */
600 index_reg = (c->rex_prefix & 2) << 2; /* REX.X */
601 c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REG.B */
602 }
603
604 c->modrm = insn_fetch(u8, 1, c->eip);
605 c->modrm_mod |= (c->modrm & 0xc0) >> 6;
606 c->modrm_reg |= (c->modrm & 0x38) >> 3;
607 c->modrm_rm |= (c->modrm & 0x07);
608 c->modrm_ea = 0;
609 c->use_modrm_ea = 1;
610
611 if (c->modrm_mod == 3) {
612 c->modrm_val = *(unsigned long *)
613 decode_register(c->modrm_rm, c->regs, c->d & ByteOp);
614 return rc;
615 }
616
617 if (c->ad_bytes == 2) {
618 unsigned bx = c->regs[VCPU_REGS_RBX];
619 unsigned bp = c->regs[VCPU_REGS_RBP];
620 unsigned si = c->regs[VCPU_REGS_RSI];
621 unsigned di = c->regs[VCPU_REGS_RDI];
622
623 /* 16-bit ModR/M decode. */
624 switch (c->modrm_mod) {
625 case 0:
626 if (c->modrm_rm == 6)
627 c->modrm_ea += insn_fetch(u16, 2, c->eip);
628 break;
629 case 1:
630 c->modrm_ea += insn_fetch(s8, 1, c->eip);
631 break;
632 case 2:
633 c->modrm_ea += insn_fetch(u16, 2, c->eip);
634 break;
635 }
636 switch (c->modrm_rm) {
637 case 0:
638 c->modrm_ea += bx + si;
639 break;
640 case 1:
641 c->modrm_ea += bx + di;
642 break;
643 case 2:
644 c->modrm_ea += bp + si;
645 break;
646 case 3:
647 c->modrm_ea += bp + di;
648 break;
649 case 4:
650 c->modrm_ea += si;
651 break;
652 case 5:
653 c->modrm_ea += di;
654 break;
655 case 6:
656 if (c->modrm_mod != 0)
657 c->modrm_ea += bp;
658 break;
659 case 7:
660 c->modrm_ea += bx;
661 break;
662 }
663 if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
664 (c->modrm_rm == 6 && c->modrm_mod != 0))
665 if (!c->override_base)
666 c->override_base = &ctxt->ss_base;
667 c->modrm_ea = (u16)c->modrm_ea;
668 } else {
669 /* 32/64-bit ModR/M decode. */
670 switch (c->modrm_rm) {
671 case 4:
672 case 12:
673 sib = insn_fetch(u8, 1, c->eip);
674 index_reg |= (sib >> 3) & 7;
675 base_reg |= sib & 7;
676 scale = sib >> 6;
677
678 switch (base_reg) {
679 case 5:
680 if (c->modrm_mod != 0)
681 c->modrm_ea += c->regs[base_reg];
682 else
683 c->modrm_ea +=
684 insn_fetch(s32, 4, c->eip);
685 break;
686 default:
687 c->modrm_ea += c->regs[base_reg];
688 }
689 switch (index_reg) {
690 case 4:
691 break;
692 default:
693 c->modrm_ea += c->regs[index_reg] << scale;
694 }
695 break;
696 case 5:
697 if (c->modrm_mod != 0)
698 c->modrm_ea += c->regs[c->modrm_rm];
699 else if (ctxt->mode == X86EMUL_MODE_PROT64)
700 rip_relative = 1;
701 break;
702 default:
703 c->modrm_ea += c->regs[c->modrm_rm];
704 break;
705 }
706 switch (c->modrm_mod) {
707 case 0:
708 if (c->modrm_rm == 5)
709 c->modrm_ea += insn_fetch(s32, 4, c->eip);
710 break;
711 case 1:
712 c->modrm_ea += insn_fetch(s8, 1, c->eip);
713 break;
714 case 2:
715 c->modrm_ea += insn_fetch(s32, 4, c->eip);
716 break;
717 }
718 }
719 if (rip_relative) {
720 c->modrm_ea += c->eip;
721 switch (c->d & SrcMask) {
722 case SrcImmByte:
723 c->modrm_ea += 1;
724 break;
725 case SrcImm:
726 if (c->d & ByteOp)
727 c->modrm_ea += 1;
728 else
729 if (c->op_bytes == 8)
730 c->modrm_ea += 4;
731 else
732 c->modrm_ea += c->op_bytes;
733 }
734 }
735done:
736 return rc;
737}
738
739static int decode_abs(struct x86_emulate_ctxt *ctxt,
740 struct x86_emulate_ops *ops)
741{
742 struct decode_cache *c = &ctxt->decode;
743 int rc = 0;
744
745 switch (c->ad_bytes) {
746 case 2:
747 c->modrm_ea = insn_fetch(u16, 2, c->eip);
748 break;
749 case 4:
750 c->modrm_ea = insn_fetch(u32, 4, c->eip);
751 break;
752 case 8:
753 c->modrm_ea = insn_fetch(u64, 8, c->eip);
754 break;
755 }
756done:
757 return rc;
758}
759
760int
761x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
762{
763 struct decode_cache *c = &ctxt->decode;
764 int rc = 0;
765 int mode = ctxt->mode;
766 int def_op_bytes, def_ad_bytes;
767
768 /* Shadow copy of register state. Committed on successful emulation. */
769
770 memset(c, 0, sizeof(struct decode_cache));
771 c->eip = ctxt->vcpu->arch.rip;
772 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
773
774 switch (mode) {
775 case X86EMUL_MODE_REAL:
776 case X86EMUL_MODE_PROT16:
777 def_op_bytes = def_ad_bytes = 2;
778 break;
779 case X86EMUL_MODE_PROT32:
780 def_op_bytes = def_ad_bytes = 4;
781 break;
782#ifdef CONFIG_X86_64
783 case X86EMUL_MODE_PROT64:
784 def_op_bytes = 4;
785 def_ad_bytes = 8;
786 break;
787#endif
788 default:
789 return -1;
790 }
791
792 c->op_bytes = def_op_bytes;
793 c->ad_bytes = def_ad_bytes;
794
795 /* Legacy prefixes. */
796 for (;;) {
797 switch (c->b = insn_fetch(u8, 1, c->eip)) {
798 case 0x66: /* operand-size override */
799 /* switch between 2/4 bytes */
800 c->op_bytes = def_op_bytes ^ 6;
801 break;
802 case 0x67: /* address-size override */
803 if (mode == X86EMUL_MODE_PROT64)
804 /* switch between 4/8 bytes */
805 c->ad_bytes = def_ad_bytes ^ 12;
806 else
807 /* switch between 2/4 bytes */
808 c->ad_bytes = def_ad_bytes ^ 6;
809 break;
810 case 0x2e: /* CS override */
811 c->override_base = &ctxt->cs_base;
812 break;
813 case 0x3e: /* DS override */
814 c->override_base = &ctxt->ds_base;
815 break;
816 case 0x26: /* ES override */
817 c->override_base = &ctxt->es_base;
818 break;
819 case 0x64: /* FS override */
820 c->override_base = &ctxt->fs_base;
821 break;
822 case 0x65: /* GS override */
823 c->override_base = &ctxt->gs_base;
824 break;
825 case 0x36: /* SS override */
826 c->override_base = &ctxt->ss_base;
827 break;
828 case 0x40 ... 0x4f: /* REX */
829 if (mode != X86EMUL_MODE_PROT64)
830 goto done_prefixes;
831 c->rex_prefix = c->b;
832 continue;
833 case 0xf0: /* LOCK */
834 c->lock_prefix = 1;
835 break;
836 case 0xf2: /* REPNE/REPNZ */
837 c->rep_prefix = REPNE_PREFIX;
838 break;
839 case 0xf3: /* REP/REPE/REPZ */
840 c->rep_prefix = REPE_PREFIX;
841 break;
842 default:
843 goto done_prefixes;
844 }
845
846 /* Any legacy prefix after a REX prefix nullifies its effect. */
847
848 c->rex_prefix = 0;
849 }
850
851done_prefixes:
852
853 /* REX prefix. */
854 if (c->rex_prefix)
855 if (c->rex_prefix & 8)
856 c->op_bytes = 8; /* REX.W */
857
858 /* Opcode byte(s). */
859 c->d = opcode_table[c->b];
860 if (c->d == 0) {
861 /* Two-byte opcode? */
862 if (c->b == 0x0f) {
863 c->twobyte = 1;
864 c->b = insn_fetch(u8, 1, c->eip);
865 c->d = twobyte_table[c->b];
866 }
867
868 /* Unrecognised? */
869 if (c->d == 0) {
870 DPRINTF("Cannot emulate %02x\n", c->b);
871 return -1;
872 }
873 }
874
875 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
876 c->op_bytes = 8;
877
878 /* ModRM and SIB bytes. */
879 if (c->d & ModRM)
880 rc = decode_modrm(ctxt, ops);
881 else if (c->d & MemAbs)
882 rc = decode_abs(ctxt, ops);
883 if (rc)
884 goto done;
885
886 if (!c->override_base)
887 c->override_base = &ctxt->ds_base;
888 if (mode == X86EMUL_MODE_PROT64 &&
889 c->override_base != &ctxt->fs_base &&
890 c->override_base != &ctxt->gs_base)
891 c->override_base = NULL;
892
893 if (c->override_base)
894 c->modrm_ea += *c->override_base;
895
896 if (c->ad_bytes != 8)
897 c->modrm_ea = (u32)c->modrm_ea;
898 /*
899 * Decode and fetch the source operand: register, memory
900 * or immediate.
901 */
902 switch (c->d & SrcMask) {
903 case SrcNone:
904 break;
905 case SrcReg:
906 decode_register_operand(&c->src, c, 0);
907 break;
908 case SrcMem16:
909 c->src.bytes = 2;
910 goto srcmem_common;
911 case SrcMem32:
912 c->src.bytes = 4;
913 goto srcmem_common;
914 case SrcMem:
915 c->src.bytes = (c->d & ByteOp) ? 1 :
916 c->op_bytes;
917 /* Don't fetch the address for invlpg: it could be unmapped. */
918 if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7)
919 break;
920 srcmem_common:
921 /*
922 * For instructions with a ModR/M byte, switch to register
923 * access if Mod = 3.
924 */
925 if ((c->d & ModRM) && c->modrm_mod == 3) {
926 c->src.type = OP_REG;
927 break;
928 }
929 c->src.type = OP_MEM;
930 break;
931 case SrcImm:
932 c->src.type = OP_IMM;
933 c->src.ptr = (unsigned long *)c->eip;
934 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
935 if (c->src.bytes == 8)
936 c->src.bytes = 4;
937 /* NB. Immediates are sign-extended as necessary. */
938 switch (c->src.bytes) {
939 case 1:
940 c->src.val = insn_fetch(s8, 1, c->eip);
941 break;
942 case 2:
943 c->src.val = insn_fetch(s16, 2, c->eip);
944 break;
945 case 4:
946 c->src.val = insn_fetch(s32, 4, c->eip);
947 break;
948 }
949 break;
950 case SrcImmByte:
951 c->src.type = OP_IMM;
952 c->src.ptr = (unsigned long *)c->eip;
953 c->src.bytes = 1;
954 c->src.val = insn_fetch(s8, 1, c->eip);
955 break;
956 }
957
958 /* Decode and fetch the destination operand: register or memory. */
959 switch (c->d & DstMask) {
960 case ImplicitOps:
961 /* Special instructions do their own operand decoding. */
962 return 0;
963 case DstReg:
964 decode_register_operand(&c->dst, c,
965 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
966 break;
967 case DstMem:
968 if ((c->d & ModRM) && c->modrm_mod == 3) {
969 c->dst.type = OP_REG;
970 break;
971 }
972 c->dst.type = OP_MEM;
973 break;
974 }
975
976done:
977 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
978}
979
980static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
981{
982 struct decode_cache *c = &ctxt->decode;
983
984 c->dst.type = OP_MEM;
985 c->dst.bytes = c->op_bytes;
986 c->dst.val = c->src.val;
987 register_address_increment(c->regs[VCPU_REGS_RSP], -c->op_bytes);
988 c->dst.ptr = (void *) register_address(ctxt->ss_base,
989 c->regs[VCPU_REGS_RSP]);
990}
991
992static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
993 struct x86_emulate_ops *ops)
994{
995 struct decode_cache *c = &ctxt->decode;
996 int rc;
997
998 rc = ops->read_std(register_address(ctxt->ss_base,
999 c->regs[VCPU_REGS_RSP]),
1000 &c->dst.val, c->dst.bytes, ctxt->vcpu);
1001 if (rc != 0)
1002 return rc;
1003
1004 register_address_increment(c->regs[VCPU_REGS_RSP], c->dst.bytes);
1005
1006 return 0;
1007}
1008
1009static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
1010{
1011 struct decode_cache *c = &ctxt->decode;
1012 switch (c->modrm_reg) {
1013 case 0: /* rol */
1014 emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags);
1015 break;
1016 case 1: /* ror */
1017 emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags);
1018 break;
1019 case 2: /* rcl */
1020 emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags);
1021 break;
1022 case 3: /* rcr */
1023 emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags);
1024 break;
1025 case 4: /* sal/shl */
1026 case 6: /* sal/shl */
1027 emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags);
1028 break;
1029 case 5: /* shr */
1030 emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags);
1031 break;
1032 case 7: /* sar */
1033 emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags);
1034 break;
1035 }
1036}
1037
1038static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
1039 struct x86_emulate_ops *ops)
1040{
1041 struct decode_cache *c = &ctxt->decode;
1042 int rc = 0;
1043
1044 switch (c->modrm_reg) {
1045 case 0 ... 1: /* test */
1046 /*
1047 * Special case in Grp3: test has an immediate
1048 * source operand.
1049 */
1050 c->src.type = OP_IMM;
1051 c->src.ptr = (unsigned long *)c->eip;
1052 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1053 if (c->src.bytes == 8)
1054 c->src.bytes = 4;
1055 switch (c->src.bytes) {
1056 case 1:
1057 c->src.val = insn_fetch(s8, 1, c->eip);
1058 break;
1059 case 2:
1060 c->src.val = insn_fetch(s16, 2, c->eip);
1061 break;
1062 case 4:
1063 c->src.val = insn_fetch(s32, 4, c->eip);
1064 break;
1065 }
1066 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
1067 break;
1068 case 2: /* not */
1069 c->dst.val = ~c->dst.val;
1070 break;
1071 case 3: /* neg */
1072 emulate_1op("neg", c->dst, ctxt->eflags);
1073 break;
1074 default:
1075 DPRINTF("Cannot emulate %02x\n", c->b);
1076 rc = X86EMUL_UNHANDLEABLE;
1077 break;
1078 }
1079done:
1080 return rc;
1081}
1082
1083static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
1084 struct x86_emulate_ops *ops)
1085{
1086 struct decode_cache *c = &ctxt->decode;
1087 int rc;
1088
1089 switch (c->modrm_reg) {
1090 case 0: /* inc */
1091 emulate_1op("inc", c->dst, ctxt->eflags);
1092 break;
1093 case 1: /* dec */
1094 emulate_1op("dec", c->dst, ctxt->eflags);
1095 break;
1096 case 4: /* jmp abs */
1097 if (c->b == 0xff)
1098 c->eip = c->dst.val;
1099 else {
1100 DPRINTF("Cannot emulate %02x\n", c->b);
1101 return X86EMUL_UNHANDLEABLE;
1102 }
1103 break;
1104 case 6: /* push */
1105
1106 /* 64-bit mode: PUSH always pushes a 64-bit operand. */
1107
1108 if (ctxt->mode == X86EMUL_MODE_PROT64) {
1109 c->dst.bytes = 8;
1110 rc = ops->read_std((unsigned long)c->dst.ptr,
1111 &c->dst.val, 8, ctxt->vcpu);
1112 if (rc != 0)
1113 return rc;
1114 }
1115 register_address_increment(c->regs[VCPU_REGS_RSP],
1116 -c->dst.bytes);
1117 rc = ops->write_emulated(register_address(ctxt->ss_base,
1118 c->regs[VCPU_REGS_RSP]), &c->dst.val,
1119 c->dst.bytes, ctxt->vcpu);
1120 if (rc != 0)
1121 return rc;
1122 c->dst.type = OP_NONE;
1123 break;
1124 default:
1125 DPRINTF("Cannot emulate %02x\n", c->b);
1126 return X86EMUL_UNHANDLEABLE;
1127 }
1128 return 0;
1129}
1130
1131static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
1132 struct x86_emulate_ops *ops,
1133 unsigned long memop)
1134{
1135 struct decode_cache *c = &ctxt->decode;
1136 u64 old, new;
1137 int rc;
1138
1139 rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
1140 if (rc != 0)
1141 return rc;
1142
1143 if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
1144 ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
1145
1146 c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
1147 c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
1148 ctxt->eflags &= ~EFLG_ZF;
1149
1150 } else {
1151 new = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
1152 (u32) c->regs[VCPU_REGS_RBX];
1153
1154 rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
1155 if (rc != 0)
1156 return rc;
1157 ctxt->eflags |= EFLG_ZF;
1158 }
1159 return 0;
1160}
1161
1162static inline int writeback(struct x86_emulate_ctxt *ctxt,
1163 struct x86_emulate_ops *ops)
1164{
1165 int rc;
1166 struct decode_cache *c = &ctxt->decode;
1167
1168 switch (c->dst.type) {
1169 case OP_REG:
1170 /* The 4-byte case *is* correct:
1171 * in 64-bit mode we zero-extend.
1172 */
1173 switch (c->dst.bytes) {
1174 case 1:
1175 *(u8 *)c->dst.ptr = (u8)c->dst.val;
1176 break;
1177 case 2:
1178 *(u16 *)c->dst.ptr = (u16)c->dst.val;
1179 break;
1180 case 4:
1181 *c->dst.ptr = (u32)c->dst.val;
1182 break; /* 64b: zero-ext */
1183 case 8:
1184 *c->dst.ptr = c->dst.val;
1185 break;
1186 }
1187 break;
1188 case OP_MEM:
1189 if (c->lock_prefix)
1190 rc = ops->cmpxchg_emulated(
1191 (unsigned long)c->dst.ptr,
1192 &c->dst.orig_val,
1193 &c->dst.val,
1194 c->dst.bytes,
1195 ctxt->vcpu);
1196 else
1197 rc = ops->write_emulated(
1198 (unsigned long)c->dst.ptr,
1199 &c->dst.val,
1200 c->dst.bytes,
1201 ctxt->vcpu);
1202 if (rc != 0)
1203 return rc;
1204 break;
1205 case OP_NONE:
1206 /* no writeback */
1207 break;
1208 default:
1209 break;
1210 }
1211 return 0;
1212}
1213
1214int
1215x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1216{
1217 unsigned long memop = 0;
1218 u64 msr_data;
1219 unsigned long saved_eip = 0;
1220 struct decode_cache *c = &ctxt->decode;
1221 int rc = 0;
1222
1223 /* Shadow copy of register state. Committed on successful emulation.
1224 * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
1225 * modify them.
1226 */
1227
1228 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
1229 saved_eip = c->eip;
1230
1231 if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
1232 memop = c->modrm_ea;
1233
1234 if (c->rep_prefix && (c->d & String)) {
1235 /* All REP prefixes have the same first termination condition */
1236 if (c->regs[VCPU_REGS_RCX] == 0) {
1237 ctxt->vcpu->arch.rip = c->eip;
1238 goto done;
1239 }
1240 /* The second termination condition only applies for REPE
1241 * and REPNE. Test if the repeat string operation prefix is
1242 * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
1243 * corresponding termination condition according to:
1244 * - if REPE/REPZ and ZF = 0 then done
1245 * - if REPNE/REPNZ and ZF = 1 then done
1246 */
1247 if ((c->b == 0xa6) || (c->b == 0xa7) ||
1248 (c->b == 0xae) || (c->b == 0xaf)) {
1249 if ((c->rep_prefix == REPE_PREFIX) &&
1250 ((ctxt->eflags & EFLG_ZF) == 0)) {
1251 ctxt->vcpu->arch.rip = c->eip;
1252 goto done;
1253 }
1254 if ((c->rep_prefix == REPNE_PREFIX) &&
1255 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) {
1256 ctxt->vcpu->arch.rip = c->eip;
1257 goto done;
1258 }
1259 }
1260 c->regs[VCPU_REGS_RCX]--;
1261 c->eip = ctxt->vcpu->arch.rip;
1262 }
1263
1264 if (c->src.type == OP_MEM) {
1265 c->src.ptr = (unsigned long *)memop;
1266 c->src.val = 0;
1267 rc = ops->read_emulated((unsigned long)c->src.ptr,
1268 &c->src.val,
1269 c->src.bytes,
1270 ctxt->vcpu);
1271 if (rc != 0)
1272 goto done;
1273 c->src.orig_val = c->src.val;
1274 }
1275
1276 if ((c->d & DstMask) == ImplicitOps)
1277 goto special_insn;
1278
1279
1280 if (c->dst.type == OP_MEM) {
1281 c->dst.ptr = (unsigned long *)memop;
1282 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1283 c->dst.val = 0;
1284 if (c->d & BitOp) {
1285 unsigned long mask = ~(c->dst.bytes * 8 - 1);
1286
1287 c->dst.ptr = (void *)c->dst.ptr +
1288 (c->src.val & mask) / 8;
1289 }
1290 if (!(c->d & Mov) &&
1291 /* optimisation - avoid slow emulated read */
1292 ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
1293 &c->dst.val,
1294 c->dst.bytes, ctxt->vcpu)) != 0))
1295 goto done;
1296 }
1297 c->dst.orig_val = c->dst.val;
1298
1299special_insn:
1300
1301 if (c->twobyte)
1302 goto twobyte_insn;
1303
1304 switch (c->b) {
1305 case 0x00 ... 0x05:
1306 add: /* add */
1307 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
1308 break;
1309 case 0x08 ... 0x0d:
1310 or: /* or */
1311 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
1312 break;
1313 case 0x10 ... 0x15:
1314 adc: /* adc */
1315 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
1316 break;
1317 case 0x18 ... 0x1d:
1318 sbb: /* sbb */
1319 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
1320 break;
1321 case 0x20 ... 0x23:
1322 and: /* and */
1323 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
1324 break;
1325 case 0x24: /* and al imm8 */
1326 c->dst.type = OP_REG;
1327 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1328 c->dst.val = *(u8 *)c->dst.ptr;
1329 c->dst.bytes = 1;
1330 c->dst.orig_val = c->dst.val;
1331 goto and;
1332 case 0x25: /* and ax imm16, or eax imm32 */
1333 c->dst.type = OP_REG;
1334 c->dst.bytes = c->op_bytes;
1335 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1336 if (c->op_bytes == 2)
1337 c->dst.val = *(u16 *)c->dst.ptr;
1338 else
1339 c->dst.val = *(u32 *)c->dst.ptr;
1340 c->dst.orig_val = c->dst.val;
1341 goto and;
1342 case 0x28 ... 0x2d:
1343 sub: /* sub */
1344 emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
1345 break;
1346 case 0x30 ... 0x35:
1347 xor: /* xor */
1348 emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
1349 break;
1350 case 0x38 ... 0x3d:
1351 cmp: /* cmp */
1352 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
1353 break;
1354 case 0x40 ... 0x47: /* inc r16/r32 */
1355 emulate_1op("inc", c->dst, ctxt->eflags);
1356 break;
1357 case 0x48 ... 0x4f: /* dec r16/r32 */
1358 emulate_1op("dec", c->dst, ctxt->eflags);
1359 break;
1360 case 0x50 ... 0x57: /* push reg */
1361 c->dst.type = OP_MEM;
1362 c->dst.bytes = c->op_bytes;
1363 c->dst.val = c->src.val;
1364 register_address_increment(c->regs[VCPU_REGS_RSP],
1365 -c->op_bytes);
1366 c->dst.ptr = (void *) register_address(
1367 ctxt->ss_base, c->regs[VCPU_REGS_RSP]);
1368 break;
1369 case 0x58 ... 0x5f: /* pop reg */
1370 pop_instruction:
1371 if ((rc = ops->read_std(register_address(ctxt->ss_base,
1372 c->regs[VCPU_REGS_RSP]), c->dst.ptr,
1373 c->op_bytes, ctxt->vcpu)) != 0)
1374 goto done;
1375
1376 register_address_increment(c->regs[VCPU_REGS_RSP],
1377 c->op_bytes);
1378 c->dst.type = OP_NONE; /* Disable writeback. */
1379 break;
1380 case 0x63: /* movsxd */
1381 if (ctxt->mode != X86EMUL_MODE_PROT64)
1382 goto cannot_emulate;
1383 c->dst.val = (s32) c->src.val;
1384 break;
1385 case 0x6a: /* push imm8 */
1386 c->src.val = 0L;
1387 c->src.val = insn_fetch(s8, 1, c->eip);
1388 emulate_push(ctxt);
1389 break;
1390 case 0x6c: /* insb */
1391 case 0x6d: /* insw/insd */
1392 if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
1393 1,
1394 (c->d & ByteOp) ? 1 : c->op_bytes,
1395 c->rep_prefix ?
1396 address_mask(c->regs[VCPU_REGS_RCX]) : 1,
1397 (ctxt->eflags & EFLG_DF),
1398 register_address(ctxt->es_base,
1399 c->regs[VCPU_REGS_RDI]),
1400 c->rep_prefix,
1401 c->regs[VCPU_REGS_RDX]) == 0) {
1402 c->eip = saved_eip;
1403 return -1;
1404 }
1405 return 0;
1406 case 0x6e: /* outsb */
1407 case 0x6f: /* outsw/outsd */
1408 if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
1409 0,
1410 (c->d & ByteOp) ? 1 : c->op_bytes,
1411 c->rep_prefix ?
1412 address_mask(c->regs[VCPU_REGS_RCX]) : 1,
1413 (ctxt->eflags & EFLG_DF),
1414 register_address(c->override_base ?
1415 *c->override_base :
1416 ctxt->ds_base,
1417 c->regs[VCPU_REGS_RSI]),
1418 c->rep_prefix,
1419 c->regs[VCPU_REGS_RDX]) == 0) {
1420 c->eip = saved_eip;
1421 return -1;
1422 }
1423 return 0;
1424 case 0x70 ... 0x7f: /* jcc (short) */ {
1425 int rel = insn_fetch(s8, 1, c->eip);
1426
1427 if (test_cc(c->b, ctxt->eflags))
1428 JMP_REL(rel);
1429 break;
1430 }
1431 case 0x80 ... 0x83: /* Grp1 */
1432 switch (c->modrm_reg) {
1433 case 0:
1434 goto add;
1435 case 1:
1436 goto or;
1437 case 2:
1438 goto adc;
1439 case 3:
1440 goto sbb;
1441 case 4:
1442 goto and;
1443 case 5:
1444 goto sub;
1445 case 6:
1446 goto xor;
1447 case 7:
1448 goto cmp;
1449 }
1450 break;
1451 case 0x84 ... 0x85:
1452 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
1453 break;
1454 case 0x86 ... 0x87: /* xchg */
1455 /* Write back the register source. */
1456 switch (c->dst.bytes) {
1457 case 1:
1458 *(u8 *) c->src.ptr = (u8) c->dst.val;
1459 break;
1460 case 2:
1461 *(u16 *) c->src.ptr = (u16) c->dst.val;
1462 break;
1463 case 4:
1464 *c->src.ptr = (u32) c->dst.val;
1465 break; /* 64b reg: zero-extend */
1466 case 8:
1467 *c->src.ptr = c->dst.val;
1468 break;
1469 }
1470 /*
1471 * Write back the memory destination with implicit LOCK
1472 * prefix.
1473 */
1474 c->dst.val = c->src.val;
1475 c->lock_prefix = 1;
1476 break;
1477 case 0x88 ... 0x8b: /* mov */
1478 goto mov;
1479 case 0x8d: /* lea r16/r32, m */
1480 c->dst.val = c->modrm_val;
1481 break;
1482 case 0x8f: /* pop (sole member of Grp1a) */
1483 rc = emulate_grp1a(ctxt, ops);
1484 if (rc != 0)
1485 goto done;
1486 break;
1487 case 0x9c: /* pushf */
1488 c->src.val = (unsigned long) ctxt->eflags;
1489 emulate_push(ctxt);
1490 break;
1491 case 0x9d: /* popf */
1492 c->dst.ptr = (unsigned long *) &ctxt->eflags;
1493 goto pop_instruction;
1494 case 0xa0 ... 0xa1: /* mov */
1495 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
1496 c->dst.val = c->src.val;
1497 break;
1498 case 0xa2 ... 0xa3: /* mov */
1499 c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
1500 break;
1501 case 0xa4 ... 0xa5: /* movs */
1502 c->dst.type = OP_MEM;
1503 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1504 c->dst.ptr = (unsigned long *)register_address(
1505 ctxt->es_base,
1506 c->regs[VCPU_REGS_RDI]);
1507 if ((rc = ops->read_emulated(register_address(
1508 c->override_base ? *c->override_base :
1509 ctxt->ds_base,
1510 c->regs[VCPU_REGS_RSI]),
1511 &c->dst.val,
1512 c->dst.bytes, ctxt->vcpu)) != 0)
1513 goto done;
1514 register_address_increment(c->regs[VCPU_REGS_RSI],
1515 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1516 : c->dst.bytes);
1517 register_address_increment(c->regs[VCPU_REGS_RDI],
1518 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1519 : c->dst.bytes);
1520 break;
1521 case 0xa6 ... 0xa7: /* cmps */
1522 c->src.type = OP_NONE; /* Disable writeback. */
1523 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1524 c->src.ptr = (unsigned long *)register_address(
1525 c->override_base ? *c->override_base :
1526 ctxt->ds_base,
1527 c->regs[VCPU_REGS_RSI]);
1528 if ((rc = ops->read_emulated((unsigned long)c->src.ptr,
1529 &c->src.val,
1530 c->src.bytes,
1531 ctxt->vcpu)) != 0)
1532 goto done;
1533
1534 c->dst.type = OP_NONE; /* Disable writeback. */
1535 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1536 c->dst.ptr = (unsigned long *)register_address(
1537 ctxt->es_base,
1538 c->regs[VCPU_REGS_RDI]);
1539 if ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
1540 &c->dst.val,
1541 c->dst.bytes,
1542 ctxt->vcpu)) != 0)
1543 goto done;
1544
1545 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
1546
1547 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
1548
1549 register_address_increment(c->regs[VCPU_REGS_RSI],
1550 (ctxt->eflags & EFLG_DF) ? -c->src.bytes
1551 : c->src.bytes);
1552 register_address_increment(c->regs[VCPU_REGS_RDI],
1553 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1554 : c->dst.bytes);
1555
1556 break;
1557 case 0xaa ... 0xab: /* stos */
1558 c->dst.type = OP_MEM;
1559 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1560 c->dst.ptr = (unsigned long *)register_address(
1561 ctxt->es_base,
1562 c->regs[VCPU_REGS_RDI]);
1563 c->dst.val = c->regs[VCPU_REGS_RAX];
1564 register_address_increment(c->regs[VCPU_REGS_RDI],
1565 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1566 : c->dst.bytes);
1567 break;
1568 case 0xac ... 0xad: /* lods */
1569 c->dst.type = OP_REG;
1570 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1571 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
1572 if ((rc = ops->read_emulated(register_address(
1573 c->override_base ? *c->override_base :
1574 ctxt->ds_base,
1575 c->regs[VCPU_REGS_RSI]),
1576 &c->dst.val,
1577 c->dst.bytes,
1578 ctxt->vcpu)) != 0)
1579 goto done;
1580 register_address_increment(c->regs[VCPU_REGS_RSI],
1581 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1582 : c->dst.bytes);
1583 break;
1584 case 0xae ... 0xaf: /* scas */
1585 DPRINTF("Urk! I don't handle SCAS.\n");
1586 goto cannot_emulate;
1587 case 0xc0 ... 0xc1:
1588 emulate_grp2(ctxt);
1589 break;
1590 case 0xc3: /* ret */
1591 c->dst.ptr = &c->eip;
1592 goto pop_instruction;
1593 case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
1594 mov:
1595 c->dst.val = c->src.val;
1596 break;
1597 case 0xd0 ... 0xd1: /* Grp2 */
1598 c->src.val = 1;
1599 emulate_grp2(ctxt);
1600 break;
1601 case 0xd2 ... 0xd3: /* Grp2 */
1602 c->src.val = c->regs[VCPU_REGS_RCX];
1603 emulate_grp2(ctxt);
1604 break;
1605 case 0xe8: /* call (near) */ {
1606 long int rel;
1607 switch (c->op_bytes) {
1608 case 2:
1609 rel = insn_fetch(s16, 2, c->eip);
1610 break;
1611 case 4:
1612 rel = insn_fetch(s32, 4, c->eip);
1613 break;
1614 default:
1615 DPRINTF("Call: Invalid op_bytes\n");
1616 goto cannot_emulate;
1617 }
1618 c->src.val = (unsigned long) c->eip;
1619 JMP_REL(rel);
1620 c->op_bytes = c->ad_bytes;
1621 emulate_push(ctxt);
1622 break;
1623 }
1624 case 0xe9: /* jmp rel */
1625 case 0xeb: /* jmp rel short */
1626 JMP_REL(c->src.val);
1627 c->dst.type = OP_NONE; /* Disable writeback. */
1628 break;
1629 case 0xf4: /* hlt */
1630 ctxt->vcpu->arch.halt_request = 1;
1631 goto done;
1632 case 0xf5: /* cmc */
1633 /* complement carry flag from eflags reg */
1634 ctxt->eflags ^= EFLG_CF;
1635 c->dst.type = OP_NONE; /* Disable writeback. */
1636 break;
1637 case 0xf6 ... 0xf7: /* Grp3 */
1638 rc = emulate_grp3(ctxt, ops);
1639 if (rc != 0)
1640 goto done;
1641 break;
1642 case 0xf8: /* clc */
1643 ctxt->eflags &= ~EFLG_CF;
1644 c->dst.type = OP_NONE; /* Disable writeback. */
1645 break;
1646 case 0xfa: /* cli */
1647 ctxt->eflags &= ~X86_EFLAGS_IF;
1648 c->dst.type = OP_NONE; /* Disable writeback. */
1649 break;
1650 case 0xfb: /* sti */
1651 ctxt->eflags |= X86_EFLAGS_IF;
1652 c->dst.type = OP_NONE; /* Disable writeback. */
1653 break;
1654 case 0xfe ... 0xff: /* Grp4/Grp5 */
1655 rc = emulate_grp45(ctxt, ops);
1656 if (rc != 0)
1657 goto done;
1658 break;
1659 }
1660
1661writeback:
1662 rc = writeback(ctxt, ops);
1663 if (rc != 0)
1664 goto done;
1665
1666 /* Commit shadow register state. */
1667 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
1668 ctxt->vcpu->arch.rip = c->eip;
1669
1670done:
1671 if (rc == X86EMUL_UNHANDLEABLE) {
1672 c->eip = saved_eip;
1673 return -1;
1674 }
1675 return 0;
1676
1677twobyte_insn:
1678 switch (c->b) {
1679 case 0x01: /* lgdt, lidt, lmsw */
1680 switch (c->modrm_reg) {
1681 u16 size;
1682 unsigned long address;
1683
1684 case 0: /* vmcall */
1685 if (c->modrm_mod != 3 || c->modrm_rm != 1)
1686 goto cannot_emulate;
1687
1688 rc = kvm_fix_hypercall(ctxt->vcpu);
1689 if (rc)
1690 goto done;
1691
1692 kvm_emulate_hypercall(ctxt->vcpu);
1693 break;
1694 case 2: /* lgdt */
1695 rc = read_descriptor(ctxt, ops, c->src.ptr,
1696 &size, &address, c->op_bytes);
1697 if (rc)
1698 goto done;
1699 realmode_lgdt(ctxt->vcpu, size, address);
1700 break;
1701 case 3: /* lidt/vmmcall */
1702 if (c->modrm_mod == 3 && c->modrm_rm == 1) {
1703 rc = kvm_fix_hypercall(ctxt->vcpu);
1704 if (rc)
1705 goto done;
1706 kvm_emulate_hypercall(ctxt->vcpu);
1707 } else {
1708 rc = read_descriptor(ctxt, ops, c->src.ptr,
1709 &size, &address,
1710 c->op_bytes);
1711 if (rc)
1712 goto done;
1713 realmode_lidt(ctxt->vcpu, size, address);
1714 }
1715 break;
1716 case 4: /* smsw */
1717 if (c->modrm_mod != 3)
1718 goto cannot_emulate;
1719 *(u16 *)&c->regs[c->modrm_rm]
1720 = realmode_get_cr(ctxt->vcpu, 0);
1721 break;
1722 case 6: /* lmsw */
1723 if (c->modrm_mod != 3)
1724 goto cannot_emulate;
1725 realmode_lmsw(ctxt->vcpu, (u16)c->modrm_val,
1726 &ctxt->eflags);
1727 break;
1728 case 7: /* invlpg*/
1729 emulate_invlpg(ctxt->vcpu, memop);
1730 break;
1731 default:
1732 goto cannot_emulate;
1733 }
1734 /* Disable writeback. */
1735 c->dst.type = OP_NONE;
1736 break;
1737 case 0x06:
1738 emulate_clts(ctxt->vcpu);
1739 c->dst.type = OP_NONE;
1740 break;
1741 case 0x08: /* invd */
1742 case 0x09: /* wbinvd */
1743 case 0x0d: /* GrpP (prefetch) */
1744 case 0x18: /* Grp16 (prefetch/nop) */
1745 c->dst.type = OP_NONE;
1746 break;
1747 case 0x20: /* mov cr, reg */
1748 if (c->modrm_mod != 3)
1749 goto cannot_emulate;
1750 c->regs[c->modrm_rm] =
1751 realmode_get_cr(ctxt->vcpu, c->modrm_reg);
1752 c->dst.type = OP_NONE; /* no writeback */
1753 break;
1754 case 0x21: /* mov from dr to reg */
1755 if (c->modrm_mod != 3)
1756 goto cannot_emulate;
1757 rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
1758 if (rc)
1759 goto cannot_emulate;
1760 c->dst.type = OP_NONE; /* no writeback */
1761 break;
1762 case 0x22: /* mov reg, cr */
1763 if (c->modrm_mod != 3)
1764 goto cannot_emulate;
1765 realmode_set_cr(ctxt->vcpu,
1766 c->modrm_reg, c->modrm_val, &ctxt->eflags);
1767 c->dst.type = OP_NONE;
1768 break;
1769 case 0x23: /* mov from reg to dr */
1770 if (c->modrm_mod != 3)
1771 goto cannot_emulate;
1772 rc = emulator_set_dr(ctxt, c->modrm_reg,
1773 c->regs[c->modrm_rm]);
1774 if (rc)
1775 goto cannot_emulate;
1776 c->dst.type = OP_NONE; /* no writeback */
1777 break;
1778 case 0x30:
1779 /* wrmsr */
1780 msr_data = (u32)c->regs[VCPU_REGS_RAX]
1781 | ((u64)c->regs[VCPU_REGS_RDX] << 32);
1782 rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
1783 if (rc) {
1784 kvm_inject_gp(ctxt->vcpu, 0);
1785 c->eip = ctxt->vcpu->arch.rip;
1786 }
1787 rc = X86EMUL_CONTINUE;
1788 c->dst.type = OP_NONE;
1789 break;
1790 case 0x32:
1791 /* rdmsr */
1792 rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
1793 if (rc) {
1794 kvm_inject_gp(ctxt->vcpu, 0);
1795 c->eip = ctxt->vcpu->arch.rip;
1796 } else {
1797 c->regs[VCPU_REGS_RAX] = (u32)msr_data;
1798 c->regs[VCPU_REGS_RDX] = msr_data >> 32;
1799 }
1800 rc = X86EMUL_CONTINUE;
1801 c->dst.type = OP_NONE;
1802 break;
1803 case 0x40 ... 0x4f: /* cmov */
1804 c->dst.val = c->dst.orig_val = c->src.val;
1805 if (!test_cc(c->b, ctxt->eflags))
1806 c->dst.type = OP_NONE; /* no writeback */
1807 break;
1808 case 0x80 ... 0x8f: /* jnz rel, etc*/ {
1809 long int rel;
1810
1811 switch (c->op_bytes) {
1812 case 2:
1813 rel = insn_fetch(s16, 2, c->eip);
1814 break;
1815 case 4:
1816 rel = insn_fetch(s32, 4, c->eip);
1817 break;
1818 case 8:
1819 rel = insn_fetch(s64, 8, c->eip);
1820 break;
1821 default:
1822 DPRINTF("jnz: Invalid op_bytes\n");
1823 goto cannot_emulate;
1824 }
1825 if (test_cc(c->b, ctxt->eflags))
1826 JMP_REL(rel);
1827 c->dst.type = OP_NONE;
1828 break;
1829 }
1830 case 0xa3:
1831 bt: /* bt */
1832 c->dst.type = OP_NONE;
1833 /* only subword offset */
1834 c->src.val &= (c->dst.bytes << 3) - 1;
1835 emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags);
1836 break;
1837 case 0xab:
1838 bts: /* bts */
1839 /* only subword offset */
1840 c->src.val &= (c->dst.bytes << 3) - 1;
1841 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
1842 break;
1843 case 0xb0 ... 0xb1: /* cmpxchg */
1844 /*
1845 * Save real source value, then compare EAX against
1846 * destination.
1847 */
1848 c->src.orig_val = c->src.val;
1849 c->src.val = c->regs[VCPU_REGS_RAX];
1850 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
1851 if (ctxt->eflags & EFLG_ZF) {
1852 /* Success: write back to memory. */
1853 c->dst.val = c->src.orig_val;
1854 } else {
1855 /* Failure: write the value we saw to EAX. */
1856 c->dst.type = OP_REG;
1857 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
1858 }
1859 break;
1860 case 0xb3:
1861 btr: /* btr */
1862 /* only subword offset */
1863 c->src.val &= (c->dst.bytes << 3) - 1;
1864 emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
1865 break;
1866 case 0xb6 ... 0xb7: /* movzx */
1867 c->dst.bytes = c->op_bytes;
1868 c->dst.val = (c->d & ByteOp) ? (u8) c->src.val
1869 : (u16) c->src.val;
1870 break;
1871 case 0xba: /* Grp8 */
1872 switch (c->modrm_reg & 3) {
1873 case 0:
1874 goto bt;
1875 case 1:
1876 goto bts;
1877 case 2:
1878 goto btr;
1879 case 3:
1880 goto btc;
1881 }
1882 break;
1883 case 0xbb:
1884 btc: /* btc */
1885 /* only subword offset */
1886 c->src.val &= (c->dst.bytes << 3) - 1;
1887 emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
1888 break;
1889 case 0xbe ... 0xbf: /* movsx */
1890 c->dst.bytes = c->op_bytes;
1891 c->dst.val = (c->d & ByteOp) ? (s8) c->src.val :
1892 (s16) c->src.val;
1893 break;
1894 case 0xc3: /* movnti */
1895 c->dst.bytes = c->op_bytes;
1896 c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val :
1897 (u64) c->src.val;
1898 break;
1899 case 0xc7: /* Grp9 (cmpxchg8b) */
1900 rc = emulate_grp9(ctxt, ops, memop);
1901 if (rc != 0)
1902 goto done;
1903 c->dst.type = OP_NONE;
1904 break;
1905 }
1906 goto writeback;
1907
1908cannot_emulate:
1909 DPRINTF("Cannot emulate %02x\n", c->b);
1910 c->eip = saved_eip;
1911 return -1;
1912}
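Aside, not part of the patch: opcode_table[] and twobyte_table[] above pack each opcode's decode attributes into a u16 bit field, and x86_decode_insn() later recovers the destination and source operand classes with DstMask and SrcMask before fetching the operands. A minimal standalone sketch of that decoding, reusing the flag values defined in the file; the table excerpt and describe() helper are invented for illustration:

#include <stdio.h>
#include <stdint.h>

/* Flag encoding copied from x86_emulate.c above. */
#define ByteOp      (1<<0)
#define ImplicitOps (1<<1)
#define DstReg      (2<<1)
#define DstMem      (3<<1)
#define DstMask     (3<<1)
#define SrcNone     (0<<3)
#define SrcReg      (1<<3)
#define SrcMem      (2<<3)
#define SrcImm      (5<<3)
#define SrcImmByte  (6<<3)
#define SrcMask     (7<<3)
#define ModRM       (1<<6)
#define Mov         (1<<7)

/* Tiny excerpt of the opcode table: 0x00 (add r/m8,r8) and 0x88 (mov r/m8,r8). */
static const uint16_t table[2] = {
	ByteOp | DstMem | SrcReg | ModRM,		/* 0x00 */
	ByteOp | DstMem | SrcReg | ModRM | Mov,		/* 0x88 */
};

static void describe(uint16_t d)
{
	/* Mask out the destination and source classes, as x86_decode_insn() does. */
	const char *dst = (d & DstMask) == DstMem ? "mem" :
			  (d & DstMask) == DstReg ? "reg" : "implicit";
	const char *src = (d & SrcMask) == SrcReg ? "reg" :
			  (d & SrcMask) == SrcMem ? "mem" :
			  (d & SrcMask) == SrcImm ? "imm" : "none/other";

	printf("dst=%s src=%s byteop=%d modrm=%d write-only=%d\n",
	       dst, src, !!(d & ByteOp), !!(d & ModRM), !!(d & Mov));
}

int main(void)
{
	describe(table[0]);	/* add r/m8, r8 */
	describe(table[1]);	/* mov r/m8, r8 */
	return 0;
}

x86_decode_insn() in the patch does essentially this in its switch (c->d & SrcMask) and switch (c->d & DstMask) blocks, then dispatches the opcode itself in x86_emulate_insn().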
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index 19626ace0f50..964dfa36d367 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -1,6 +1,7 @@
1config LGUEST_GUEST 1config LGUEST_GUEST
2 bool "Lguest guest support" 2 bool "Lguest guest support"
3 select PARAVIRT 3 select PARAVIRT
4 depends on X86_32
4 depends on !X86_PAE 5 depends on !X86_PAE
5 depends on !(X86_VISWS || X86_VOYAGER) 6 depends on !(X86_VISWS || X86_VOYAGER)
6 select VIRTIO 7 select VIRTIO
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index d6b18e2e5431..5afdde4895dc 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -176,8 +176,8 @@ static void lguest_leave_lazy_mode(void)
176 * check there when it wants to deliver an interrupt. 176 * check there when it wants to deliver an interrupt.
177 */ 177 */
178 178
179/* save_flags() is expected to return the processor state (ie. "eflags"). The 179/* save_flags() is expected to return the processor state (ie. "flags"). The
180 * eflags word contains all kind of stuff, but in practice Linux only cares 180 * flags word contains all kind of stuff, but in practice Linux only cares
181 * about the interrupt flag. Our "save_flags()" just returns that. */ 181 * about the interrupt flag. Our "save_flags()" just returns that. */
182static unsigned long save_fl(void) 182static unsigned long save_fl(void)
183{ 183{
@@ -218,19 +218,20 @@ static void irq_enable(void)
218 * address of the handler, and... well, who cares? The Guest just asks the 218 * address of the handler, and... well, who cares? The Guest just asks the
219 * Host to make the change anyway, because the Host controls the real IDT. 219 * Host to make the change anyway, because the Host controls the real IDT.
220 */ 220 */
221static void lguest_write_idt_entry(struct desc_struct *dt, 221static void lguest_write_idt_entry(gate_desc *dt,
222 int entrynum, u32 low, u32 high) 222 int entrynum, const gate_desc *g)
223{ 223{
224 u32 *desc = (u32 *)g;
224 /* Keep the local copy up to date. */ 225 /* Keep the local copy up to date. */
225 write_dt_entry(dt, entrynum, low, high); 226 native_write_idt_entry(dt, entrynum, g);
226 /* Tell Host about this new entry. */ 227 /* Tell Host about this new entry. */
227 hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, low, high); 228 hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]);
228} 229}
229 230
230/* Changing to a different IDT is very rare: we keep the IDT up-to-date every 231/* Changing to a different IDT is very rare: we keep the IDT up-to-date every
231 * time it is written, so we can simply loop through all entries and tell the 232 * time it is written, so we can simply loop through all entries and tell the
232 * Host about them. */ 233 * Host about them. */
233static void lguest_load_idt(const struct Xgt_desc_struct *desc) 234static void lguest_load_idt(const struct desc_ptr *desc)
234{ 235{
235 unsigned int i; 236 unsigned int i;
236 struct desc_struct *idt = (void *)desc->address; 237 struct desc_struct *idt = (void *)desc->address;
@@ -253,7 +254,7 @@ static void lguest_load_idt(const struct Xgt_desc_struct *desc)
253 * hypercall and use that repeatedly to load a new IDT. I don't think it 254 * hypercall and use that repeatedly to load a new IDT. I don't think it
254 * really matters, but wouldn't it be nice if they were the same? 255 * really matters, but wouldn't it be nice if they were the same?
255 */ 256 */
256static void lguest_load_gdt(const struct Xgt_desc_struct *desc) 257static void lguest_load_gdt(const struct desc_ptr *desc)
257{ 258{
258 BUG_ON((desc->size+1)/8 != GDT_ENTRIES); 259 BUG_ON((desc->size+1)/8 != GDT_ENTRIES);
259 hcall(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES, 0); 260 hcall(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES, 0);
@@ -262,10 +263,10 @@ static void lguest_load_gdt(const struct Xgt_desc_struct *desc)
262/* For a single GDT entry which changes, we do the lazy thing: alter our GDT, 263/* For a single GDT entry which changes, we do the lazy thing: alter our GDT,
263 * then tell the Host to reload the entire thing. This operation is so rare 264 * then tell the Host to reload the entire thing. This operation is so rare
264 * that this naive implementation is reasonable. */ 265 * that this naive implementation is reasonable. */
265static void lguest_write_gdt_entry(struct desc_struct *dt, 266static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
266 int entrynum, u32 low, u32 high) 267 const void *desc, int type)
267{ 268{
268 write_dt_entry(dt, entrynum, low, high); 269 native_write_gdt_entry(dt, entrynum, desc, type);
269 hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0); 270 hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0);
270} 271}
271 272
@@ -324,30 +325,30 @@ static void lguest_load_tr_desc(void)
324 * anyone (including userspace) can just use the raw "cpuid" instruction and 325 * anyone (including userspace) can just use the raw "cpuid" instruction and
325 * the Host won't even notice since it isn't privileged. So we try not to get 326 * the Host won't even notice since it isn't privileged. So we try not to get
326 * too worked up about it. */ 327 * too worked up about it. */
327static void lguest_cpuid(unsigned int *eax, unsigned int *ebx, 328static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
328 unsigned int *ecx, unsigned int *edx) 329 unsigned int *cx, unsigned int *dx)
329{ 330{
330 int function = *eax; 331 int function = *ax;
331 332
332 native_cpuid(eax, ebx, ecx, edx); 333 native_cpuid(ax, bx, cx, dx);
333 switch (function) { 334 switch (function) {
334 case 1: /* Basic feature request. */ 335 case 1: /* Basic feature request. */
335 /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ 336 /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
336 *ecx &= 0x00002201; 337 *cx &= 0x00002201;
337 /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */ 338 /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */
338 *edx &= 0x07808101; 339 *dx &= 0x07808101;
339 /* The Host can do a nice optimization if it knows that the 340 /* The Host can do a nice optimization if it knows that the
340 * kernel mappings (addresses above 0xC0000000 or whatever 341 * kernel mappings (addresses above 0xC0000000 or whatever
341 * PAGE_OFFSET is set to) haven't changed. But Linux calls 342 * PAGE_OFFSET is set to) haven't changed. But Linux calls
342 * flush_tlb_user() for both user and kernel mappings unless 343 * flush_tlb_user() for both user and kernel mappings unless
343 * the Page Global Enable (PGE) feature bit is set. */ 344 * the Page Global Enable (PGE) feature bit is set. */
344 *edx |= 0x00002000; 345 *dx |= 0x00002000;
345 break; 346 break;
346 case 0x80000000: 347 case 0x80000000:
347 /* Futureproof this a little: if they ask how much extended 348 /* Futureproof this a little: if they ask how much extended
348 * processor information there is, limit it to known fields. */ 349 * processor information there is, limit it to known fields. */
349 if (*eax > 0x80000008) 350 if (*ax > 0x80000008)
350 *eax = 0x80000008; 351 *ax = 0x80000008;
351 break; 352 break;
352 } 353 }
353} 354}
@@ -756,10 +757,10 @@ static void lguest_time_init(void)
756 * segment), the privilege level (we're privilege level 1, the Host is 0 and 757 * segment), the privilege level (we're privilege level 1, the Host is 0 and
757 * will not tolerate us trying to use that), the stack pointer, and the number 758 * will not tolerate us trying to use that), the stack pointer, and the number
758 * of pages in the stack. */ 759 * of pages in the stack. */
759static void lguest_load_esp0(struct tss_struct *tss, 760static void lguest_load_sp0(struct tss_struct *tss,
760 struct thread_struct *thread) 761 struct thread_struct *thread)
761{ 762{
762 lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->esp0, 763 lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->sp0,
763 THREAD_SIZE/PAGE_SIZE); 764 THREAD_SIZE/PAGE_SIZE);
764} 765}
765 766
@@ -789,11 +790,11 @@ static void lguest_wbinvd(void)
789 * code qualifies for Advanced. It will also never interrupt anything. It 790 * code qualifies for Advanced. It will also never interrupt anything. It
790 * does, however, allow us to get through the Linux boot code. */ 791 * does, however, allow us to get through the Linux boot code. */
791#ifdef CONFIG_X86_LOCAL_APIC 792#ifdef CONFIG_X86_LOCAL_APIC
792static void lguest_apic_write(unsigned long reg, unsigned long v) 793static void lguest_apic_write(unsigned long reg, u32 v)
793{ 794{
794} 795}
795 796
796static unsigned long lguest_apic_read(unsigned long reg) 797static u32 lguest_apic_read(unsigned long reg)
797{ 798{
798 return 0; 799 return 0;
799} 800}
@@ -963,7 +964,7 @@ __init void lguest_init(void)
963 pv_cpu_ops.cpuid = lguest_cpuid; 964 pv_cpu_ops.cpuid = lguest_cpuid;
964 pv_cpu_ops.load_idt = lguest_load_idt; 965 pv_cpu_ops.load_idt = lguest_load_idt;
965 pv_cpu_ops.iret = lguest_iret; 966 pv_cpu_ops.iret = lguest_iret;
966 pv_cpu_ops.load_esp0 = lguest_load_esp0; 967 pv_cpu_ops.load_sp0 = lguest_load_sp0;
967 pv_cpu_ops.load_tr_desc = lguest_load_tr_desc; 968 pv_cpu_ops.load_tr_desc = lguest_load_tr_desc;
968 pv_cpu_ops.set_ldt = lguest_set_ldt; 969 pv_cpu_ops.set_ldt = lguest_set_ldt;
969 pv_cpu_ops.load_tls = lguest_load_tls; 970 pv_cpu_ops.load_tls = lguest_load_tls;
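
The lguest hunks above convert the descriptor-table hooks to the new gate_desc/desc_ptr types, but the underlying pattern is unchanged: update the guest-local table with the native helper, then tell the Host about the change through a hypercall. A reduced user-space model of that pattern, assuming invented types and call numbers (only the idea of passing the two 32-bit descriptor words to the Host mirrors the code above):

/* Pattern sketch only: keep the guest's local descriptor table current,
 * then notify the hypervisor.  fake_gate, HCALL_LOAD_IDT_ENTRY and
 * notify_host() stand in for gate_desc, LHCALL_LOAD_IDT_ENTRY and hcall(). */
#include <stdint.h>

struct fake_gate { uint32_t a, b; };            /* stand-in for gate_desc    */

enum { HCALL_LOAD_IDT_ENTRY = 1 };              /* illustrative call number  */

static struct fake_gate idt[256];               /* guest-local copy of the IDT */

static void notify_host(unsigned long call, unsigned long a1,
                        unsigned long a2, unsigned long a3)
{
        /* a real guest would trap to the Host here */
        (void)call; (void)a1; (void)a2; (void)a3;
}

static void my_write_idt_entry(int entrynum, const struct fake_gate *g)
{
        const uint32_t *desc = (const uint32_t *)g;

        idt[entrynum] = *g;                      /* keep the local copy current */
        notify_host(HCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]);
}

int main(void)
{
        struct fake_gate g = { 0x12345678, 0x9abcdef0 };

        my_write_idt_entry(32, &g);
        return 0;
}
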
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 329da276c6f1..4876182daf8a 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -1,5 +1,27 @@
1#
2# Makefile for x86 specific library files.
3#
4
5obj-$(CONFIG_SMP) := msr-on-cpu.o
6
7lib-y := delay_$(BITS).o
8lib-y += usercopy_$(BITS).o getuser_$(BITS).o putuser_$(BITS).o
9lib-y += memcpy_$(BITS).o
10
1ifeq ($(CONFIG_X86_32),y) 11ifeq ($(CONFIG_X86_32),y)
2include ${srctree}/arch/x86/lib/Makefile_32 12 lib-y += checksum_32.o
13 lib-y += strstr_32.o
14 lib-y += bitops_32.o semaphore_32.o string_32.o
15
16 lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
3else 17else
4include ${srctree}/arch/x86/lib/Makefile_64 18 obj-y += io_64.o iomap_copy_64.o
19
20 CFLAGS_csum-partial_64.o := -funroll-loops
21
22 lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
23 lib-y += thunk_64.o clear_page_64.o copy_page_64.o
24 lib-y += bitstr_64.o bitops_64.o
25 lib-y += memmove_64.o memset_64.o
26 lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
5endif 27endif
diff --git a/arch/x86/lib/Makefile_32 b/arch/x86/lib/Makefile_32
deleted file mode 100644
index 98d1f1e2e2ef..000000000000
--- a/arch/x86/lib/Makefile_32
+++ /dev/null
@@ -1,11 +0,0 @@
1#
2# Makefile for i386-specific library files..
3#
4
5
6lib-y = checksum_32.o delay_32.o usercopy_32.o getuser_32.o putuser_32.o memcpy_32.o strstr_32.o \
7 bitops_32.o semaphore_32.o string_32.o
8
9lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
10
11obj-$(CONFIG_SMP) += msr-on-cpu.o
diff --git a/arch/x86/lib/Makefile_64 b/arch/x86/lib/Makefile_64
deleted file mode 100644
index bbabad3c9335..000000000000
--- a/arch/x86/lib/Makefile_64
+++ /dev/null
@@ -1,13 +0,0 @@
1#
2# Makefile for x86_64-specific library files.
3#
4
5CFLAGS_csum-partial_64.o := -funroll-loops
6
7obj-y := io_64.o iomap_copy_64.o
8obj-$(CONFIG_SMP) += msr-on-cpu.o
9
10lib-y := csum-partial_64.o csum-copy_64.o csum-wrappers_64.o delay_64.o \
11 usercopy_64.o getuser_64.o putuser_64.o \
12 thunk_64.o clear_page_64.o copy_page_64.o bitstr_64.o bitops_64.o
13lib-y += memcpy_64.o memmove_64.o memset_64.o copy_user_64.o rwlock_64.o copy_user_nocache_64.o
diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c
index 8ac51b82a632..37756b6fb329 100644
--- a/arch/x86/lib/memcpy_32.c
+++ b/arch/x86/lib/memcpy_32.c
@@ -34,8 +34,8 @@ void *memmove(void *dest, const void *src, size_t n)
34 "cld" 34 "cld"
35 : "=&c" (d0), "=&S" (d1), "=&D" (d2) 35 : "=&c" (d0), "=&S" (d1), "=&D" (d2)
36 :"0" (n), 36 :"0" (n),
37 "1" (n-1+(const char *)src), 37 "1" (n-1+src),
38 "2" (n-1+(char *)dest) 38 "2" (n-1+dest)
39 :"memory"); 39 :"memory");
40 } 40 }
41 return dest; 41 return dest;
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
index 751ebae8ec42..80175e47b190 100644
--- a/arch/x86/lib/memmove_64.c
+++ b/arch/x86/lib/memmove_64.c
@@ -11,8 +11,8 @@ void *memmove(void * dest,const void *src,size_t count)
11 if (dest < src) { 11 if (dest < src) {
12 return memcpy(dest,src,count); 12 return memcpy(dest,src,count);
13 } else { 13 } else {
14 char *p = (char *) dest + count; 14 char *p = dest + count;
15 char *s = (char *) src + count; 15 const char *s = src + count;
16 while (count--) 16 while (count--)
17 *--p = *--s; 17 *--p = *--s;
18 } 18 }
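
The memmove fix above only changes the pointer types (p becomes char *, s becomes const char *); the logic is the standard overlap-safe copy, copying forward when dest is below src and backward otherwise. A portable user-space rendering of the same idea, not the kernel's optimized version:

/* Overlap-safe copy, same direction choice as the kernel memmove above. */
#include <stddef.h>

void *my_memmove(void *dest, const void *src, size_t count)
{
        char *d = dest;
        const char *s = src;

        if (d < s) {
                while (count--)
                        *d++ = *s++;    /* forward copy is safe here          */
        } else {
                d += count;
                s += count;
                while (count--)
                        *--d = *--s;    /* copy backwards so the tail survives */
        }
        return dest;
}
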
diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S
index 444fba400983..3899bd37fdf0 100644
--- a/arch/x86/lib/semaphore_32.S
+++ b/arch/x86/lib/semaphore_32.S
@@ -29,7 +29,7 @@
29 * registers (%eax, %edx and %ecx) except %eax whish is either a return 29 * registers (%eax, %edx and %ecx) except %eax whish is either a return
30 * value or just clobbered.. 30 * value or just clobbered..
31 */ 31 */
32 .section .sched.text 32 .section .sched.text, "ax"
33ENTRY(__down_failed) 33ENTRY(__down_failed)
34 CFI_STARTPROC 34 CFI_STARTPROC
35 FRAME 35 FRAME
@@ -49,7 +49,7 @@ ENTRY(__down_failed)
49 ENDFRAME 49 ENDFRAME
50 ret 50 ret
51 CFI_ENDPROC 51 CFI_ENDPROC
52 END(__down_failed) 52 ENDPROC(__down_failed)
53 53
54ENTRY(__down_failed_interruptible) 54ENTRY(__down_failed_interruptible)
55 CFI_STARTPROC 55 CFI_STARTPROC
@@ -70,7 +70,7 @@ ENTRY(__down_failed_interruptible)
70 ENDFRAME 70 ENDFRAME
71 ret 71 ret
72 CFI_ENDPROC 72 CFI_ENDPROC
73 END(__down_failed_interruptible) 73 ENDPROC(__down_failed_interruptible)
74 74
75ENTRY(__down_failed_trylock) 75ENTRY(__down_failed_trylock)
76 CFI_STARTPROC 76 CFI_STARTPROC
@@ -91,7 +91,7 @@ ENTRY(__down_failed_trylock)
91 ENDFRAME 91 ENDFRAME
92 ret 92 ret
93 CFI_ENDPROC 93 CFI_ENDPROC
94 END(__down_failed_trylock) 94 ENDPROC(__down_failed_trylock)
95 95
96ENTRY(__up_wakeup) 96ENTRY(__up_wakeup)
97 CFI_STARTPROC 97 CFI_STARTPROC
@@ -112,7 +112,7 @@ ENTRY(__up_wakeup)
112 ENDFRAME 112 ENDFRAME
113 ret 113 ret
114 CFI_ENDPROC 114 CFI_ENDPROC
115 END(__up_wakeup) 115 ENDPROC(__up_wakeup)
116 116
117/* 117/*
118 * rw spinlock fallbacks 118 * rw spinlock fallbacks
@@ -132,7 +132,7 @@ ENTRY(__write_lock_failed)
132 ENDFRAME 132 ENDFRAME
133 ret 133 ret
134 CFI_ENDPROC 134 CFI_ENDPROC
135 END(__write_lock_failed) 135 ENDPROC(__write_lock_failed)
136 136
137ENTRY(__read_lock_failed) 137ENTRY(__read_lock_failed)
138 CFI_STARTPROC 138 CFI_STARTPROC
@@ -148,7 +148,7 @@ ENTRY(__read_lock_failed)
148 ENDFRAME 148 ENDFRAME
149 ret 149 ret
150 CFI_ENDPROC 150 CFI_ENDPROC
151 END(__read_lock_failed) 151 ENDPROC(__read_lock_failed)
152 152
153#endif 153#endif
154 154
@@ -170,7 +170,7 @@ ENTRY(call_rwsem_down_read_failed)
170 CFI_ADJUST_CFA_OFFSET -4 170 CFI_ADJUST_CFA_OFFSET -4
171 ret 171 ret
172 CFI_ENDPROC 172 CFI_ENDPROC
173 END(call_rwsem_down_read_failed) 173 ENDPROC(call_rwsem_down_read_failed)
174 174
175ENTRY(call_rwsem_down_write_failed) 175ENTRY(call_rwsem_down_write_failed)
176 CFI_STARTPROC 176 CFI_STARTPROC
@@ -182,7 +182,7 @@ ENTRY(call_rwsem_down_write_failed)
182 CFI_ADJUST_CFA_OFFSET -4 182 CFI_ADJUST_CFA_OFFSET -4
183 ret 183 ret
184 CFI_ENDPROC 184 CFI_ENDPROC
185 END(call_rwsem_down_write_failed) 185 ENDPROC(call_rwsem_down_write_failed)
186 186
187ENTRY(call_rwsem_wake) 187ENTRY(call_rwsem_wake)
188 CFI_STARTPROC 188 CFI_STARTPROC
@@ -196,7 +196,7 @@ ENTRY(call_rwsem_wake)
196 CFI_ADJUST_CFA_OFFSET -4 196 CFI_ADJUST_CFA_OFFSET -4
1971: ret 1971: ret
198 CFI_ENDPROC 198 CFI_ENDPROC
199 END(call_rwsem_wake) 199 ENDPROC(call_rwsem_wake)
200 200
201/* Fix up special calling conventions */ 201/* Fix up special calling conventions */
202ENTRY(call_rwsem_downgrade_wake) 202ENTRY(call_rwsem_downgrade_wake)
@@ -214,6 +214,6 @@ ENTRY(call_rwsem_downgrade_wake)
214 CFI_ADJUST_CFA_OFFSET -4 214 CFI_ADJUST_CFA_OFFSET -4
215 ret 215 ret
216 CFI_ENDPROC 216 CFI_ENDPROC
217 END(call_rwsem_downgrade_wake) 217 ENDPROC(call_rwsem_downgrade_wake)
218 218
219#endif 219#endif
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index 6ea73f3de567..8b92d428ab02 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -33,7 +33,7 @@
33 .endm 33 .endm
34 34
35 35
36 .section .sched.text 36 .section .sched.text, "ax"
37#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM 37#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
38 thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed 38 thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed
39 thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed 39 thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed
diff --git a/arch/x86/mach-rdc321x/Makefile b/arch/x86/mach-rdc321x/Makefile
new file mode 100644
index 000000000000..1faac8125e3d
--- /dev/null
+++ b/arch/x86/mach-rdc321x/Makefile
@@ -0,0 +1,5 @@
1#
2# Makefile for the RDC321x specific parts of the kernel
3#
4obj-$(CONFIG_X86_RDC321X) := gpio.o platform.o wdt.o
5
diff --git a/arch/x86/mach-rdc321x/gpio.c b/arch/x86/mach-rdc321x/gpio.c
new file mode 100644
index 000000000000..031269163bd6
--- /dev/null
+++ b/arch/x86/mach-rdc321x/gpio.c
@@ -0,0 +1,91 @@
1/*
2 * Copyright (C) 2007, OpenWrt.org, Florian Fainelli <florian@openwrt.org>
3 * RDC321x architecture specific GPIO support
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License as published by the
7 * Free Software Foundation; either version 2 of the License, or (at your
8 * option) any later version.
9 */
10
11#include <linux/autoconf.h>
12#include <linux/init.h>
13#include <linux/io.h>
14#include <linux/types.h>
15#include <linux/module.h>
16#include <linux/delay.h>
17
18#include <asm/mach-rdc321x/rdc321x_defs.h>
19
20static inline int rdc_gpio_is_valid(unsigned gpio)
21{
22 return (gpio <= RDC_MAX_GPIO);
23}
24
25static unsigned int rdc_gpio_read(unsigned gpio)
26{
27 unsigned int val;
28
29 val = 0x80000000 | (7 << 11) | ((gpio&0x20?0x84:0x48));
30 outl(val, RDC3210_CFGREG_ADDR);
31 udelay(10);
32 val = inl(RDC3210_CFGREG_DATA);
33 val |= (0x1 << (gpio & 0x1F));
34 outl(val, RDC3210_CFGREG_DATA);
35 udelay(10);
36 val = 0x80000000 | (7 << 11) | ((gpio&0x20?0x88:0x4C));
37 outl(val, RDC3210_CFGREG_ADDR);
38 udelay(10);
39 val = inl(RDC3210_CFGREG_DATA);
40
41 return val;
42}
43
44static void rdc_gpio_write(unsigned int val)
45{
46 if (val) {
47 outl(val, RDC3210_CFGREG_DATA);
48 udelay(10);
49 }
50}
51
52int rdc_gpio_get_value(unsigned gpio)
53{
54 if (rdc_gpio_is_valid(gpio))
55 return (int)rdc_gpio_read(gpio);
56 else
57 return -EINVAL;
58}
59EXPORT_SYMBOL(rdc_gpio_get_value);
60
61void rdc_gpio_set_value(unsigned gpio, int value)
62{
63 unsigned int val;
64
65 if (!rdc_gpio_is_valid(gpio))
66 return;
67
68 val = rdc_gpio_read(gpio);
69
70 if (value)
71 val &= ~(0x1 << (gpio & 0x1F));
72 else
73 val |= (0x1 << (gpio & 0x1F));
74
75 rdc_gpio_write(val);
76}
77EXPORT_SYMBOL(rdc_gpio_set_value);
78
79int rdc_gpio_direction_input(unsigned gpio)
80{
81 return 0;
82}
83EXPORT_SYMBOL(rdc_gpio_direction_input);
84
85int rdc_gpio_direction_output(unsigned gpio, int value)
86{
87 return 0;
88}
89EXPORT_SYMBOL(rdc_gpio_direction_output);
90
91
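
rdc_gpio_read() above goes through an indirect address/data register pair: write a selector to RDC3210_CFGREG_ADDR (config cycle bit, function number, register offset), then read or write RDC3210_CFGREG_DATA, with pins above 31 living in a second 32-bit data register. A runnable sketch of that access pattern, assuming stubbed-out port accessors and illustrative register offsets:

/* Sketch of the indirect address/data register access used by
 * rdc_gpio_read(); ports, offsets and the fake backing store are
 * illustrative only, and my_outl()/my_inl() stand in for outl()/inl(). */
#include <stdint.h>
#include <stdio.h>

static uint32_t fake_regs[2] = { 0xdeadbeef, 0xcafef00d };
static uint32_t selected;

static void my_outl(uint32_t val, uint16_t port) { (void)port; selected = val; }
static uint32_t my_inl(uint16_t port)
{
        (void)port;
        return fake_regs[(selected & 0x84) == 0x84];    /* "high" or "low" data reg */
}

static uint32_t gpio_data_read(unsigned gpio)
{
        /* 0x80000000 starts the config cycle, (7 << 11) picks the function,
         * and pins above 31 use the second 32-bit data register. */
        uint32_t reg = (gpio & 0x20) ? 0x84 : 0x48;

        my_outl(0x80000000u | (7 << 11) | reg, 0xcf8);
        return my_inl(0xcfc);
}

int main(void)
{
        printf("gpio 3  -> %#x\n", gpio_data_read(3));
        printf("gpio 40 -> %#x\n", gpio_data_read(40));
        return 0;
}
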
diff --git a/arch/x86/mach-rdc321x/platform.c b/arch/x86/mach-rdc321x/platform.c
new file mode 100644
index 000000000000..dda6024a5862
--- /dev/null
+++ b/arch/x86/mach-rdc321x/platform.c
@@ -0,0 +1,68 @@
1/*
2 * Generic RDC321x platform devices
3 *
4 * Copyright (C) 2007 Florian Fainelli <florian@openwrt.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the
18 * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 * Boston, MA 02110-1301, USA.
20 *
21 */
22
23#include <linux/init.h>
24#include <linux/kernel.h>
25#include <linux/list.h>
26#include <linux/device.h>
27#include <linux/platform_device.h>
28#include <linux/version.h>
29#include <linux/leds.h>
30
31#include <asm/gpio.h>
32
33/* LEDS */
34static struct gpio_led default_leds[] = {
35 { .name = "rdc:dmz", .gpio = 1, },
36};
37
38static struct gpio_led_platform_data rdc321x_led_data = {
39 .num_leds = ARRAY_SIZE(default_leds),
40 .leds = default_leds,
41};
42
43static struct platform_device rdc321x_leds = {
44 .name = "leds-gpio",
45 .id = -1,
46 .dev = {
47 .platform_data = &rdc321x_led_data,
48 }
49};
50
51/* Watchdog */
52static struct platform_device rdc321x_wdt = {
53 .name = "rdc321x-wdt",
54 .id = -1,
55 .num_resources = 0,
56};
57
58static struct platform_device *rdc321x_devs[] = {
59 &rdc321x_leds,
60 &rdc321x_wdt
61};
62
63static int __init rdc_board_setup(void)
64{
65 return platform_add_devices(rdc321x_devs, ARRAY_SIZE(rdc321x_devs));
66}
67
68arch_initcall(rdc_board_setup);
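
platform.c follows the usual board-file pattern: declare static struct platform_device entries, hand them to platform_add_devices() from an arch_initcall, and let drivers bind purely by name ("leds-gpio", "rdc321x-wdt"). A user-space model of that name-based matching, with the lookup loop simplified well below what the driver core really does:

/* Toy model of platform device/driver matching by name; the lookup is
 * illustrative, not the driver core's real algorithm. */
#include <stdio.h>
#include <string.h>

struct pdev { const char *name; };
struct pdrv { const char *name; void (*probe)(struct pdev *); };

static void wdt_probe(struct pdev *d) { printf("bound %s\n", d->name); }

static struct pdev devices[] = { { "leds-gpio" }, { "rdc321x-wdt" } };
static struct pdrv drivers[] = { { "rdc321x-wdt", wdt_probe } };

int main(void)
{
        unsigned i, j;

        /* registering a driver probes every matching, already-registered
         * device, which is how wdt.c's probe runs for the device above */
        for (i = 0; i < sizeof(drivers) / sizeof(drivers[0]); i++)
                for (j = 0; j < sizeof(devices) / sizeof(devices[0]); j++)
                        if (!strcmp(drivers[i].name, devices[j].name))
                                drivers[i].probe(&devices[j]);
        return 0;
}
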
diff --git a/arch/x86/mach-rdc321x/wdt.c b/arch/x86/mach-rdc321x/wdt.c
new file mode 100644
index 000000000000..ec5625ae7061
--- /dev/null
+++ b/arch/x86/mach-rdc321x/wdt.c
@@ -0,0 +1,275 @@
1/*
2 * RDC321x watchdog driver
3 *
4 * Copyright (C) 2007 Florian Fainelli <florian@openwrt.org>
5 *
6 * This driver is highly inspired from the cpu5_wdt driver
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 *
22 */
23
24#include <linux/module.h>
25#include <linux/moduleparam.h>
26#include <linux/types.h>
27#include <linux/errno.h>
28#include <linux/miscdevice.h>
29#include <linux/fs.h>
30#include <linux/init.h>
31#include <linux/ioport.h>
32#include <linux/timer.h>
33#include <linux/completion.h>
34#include <linux/jiffies.h>
35#include <linux/platform_device.h>
36#include <linux/watchdog.h>
37#include <linux/io.h>
38#include <linux/uaccess.h>
39
40#include <asm/mach-rdc321x/rdc321x_defs.h>
41
42#define RDC_WDT_MASK 0x80000000 /* Mask */
43#define RDC_WDT_EN 0x00800000 /* Enable bit */
44#define RDC_WDT_WTI 0x00200000 /* Generate CPU reset/NMI/WDT on timeout */
45#define RDC_WDT_RST 0x00100000 /* Reset bit */
46#define RDC_WDT_WIF 0x00040000 /* WDT IRQ Flag */
47#define RDC_WDT_IRT 0x00000100 /* IRQ Routing table */
48#define RDC_WDT_CNT 0x00000001 /* WDT count */
49
50#define RDC_CLS_TMR 0x80003844 /* Clear timer */
51
52#define RDC_WDT_INTERVAL (HZ/10+1)
53
54int nowayout = WATCHDOG_NOWAYOUT;
55module_param(nowayout, int, 0);
56MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=" __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
57
58static int ticks = 1000;
59
60/* some device data */
61
62static struct {
63 struct completion stop;
64 volatile int running;
65 struct timer_list timer;
66 volatile int queue;
67 int default_ticks;
68 unsigned long inuse;
69} rdc321x_wdt_device;
70
71/* generic helper functions */
72
73static void rdc321x_wdt_trigger(unsigned long unused)
74{
75 if (rdc321x_wdt_device.running)
76 ticks--;
77
78 /* keep watchdog alive */
79 outl(RDC_WDT_EN|inl(RDC3210_CFGREG_DATA), RDC3210_CFGREG_DATA);
80
81 /* requeue?? */
82 if (rdc321x_wdt_device.queue && ticks)
83 mod_timer(&rdc321x_wdt_device.timer,
84 jiffies + RDC_WDT_INTERVAL);
85 else {
86 /* ticks doesn't matter anyway */
87 complete(&rdc321x_wdt_device.stop);
88 }
89
90}
91
92static void rdc321x_wdt_reset(void)
93{
94 ticks = rdc321x_wdt_device.default_ticks;
95}
96
97static void rdc321x_wdt_start(void)
98{
99 if (!rdc321x_wdt_device.queue) {
100 rdc321x_wdt_device.queue = 1;
101
102 /* Clear the timer */
103 outl(RDC_CLS_TMR, RDC3210_CFGREG_ADDR);
104
105 /* Enable watchdog and set the timeout to 81.92 us */
106 outl(RDC_WDT_EN|RDC_WDT_CNT, RDC3210_CFGREG_DATA);
107
108 mod_timer(&rdc321x_wdt_device.timer,
109 jiffies + RDC_WDT_INTERVAL);
110 }
111
112 /* if process dies, counter is not decremented */
113 rdc321x_wdt_device.running++;
114}
115
116static int rdc321x_wdt_stop(void)
117{
118 if (rdc321x_wdt_device.running)
119 rdc321x_wdt_device.running = 0;
120
121 ticks = rdc321x_wdt_device.default_ticks;
122
123 return -EIO;
124}
125
126/* filesystem operations */
127
128static int rdc321x_wdt_open(struct inode *inode, struct file *file)
129{
130 if (test_and_set_bit(0, &rdc321x_wdt_device.inuse))
131 return -EBUSY;
132
133 return nonseekable_open(inode, file);
134}
135
136static int rdc321x_wdt_release(struct inode *inode, struct file *file)
137{
138 clear_bit(0, &rdc321x_wdt_device.inuse);
139 return 0;
140}
141
142static int rdc321x_wdt_ioctl(struct inode *inode, struct file *file,
143 unsigned int cmd, unsigned long arg)
144{
145 void __user *argp = (void __user *)arg;
146 unsigned int value;
147 static struct watchdog_info ident = {
148 .options = WDIOF_CARDRESET,
149 .identity = "RDC321x WDT",
150 };
151
152 switch (cmd) {
153 case WDIOC_KEEPALIVE:
154 rdc321x_wdt_reset();
155 break;
156 case WDIOC_GETSTATUS:
157 /* Read the value from the DATA register */
158 value = inl(RDC3210_CFGREG_DATA);
159 if (copy_to_user(argp, &value, sizeof(int)))
160 return -EFAULT;
161 break;
162 case WDIOC_GETSUPPORT:
163 if (copy_to_user(argp, &ident, sizeof(ident)))
164 return -EFAULT;
165 break;
166 case WDIOC_SETOPTIONS:
167 if (copy_from_user(&value, argp, sizeof(int)))
168 return -EFAULT;
169 switch (value) {
170 case WDIOS_ENABLECARD:
171 rdc321x_wdt_start();
172 break;
173 case WDIOS_DISABLECARD:
174 return rdc321x_wdt_stop();
175 default:
176 return -EINVAL;
177 }
178 break;
179 default:
180 return -ENOTTY;
181 }
182 return 0;
183}
184
185static ssize_t rdc321x_wdt_write(struct file *file, const char __user *buf,
186 size_t count, loff_t *ppos)
187{
188 if (!count)
189 return -EIO;
190
191 rdc321x_wdt_reset();
192
193 return count;
194}
195
196static const struct file_operations rdc321x_wdt_fops = {
197 .owner = THIS_MODULE,
198 .llseek = no_llseek,
199 .ioctl = rdc321x_wdt_ioctl,
200 .open = rdc321x_wdt_open,
201 .write = rdc321x_wdt_write,
202 .release = rdc321x_wdt_release,
203};
204
205static struct miscdevice rdc321x_wdt_misc = {
206 .minor = WATCHDOG_MINOR,
207 .name = "watchdog",
208 .fops = &rdc321x_wdt_fops,
209};
210
211static int __devinit rdc321x_wdt_probe(struct platform_device *pdev)
212{
213 int err;
214
215 err = misc_register(&rdc321x_wdt_misc);
216 if (err < 0) {
217 printk(KERN_ERR PFX "watchdog misc_register failed\n");
218 return err;
219 }
220
221 /* Reset the watchdog */
222 outl(RDC_WDT_RST, RDC3210_CFGREG_DATA);
223
224 init_completion(&rdc321x_wdt_device.stop);
225 rdc321x_wdt_device.queue = 0;
226
227 clear_bit(0, &rdc321x_wdt_device.inuse);
228
229 setup_timer(&rdc321x_wdt_device.timer, rdc321x_wdt_trigger, 0);
230
231 rdc321x_wdt_device.default_ticks = ticks;
232
233 printk(KERN_INFO PFX "watchdog init success\n");
234
235 return 0;
236}
237
238static int rdc321x_wdt_remove(struct platform_device *pdev)
239{
240 if (rdc321x_wdt_device.queue) {
241 rdc321x_wdt_device.queue = 0;
242 wait_for_completion(&rdc321x_wdt_device.stop);
243 }
244
245 misc_deregister(&rdc321x_wdt_misc);
246
247 return 0;
248}
249
250static struct platform_driver rdc321x_wdt_driver = {
251 .probe = rdc321x_wdt_probe,
252 .remove = rdc321x_wdt_remove,
253 .driver = {
254 .owner = THIS_MODULE,
255 .name = "rdc321x-wdt",
256 },
257};
258
259static int __init rdc321x_wdt_init(void)
260{
261 return platform_driver_register(&rdc321x_wdt_driver);
262}
263
264static void __exit rdc321x_wdt_exit(void)
265{
266 platform_driver_unregister(&rdc321x_wdt_driver);
267}
268
269module_init(rdc321x_wdt_init);
270module_exit(rdc321x_wdt_exit);
271
272MODULE_AUTHOR("Florian Fainelli <florian@openwrt.org>");
273MODULE_DESCRIPTION("RDC321x watchdog driver");
274MODULE_LICENSE("GPL");
275MODULE_ALIAS_MISCDEV(WATCHDOG_MINOR);
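
The driver above exposes the conventional watchdog character-device interface: open /dev/watchdog, pet the timer with WDIOC_KEEPALIVE (or any write), and start/stop it through WDIOC_SETOPTIONS. A typical user-space client, assuming only the standard <linux/watchdog.h> ioctls rather than anything specific to this driver:

/* Minimal watchdog client for a driver like the one above; error handling
 * is trimmed and the device path is the conventional one. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/watchdog.h>

int main(void)
{
        int i, flags, fd = open("/dev/watchdog", O_WRONLY);

        if (fd < 0) {
                perror("open /dev/watchdog");
                return 1;
        }

        flags = WDIOS_ENABLECARD;
        ioctl(fd, WDIOC_SETOPTIONS, &flags);    /* start the timer             */

        for (i = 0; i < 10; i++) {
                ioctl(fd, WDIOC_KEEPALIVE, 0);  /* reset the countdown         */
                sleep(1);
        }

        flags = WDIOS_DISABLECARD;
        ioctl(fd, WDIOC_SETOPTIONS, &flags);    /* stop it again, if permitted */
        close(fd);
        return 0;
}
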
diff --git a/arch/x86/mach-visws/mpparse.c b/arch/x86/mach-visws/mpparse.c
index f3c74fab8b95..2a8456a1f44f 100644
--- a/arch/x86/mach-visws/mpparse.c
+++ b/arch/x86/mach-visws/mpparse.c
@@ -36,19 +36,19 @@ unsigned int __initdata maxcpus = NR_CPUS;
36 36
37static void __init MP_processor_info (struct mpc_config_processor *m) 37static void __init MP_processor_info (struct mpc_config_processor *m)
38{ 38{
39 int ver, logical_apicid; 39 int ver, logical_apicid;
40 physid_mask_t apic_cpus; 40 physid_mask_t apic_cpus;
41 41
42 if (!(m->mpc_cpuflag & CPU_ENABLED)) 42 if (!(m->mpc_cpuflag & CPU_ENABLED))
43 return; 43 return;
44 44
45 logical_apicid = m->mpc_apicid; 45 logical_apicid = m->mpc_apicid;
46 printk(KERN_INFO "%sCPU #%d %ld:%ld APIC version %d\n", 46 printk(KERN_INFO "%sCPU #%d %u:%u APIC version %d\n",
47 m->mpc_cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "", 47 m->mpc_cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "",
48 m->mpc_apicid, 48 m->mpc_apicid,
49 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8, 49 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
50 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4, 50 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
51 m->mpc_apicver); 51 m->mpc_apicver);
52 52
53 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) 53 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR)
54 boot_cpu_physical_apicid = m->mpc_apicid; 54 boot_cpu_physical_apicid = m->mpc_apicid;
diff --git a/arch/x86/mach-voyager/setup.c b/arch/x86/mach-voyager/setup.c
index 3bef977cb29b..5ae5466b9eb9 100644
--- a/arch/x86/mach-voyager/setup.c
+++ b/arch/x86/mach-voyager/setup.c
@@ -37,14 +37,14 @@ void __init pre_setup_arch_hook(void)
37{ 37{
38 /* Voyagers run their CPUs from independent clocks, so disable 38 /* Voyagers run their CPUs from independent clocks, so disable
39 * the TSC code because we can't sync them */ 39 * the TSC code because we can't sync them */
40 tsc_disable = 1; 40 setup_clear_cpu_cap(X86_FEATURE_TSC);
41} 41}
42 42
43void __init trap_init_hook(void) 43void __init trap_init_hook(void)
44{ 44{
45} 45}
46 46
47static struct irqaction irq0 = { 47static struct irqaction irq0 = {
48 .handler = timer_interrupt, 48 .handler = timer_interrupt,
49 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL, 49 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL,
50 .mask = CPU_MASK_NONE, 50 .mask = CPU_MASK_NONE,
@@ -59,44 +59,47 @@ void __init time_init_hook(void)
59 59
60/* Hook for machine specific memory setup. */ 60/* Hook for machine specific memory setup. */
61 61
62char * __init machine_specific_memory_setup(void) 62char *__init machine_specific_memory_setup(void)
63{ 63{
64 char *who; 64 char *who;
65 65
66 who = "NOT VOYAGER"; 66 who = "NOT VOYAGER";
67 67
68 if(voyager_level == 5) { 68 if (voyager_level == 5) {
69 __u32 addr, length; 69 __u32 addr, length;
70 int i; 70 int i;
71 71
72 who = "Voyager-SUS"; 72 who = "Voyager-SUS";
73 73
74 e820.nr_map = 0; 74 e820.nr_map = 0;
75 for(i=0; voyager_memory_detect(i, &addr, &length); i++) { 75 for (i = 0; voyager_memory_detect(i, &addr, &length); i++) {
76 add_memory_region(addr, length, E820_RAM); 76 add_memory_region(addr, length, E820_RAM);
77 } 77 }
78 return who; 78 return who;
79 } else if(voyager_level == 4) { 79 } else if (voyager_level == 4) {
80 __u32 tom; 80 __u32 tom;
81 __u16 catbase = inb(VOYAGER_SSPB_RELOCATION_PORT)<<8; 81 __u16 catbase = inb(VOYAGER_SSPB_RELOCATION_PORT) << 8;
82 /* select the DINO config space */ 82 /* select the DINO config space */
83 outb(VOYAGER_DINO, VOYAGER_CAT_CONFIG_PORT); 83 outb(VOYAGER_DINO, VOYAGER_CAT_CONFIG_PORT);
84 /* Read DINO top of memory register */ 84 /* Read DINO top of memory register */
85 tom = ((inb(catbase + 0x4) & 0xf0) << 16) 85 tom = ((inb(catbase + 0x4) & 0xf0) << 16)
86 + ((inb(catbase + 0x5) & 0x7f) << 24); 86 + ((inb(catbase + 0x5) & 0x7f) << 24);
87 87
88 if(inb(catbase) != VOYAGER_DINO) { 88 if (inb(catbase) != VOYAGER_DINO) {
89 printk(KERN_ERR "Voyager: Failed to get DINO for L4, setting tom to EXT_MEM_K\n"); 89 printk(KERN_ERR
90 tom = (boot_params.screen_info.ext_mem_k)<<10; 90 "Voyager: Failed to get DINO for L4, setting tom to EXT_MEM_K\n");
91 tom = (boot_params.screen_info.ext_mem_k) << 10;
91 } 92 }
92 who = "Voyager-TOM"; 93 who = "Voyager-TOM";
93 add_memory_region(0, 0x9f000, E820_RAM); 94 add_memory_region(0, 0x9f000, E820_RAM);
94 /* map from 1M to top of memory */ 95 /* map from 1M to top of memory */
95 add_memory_region(1*1024*1024, tom - 1*1024*1024, E820_RAM); 96 add_memory_region(1 * 1024 * 1024, tom - 1 * 1024 * 1024,
97 E820_RAM);
96 /* FIXME: Should check the ASICs to see if I need to 98 /* FIXME: Should check the ASICs to see if I need to
97 * take out the 8M window. Just do it at the moment 99 * take out the 8M window. Just do it at the moment
98 * */ 100 * */
99 add_memory_region(8*1024*1024, 8*1024*1024, E820_RESERVED); 101 add_memory_region(8 * 1024 * 1024, 8 * 1024 * 1024,
102 E820_RESERVED);
100 return who; 103 return who;
101 } 104 }
102 105
@@ -114,8 +117,7 @@ char * __init machine_specific_memory_setup(void)
114 unsigned long mem_size; 117 unsigned long mem_size;
115 118
116 /* compare results from other methods and take the greater */ 119 /* compare results from other methods and take the greater */
117 if (boot_params.alt_mem_k 120 if (boot_params.alt_mem_k < boot_params.screen_info.ext_mem_k) {
118 < boot_params.screen_info.ext_mem_k) {
119 mem_size = boot_params.screen_info.ext_mem_k; 121 mem_size = boot_params.screen_info.ext_mem_k;
120 who = "BIOS-88"; 122 who = "BIOS-88";
121 } else { 123 } else {
@@ -126,6 +128,6 @@ char * __init machine_specific_memory_setup(void)
126 e820.nr_map = 0; 128 e820.nr_map = 0;
127 add_memory_region(0, LOWMEMSIZE(), E820_RAM); 129 add_memory_region(0, LOWMEMSIZE(), E820_RAM);
128 add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM); 130 add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
129 } 131 }
130 return who; 132 return who;
131} 133}
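
machine_specific_memory_setup() above rebuilds the e820 map by hand: clear e820.nr_map, then call add_memory_region(start, size, type) for each usable or reserved range, with the level-4 path carving an 8 MB reserved window out of the RAM it reports. A toy model of the same table construction, with simplified types and a made-up top-of-memory value:

/* Toy model of the add_memory_region() calls made for a Voyager level-4
 * box above; the table layout and constants are simplified stand-ins. */
#include <stdint.h>
#include <stdio.h>

#define RAM      1
#define RESERVED 2

struct region { uint64_t addr, size; int type; };
static struct region map[8];
static int nr_map;

static void add_region(uint64_t addr, uint64_t size, int type)
{
        map[nr_map++] = (struct region){ addr, size, type };
}

int main(void)
{
        uint64_t tom = 64ull << 20;                     /* pretend DINO reports 64 MB */
        int i;

        add_region(0, 0x9f000, RAM);                    /* conventional low memory    */
        add_region(1 << 20, tom - (1 << 20), RAM);      /* 1 MB up to top of memory   */
        add_region(8 << 20, 8 << 20, RESERVED);         /* the 8 MB window carve-out  */

        for (i = 0; i < nr_map; i++)
                printf("%#llx - %#llx type %d\n",
                       (unsigned long long)map[i].addr,
                       (unsigned long long)(map[i].addr + map[i].size),
                       map[i].type);
        return 0;
}
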
diff --git a/arch/x86/mach-voyager/voyager_basic.c b/arch/x86/mach-voyager/voyager_basic.c
index 9b77b39b71a6..6a949e4edde8 100644
--- a/arch/x86/mach-voyager/voyager_basic.c
+++ b/arch/x86/mach-voyager/voyager_basic.c
@@ -35,7 +35,7 @@
35/* 35/*
36 * Power off function, if any 36 * Power off function, if any
37 */ 37 */
38void (*pm_power_off)(void); 38void (*pm_power_off) (void);
39EXPORT_SYMBOL(pm_power_off); 39EXPORT_SYMBOL(pm_power_off);
40 40
41int voyager_level = 0; 41int voyager_level = 0;
@@ -43,39 +43,38 @@ int voyager_level = 0;
43struct voyager_SUS *voyager_SUS = NULL; 43struct voyager_SUS *voyager_SUS = NULL;
44 44
45#ifdef CONFIG_SMP 45#ifdef CONFIG_SMP
46static void 46static void voyager_dump(int dummy1, struct tty_struct *dummy3)
47voyager_dump(int dummy1, struct tty_struct *dummy3)
48{ 47{
49 /* get here via a sysrq */ 48 /* get here via a sysrq */
50 voyager_smp_dump(); 49 voyager_smp_dump();
51} 50}
52 51
53static struct sysrq_key_op sysrq_voyager_dump_op = { 52static struct sysrq_key_op sysrq_voyager_dump_op = {
54 .handler = voyager_dump, 53 .handler = voyager_dump,
55 .help_msg = "Voyager", 54 .help_msg = "Voyager",
56 .action_msg = "Dump Voyager Status", 55 .action_msg = "Dump Voyager Status",
57}; 56};
58#endif 57#endif
59 58
60void 59void voyager_detect(struct voyager_bios_info *bios)
61voyager_detect(struct voyager_bios_info *bios)
62{ 60{
63 if(bios->len != 0xff) { 61 if (bios->len != 0xff) {
64 int class = (bios->class_1 << 8) 62 int class = (bios->class_1 << 8)
65 | (bios->class_2 & 0xff); 63 | (bios->class_2 & 0xff);
66 64
67 printk("Voyager System detected.\n" 65 printk("Voyager System detected.\n"
68 " Class %x, Revision %d.%d\n", 66 " Class %x, Revision %d.%d\n",
69 class, bios->major, bios->minor); 67 class, bios->major, bios->minor);
70 if(class == VOYAGER_LEVEL4) 68 if (class == VOYAGER_LEVEL4)
71 voyager_level = 4; 69 voyager_level = 4;
72 else if(class < VOYAGER_LEVEL5_AND_ABOVE) 70 else if (class < VOYAGER_LEVEL5_AND_ABOVE)
73 voyager_level = 3; 71 voyager_level = 3;
74 else 72 else
75 voyager_level = 5; 73 voyager_level = 5;
76 printk(" Architecture Level %d\n", voyager_level); 74 printk(" Architecture Level %d\n", voyager_level);
77 if(voyager_level < 4) 75 if (voyager_level < 4)
78 printk("\n**WARNING**: Voyager HAL only supports Levels 4 and 5 Architectures at the moment\n\n"); 76 printk
77 ("\n**WARNING**: Voyager HAL only supports Levels 4 and 5 Architectures at the moment\n\n");
79 /* install the power off handler */ 78 /* install the power off handler */
80 pm_power_off = voyager_power_off; 79 pm_power_off = voyager_power_off;
81#ifdef CONFIG_SMP 80#ifdef CONFIG_SMP
@@ -86,15 +85,13 @@ voyager_detect(struct voyager_bios_info *bios)
86 } 85 }
87} 86}
88 87
89void 88void voyager_system_interrupt(int cpl, void *dev_id)
90voyager_system_interrupt(int cpl, void *dev_id)
91{ 89{
92 printk("Voyager: detected system interrupt\n"); 90 printk("Voyager: detected system interrupt\n");
93} 91}
94 92
95/* Routine to read information from the extended CMOS area */ 93/* Routine to read information from the extended CMOS area */
96__u8 94__u8 voyager_extended_cmos_read(__u16 addr)
97voyager_extended_cmos_read(__u16 addr)
98{ 95{
99 outb(addr & 0xff, 0x74); 96 outb(addr & 0xff, 0x74);
100 outb((addr >> 8) & 0xff, 0x75); 97 outb((addr >> 8) & 0xff, 0x75);
@@ -108,12 +105,11 @@ voyager_extended_cmos_read(__u16 addr)
108 105
109typedef struct ClickMap { 106typedef struct ClickMap {
110 struct Entry { 107 struct Entry {
111 __u32 Address; 108 __u32 Address;
112 __u32 Length; 109 __u32 Length;
113 } Entry[CLICK_ENTRIES]; 110 } Entry[CLICK_ENTRIES];
114} ClickMap_t; 111} ClickMap_t;
115 112
116
117/* This routine is pretty much an awful hack to read the bios clickmap by 113/* This routine is pretty much an awful hack to read the bios clickmap by
118 * mapping it into page 0. There are usually three regions in the map: 114 * mapping it into page 0. There are usually three regions in the map:
119 * Base Memory 115 * Base Memory
@@ -122,8 +118,7 @@ typedef struct ClickMap {
122 * 118 *
123 * Returns are 0 for failure and 1 for success on extracting region. 119 * Returns are 0 for failure and 1 for success on extracting region.
124 */ 120 */
125int __init 121int __init voyager_memory_detect(int region, __u32 * start, __u32 * length)
126voyager_memory_detect(int region, __u32 *start, __u32 *length)
127{ 122{
128 int i; 123 int i;
129 int retval = 0; 124 int retval = 0;
@@ -132,13 +127,14 @@ voyager_memory_detect(int region, __u32 *start, __u32 *length)
132 unsigned long map_addr; 127 unsigned long map_addr;
133 unsigned long old; 128 unsigned long old;
134 129
135 if(region >= CLICK_ENTRIES) { 130 if (region >= CLICK_ENTRIES) {
136 printk("Voyager: Illegal ClickMap region %d\n", region); 131 printk("Voyager: Illegal ClickMap region %d\n", region);
137 return 0; 132 return 0;
138 } 133 }
139 134
140 for(i = 0; i < sizeof(cmos); i++) 135 for (i = 0; i < sizeof(cmos); i++)
141 cmos[i] = voyager_extended_cmos_read(VOYAGER_MEMORY_CLICKMAP + i); 136 cmos[i] =
137 voyager_extended_cmos_read(VOYAGER_MEMORY_CLICKMAP + i);
142 138
143 map_addr = *(unsigned long *)cmos; 139 map_addr = *(unsigned long *)cmos;
144 140
@@ -147,10 +143,10 @@ voyager_memory_detect(int region, __u32 *start, __u32 *length)
147 pg0[0] = ((map_addr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT); 143 pg0[0] = ((map_addr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT);
148 local_flush_tlb(); 144 local_flush_tlb();
149 /* now clear everything out but page 0 */ 145 /* now clear everything out but page 0 */
150 map = (ClickMap_t *)(map_addr & (~PAGE_MASK)); 146 map = (ClickMap_t *) (map_addr & (~PAGE_MASK));
151 147
152 /* zero length is the end of the clickmap */ 148 /* zero length is the end of the clickmap */
153 if(map->Entry[region].Length != 0) { 149 if (map->Entry[region].Length != 0) {
154 *length = map->Entry[region].Length * CLICK_SIZE; 150 *length = map->Entry[region].Length * CLICK_SIZE;
155 *start = map->Entry[region].Address; 151 *start = map->Entry[region].Address;
156 retval = 1; 152 retval = 1;
@@ -165,10 +161,9 @@ voyager_memory_detect(int region, __u32 *start, __u32 *length)
165/* voyager specific handling code for timer interrupts. Used to hand 161/* voyager specific handling code for timer interrupts. Used to hand
166 * off the timer tick to the SMP code, since the VIC doesn't have an 162 * off the timer tick to the SMP code, since the VIC doesn't have an
167 * internal timer (The QIC does, but that's another story). */ 163 * internal timer (The QIC does, but that's another story). */
168void 164void voyager_timer_interrupt(void)
169voyager_timer_interrupt(void)
170{ 165{
171 if((jiffies & 0x3ff) == 0) { 166 if ((jiffies & 0x3ff) == 0) {
172 167
173 /* There seems to be something flaky in either 168 /* There seems to be something flaky in either
174 * hardware or software that is resetting the timer 0 169 * hardware or software that is resetting the timer 0
@@ -186,18 +181,20 @@ voyager_timer_interrupt(void)
186 __u16 val; 181 __u16 val;
187 182
188 spin_lock(&i8253_lock); 183 spin_lock(&i8253_lock);
189 184
190 outb_p(0x00, 0x43); 185 outb_p(0x00, 0x43);
191 val = inb_p(0x40); 186 val = inb_p(0x40);
192 val |= inb(0x40) << 8; 187 val |= inb(0x40) << 8;
193 spin_unlock(&i8253_lock); 188 spin_unlock(&i8253_lock);
194 189
195 if(val > LATCH) { 190 if (val > LATCH) {
196 printk("\nVOYAGER: countdown timer value too high (%d), resetting\n\n", val); 191 printk
192 ("\nVOYAGER: countdown timer value too high (%d), resetting\n\n",
193 val);
197 spin_lock(&i8253_lock); 194 spin_lock(&i8253_lock);
198 outb(0x34,0x43); 195 outb(0x34, 0x43);
199 outb_p(LATCH & 0xff , 0x40); /* LSB */ 196 outb_p(LATCH & 0xff, 0x40); /* LSB */
200 outb(LATCH >> 8 , 0x40); /* MSB */ 197 outb(LATCH >> 8, 0x40); /* MSB */
201 spin_unlock(&i8253_lock); 198 spin_unlock(&i8253_lock);
202 } 199 }
203 } 200 }
@@ -206,14 +203,13 @@ voyager_timer_interrupt(void)
206#endif 203#endif
207} 204}
208 205
209void 206void voyager_power_off(void)
210voyager_power_off(void)
211{ 207{
212 printk("VOYAGER Power Off\n"); 208 printk("VOYAGER Power Off\n");
213 209
214 if(voyager_level == 5) { 210 if (voyager_level == 5) {
215 voyager_cat_power_off(); 211 voyager_cat_power_off();
216 } else if(voyager_level == 4) { 212 } else if (voyager_level == 4) {
217 /* This doesn't apparently work on most L4 machines, 213 /* This doesn't apparently work on most L4 machines,
218 * but the specs say to do this to get automatic power 214 * but the specs say to do this to get automatic power
219 * off. Unfortunately, if it doesn't power off the 215 * off. Unfortunately, if it doesn't power off the
@@ -222,10 +218,8 @@ voyager_power_off(void)
222#if 0 218#if 0
223 int port; 219 int port;
224 220
225
226 /* enable the voyager Configuration Space */ 221 /* enable the voyager Configuration Space */
227 outb((inb(VOYAGER_MC_SETUP) & 0xf0) | 0x8, 222 outb((inb(VOYAGER_MC_SETUP) & 0xf0) | 0x8, VOYAGER_MC_SETUP);
228 VOYAGER_MC_SETUP);
229 /* the port for the power off flag is an offset from the 223 /* the port for the power off flag is an offset from the
230 floating base */ 224 floating base */
231 port = (inb(VOYAGER_SSPB_RELOCATION_PORT) << 8) + 0x21; 225 port = (inb(VOYAGER_SSPB_RELOCATION_PORT) << 8) + 0x21;
@@ -235,62 +229,57 @@ voyager_power_off(void)
235 } 229 }
236 /* and wait for it to happen */ 230 /* and wait for it to happen */
237 local_irq_disable(); 231 local_irq_disable();
238 for(;;) 232 for (;;)
239 halt(); 233 halt();
240} 234}
241 235
242/* copied from process.c */ 236/* copied from process.c */
243static inline void 237static inline void kb_wait(void)
244kb_wait(void)
245{ 238{
246 int i; 239 int i;
247 240
248 for (i=0; i<0x10000; i++) 241 for (i = 0; i < 0x10000; i++)
249 if ((inb_p(0x64) & 0x02) == 0) 242 if ((inb_p(0x64) & 0x02) == 0)
250 break; 243 break;
251} 244}
252 245
253void 246void machine_shutdown(void)
254machine_shutdown(void)
255{ 247{
256 /* Architecture specific shutdown needed before a kexec */ 248 /* Architecture specific shutdown needed before a kexec */
257} 249}
258 250
259void 251void machine_restart(char *cmd)
260machine_restart(char *cmd)
261{ 252{
262 printk("Voyager Warm Restart\n"); 253 printk("Voyager Warm Restart\n");
263 kb_wait(); 254 kb_wait();
264 255
265 if(voyager_level == 5) { 256 if (voyager_level == 5) {
266 /* write magic values to the RTC to inform system that 257 /* write magic values to the RTC to inform system that
267 * shutdown is beginning */ 258 * shutdown is beginning */
268 outb(0x8f, 0x70); 259 outb(0x8f, 0x70);
269 outb(0x5 , 0x71); 260 outb(0x5, 0x71);
270 261
271 udelay(50); 262 udelay(50);
272 outb(0xfe,0x64); /* pull reset low */ 263 outb(0xfe, 0x64); /* pull reset low */
273 } else if(voyager_level == 4) { 264 } else if (voyager_level == 4) {
274 __u16 catbase = inb(VOYAGER_SSPB_RELOCATION_PORT)<<8; 265 __u16 catbase = inb(VOYAGER_SSPB_RELOCATION_PORT) << 8;
275 __u8 basebd = inb(VOYAGER_MC_SETUP); 266 __u8 basebd = inb(VOYAGER_MC_SETUP);
276 267
277 outb(basebd | 0x08, VOYAGER_MC_SETUP); 268 outb(basebd | 0x08, VOYAGER_MC_SETUP);
278 outb(0x02, catbase + 0x21); 269 outb(0x02, catbase + 0x21);
279 } 270 }
280 local_irq_disable(); 271 local_irq_disable();
281 for(;;) 272 for (;;)
282 halt(); 273 halt();
283} 274}
284 275
285void 276void machine_emergency_restart(void)
286machine_emergency_restart(void)
287{ 277{
288 /*for now, just hook this to a warm restart */ 278 /*for now, just hook this to a warm restart */
289 machine_restart(NULL); 279 machine_restart(NULL);
290} 280}
291 281
292void 282void mca_nmi_hook(void)
293mca_nmi_hook(void)
294{ 283{
295 __u8 dumpval __maybe_unused = inb(0xf823); 284 __u8 dumpval __maybe_unused = inb(0xf823);
296 __u8 swnmi __maybe_unused = inb(0xf813); 285 __u8 swnmi __maybe_unused = inb(0xf813);
@@ -301,8 +290,8 @@ mca_nmi_hook(void)
301 /* clear swnmi */ 290 /* clear swnmi */
302 outb(0xff, 0xf813); 291 outb(0xff, 0xf813);
303 /* tell SUS to ignore dump */ 292 /* tell SUS to ignore dump */
304 if(voyager_level == 5 && voyager_SUS != NULL) { 293 if (voyager_level == 5 && voyager_SUS != NULL) {
305 if(voyager_SUS->SUS_mbox == VOYAGER_DUMP_BUTTON_NMI) { 294 if (voyager_SUS->SUS_mbox == VOYAGER_DUMP_BUTTON_NMI) {
306 voyager_SUS->kernel_mbox = VOYAGER_NO_COMMAND; 295 voyager_SUS->kernel_mbox = VOYAGER_NO_COMMAND;
307 voyager_SUS->kernel_flags |= VOYAGER_OS_IN_PROGRESS; 296 voyager_SUS->kernel_flags |= VOYAGER_OS_IN_PROGRESS;
308 udelay(1000); 297 udelay(1000);
@@ -310,15 +299,14 @@ mca_nmi_hook(void)
310 voyager_SUS->kernel_flags &= ~VOYAGER_OS_IN_PROGRESS; 299 voyager_SUS->kernel_flags &= ~VOYAGER_OS_IN_PROGRESS;
311 } 300 }
312 } 301 }
313 printk(KERN_ERR "VOYAGER: Dump switch pressed, printing CPU%d tracebacks\n", smp_processor_id()); 302 printk(KERN_ERR
303 "VOYAGER: Dump switch pressed, printing CPU%d tracebacks\n",
304 smp_processor_id());
314 show_stack(NULL, NULL); 305 show_stack(NULL, NULL);
315 show_state(); 306 show_state();
316} 307}
317 308
318 309void machine_halt(void)
319
320void
321machine_halt(void)
322{ 310{
323 /* treat a halt like a power off */ 311 /* treat a halt like a power off */
324 machine_power_off(); 312 machine_power_off();
diff --git a/arch/x86/mach-voyager/voyager_cat.c b/arch/x86/mach-voyager/voyager_cat.c
index 2132ca652df1..17a7904f75b1 100644
--- a/arch/x86/mach-voyager/voyager_cat.c
+++ b/arch/x86/mach-voyager/voyager_cat.c
@@ -39,34 +39,32 @@
39#define CAT_DATA (sspb + 0xd) 39#define CAT_DATA (sspb + 0xd)
40 40
41/* the internal cat functions */ 41/* the internal cat functions */
42static void cat_pack(__u8 *msg, __u16 start_bit, __u8 *data, 42static void cat_pack(__u8 * msg, __u16 start_bit, __u8 * data, __u16 num_bits);
43 __u16 num_bits); 43static void cat_unpack(__u8 * msg, __u16 start_bit, __u8 * data,
44static void cat_unpack(__u8 *msg, __u16 start_bit, __u8 *data,
45 __u16 num_bits); 44 __u16 num_bits);
46static void cat_build_header(__u8 *header, const __u16 len, 45static void cat_build_header(__u8 * header, const __u16 len,
47 const __u16 smallest_reg_bits, 46 const __u16 smallest_reg_bits,
48 const __u16 longest_reg_bits); 47 const __u16 longest_reg_bits);
49static int cat_sendinst(voyager_module_t *modp, voyager_asic_t *asicp, 48static int cat_sendinst(voyager_module_t * modp, voyager_asic_t * asicp,
50 __u8 reg, __u8 op); 49 __u8 reg, __u8 op);
51static int cat_getdata(voyager_module_t *modp, voyager_asic_t *asicp, 50static int cat_getdata(voyager_module_t * modp, voyager_asic_t * asicp,
52 __u8 reg, __u8 *value); 51 __u8 reg, __u8 * value);
53static int cat_shiftout(__u8 *data, __u16 data_bytes, __u16 header_bytes, 52static int cat_shiftout(__u8 * data, __u16 data_bytes, __u16 header_bytes,
54 __u8 pad_bits); 53 __u8 pad_bits);
55static int cat_write(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, 54static int cat_write(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg,
56 __u8 value); 55 __u8 value);
57static int cat_read(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, 56static int cat_read(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg,
58 __u8 *value); 57 __u8 * value);
59static int cat_subread(voyager_module_t *modp, voyager_asic_t *asicp, 58static int cat_subread(voyager_module_t * modp, voyager_asic_t * asicp,
60 __u16 offset, __u16 len, void *buf); 59 __u16 offset, __u16 len, void *buf);
61static int cat_senddata(voyager_module_t *modp, voyager_asic_t *asicp, 60static int cat_senddata(voyager_module_t * modp, voyager_asic_t * asicp,
62 __u8 reg, __u8 value); 61 __u8 reg, __u8 value);
63static int cat_disconnect(voyager_module_t *modp, voyager_asic_t *asicp); 62static int cat_disconnect(voyager_module_t * modp, voyager_asic_t * asicp);
64static int cat_connect(voyager_module_t *modp, voyager_asic_t *asicp); 63static int cat_connect(voyager_module_t * modp, voyager_asic_t * asicp);
65 64
66static inline const char * 65static inline const char *cat_module_name(int module_id)
67cat_module_name(int module_id)
68{ 66{
69 switch(module_id) { 67 switch (module_id) {
70 case 0x10: 68 case 0x10:
71 return "Processor Slot 0"; 69 return "Processor Slot 0";
72 case 0x11: 70 case 0x11:
@@ -105,14 +103,14 @@ voyager_module_t *voyager_cat_list;
105 103
106/* the I/O port assignments for the VIC and QIC */ 104/* the I/O port assignments for the VIC and QIC */
107static struct resource vic_res = { 105static struct resource vic_res = {
108 .name = "Voyager Interrupt Controller", 106 .name = "Voyager Interrupt Controller",
109 .start = 0xFC00, 107 .start = 0xFC00,
110 .end = 0xFC6F 108 .end = 0xFC6F
111}; 109};
112static struct resource qic_res = { 110static struct resource qic_res = {
113 .name = "Quad Interrupt Controller", 111 .name = "Quad Interrupt Controller",
114 .start = 0xFC70, 112 .start = 0xFC70,
115 .end = 0xFCFF 113 .end = 0xFCFF
116}; 114};
117 115
118/* This function is used to pack a data bit stream inside a message. 116/* This function is used to pack a data bit stream inside a message.
@@ -120,7 +118,7 @@ static struct resource qic_res = {
120 * Note: This function assumes that any unused bit in the data stream 118 * Note: This function assumes that any unused bit in the data stream
121 * is set to zero so that the ors will work correctly */ 119 * is set to zero so that the ors will work correctly */
122static void 120static void
123cat_pack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits) 121cat_pack(__u8 * msg, const __u16 start_bit, __u8 * data, const __u16 num_bits)
124{ 122{
125 /* compute initial shift needed */ 123 /* compute initial shift needed */
126 const __u16 offset = start_bit % BITS_PER_BYTE; 124 const __u16 offset = start_bit % BITS_PER_BYTE;
@@ -130,7 +128,7 @@ cat_pack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits)
130 int i; 128 int i;
131 129
132 /* adjust if we have more than a byte of residue */ 130 /* adjust if we have more than a byte of residue */
133 if(residue >= BITS_PER_BYTE) { 131 if (residue >= BITS_PER_BYTE) {
134 residue -= BITS_PER_BYTE; 132 residue -= BITS_PER_BYTE;
135 len++; 133 len++;
136 } 134 }
@@ -138,24 +136,25 @@ cat_pack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits)
138 /* clear out the bits. We assume here that if len==0 then 136 /* clear out the bits. We assume here that if len==0 then
139 * residue >= offset. This is always true for the catbus 137 * residue >= offset. This is always true for the catbus
140 * operations */ 138 * operations */
141 msg[byte] &= 0xff << (BITS_PER_BYTE - offset); 139 msg[byte] &= 0xff << (BITS_PER_BYTE - offset);
142 msg[byte++] |= data[0] >> offset; 140 msg[byte++] |= data[0] >> offset;
143 if(len == 0) 141 if (len == 0)
144 return; 142 return;
145 for(i = 1; i < len; i++) 143 for (i = 1; i < len; i++)
146 msg[byte++] = (data[i-1] << (BITS_PER_BYTE - offset)) 144 msg[byte++] = (data[i - 1] << (BITS_PER_BYTE - offset))
147 | (data[i] >> offset); 145 | (data[i] >> offset);
148 if(residue != 0) { 146 if (residue != 0) {
149 __u8 mask = 0xff >> residue; 147 __u8 mask = 0xff >> residue;
150 __u8 last_byte = data[i-1] << (BITS_PER_BYTE - offset) 148 __u8 last_byte = data[i - 1] << (BITS_PER_BYTE - offset)
151 | (data[i] >> offset); 149 | (data[i] >> offset);
152 150
153 last_byte &= ~mask; 151 last_byte &= ~mask;
154 msg[byte] &= mask; 152 msg[byte] &= mask;
155 msg[byte] |= last_byte; 153 msg[byte] |= last_byte;
156 } 154 }
157 return; 155 return;
158} 156}
157
159/* unpack the data again (same arguments as cat_pack()). data buffer 158/* unpack the data again (same arguments as cat_pack()). data buffer
160 * must be zero populated. 159 * must be zero populated.
161 * 160 *
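
cat_pack() in the hunks above ORs num_bits of payload into msg starting at an arbitrary bit position, so a payload byte can straddle two message bytes when start_bit is not byte aligned. A simplified model of that splitting, going bit by bit for clarity instead of byte-at-a-time, and ignoring the residue special cases the real code handles:

/* Simplified model of cat_pack(): copy num_bits from data into msg starting
 * at start_bit, MSB-first within each byte.  Bit-by-bit, for illustration. */
#include <stdint.h>
#include <stdio.h>

static void pack_bits(uint8_t *msg, unsigned start_bit,
                      const uint8_t *data, unsigned num_bits)
{
        unsigned i;

        for (i = 0; i < num_bits; i++) {
                unsigned src_bit = (data[i / 8] >> (7 - (i % 8))) & 1;
                unsigned dst = start_bit + i;

                if (src_bit)
                        msg[dst / 8] |= (uint8_t)(1 << (7 - (dst % 8)));
                else
                        msg[dst / 8] &= (uint8_t)~(1 << (7 - (dst % 8)));
        }
}

int main(void)
{
        uint8_t msg[2] = { 0 };
        uint8_t data[1] = { 0xA5 };     /* 1010 0101 */

        pack_bits(msg, 3, data, 8);     /* straddles msg[0] and msg[1] */
        printf("%02x %02x\n", msg[0], msg[1]);  /* prints: 14 a0 */
        return 0;
}
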
@@ -163,7 +162,7 @@ cat_pack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits)
163 * data (starting at bit 0 in data). 162 * data (starting at bit 0 in data).
164 */ 163 */
165static void 164static void
166cat_unpack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits) 165cat_unpack(__u8 * msg, const __u16 start_bit, __u8 * data, const __u16 num_bits)
167{ 166{
168 /* compute initial shift needed */ 167 /* compute initial shift needed */
169 const __u16 offset = start_bit % BITS_PER_BYTE; 168 const __u16 offset = start_bit % BITS_PER_BYTE;
@@ -172,97 +171,97 @@ cat_unpack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits)
172 __u16 byte = start_bit / BITS_PER_BYTE; 171 __u16 byte = start_bit / BITS_PER_BYTE;
173 int i; 172 int i;
174 173
175 if(last_bits != 0) 174 if (last_bits != 0)
176 len++; 175 len++;
177 176
178 /* special case: want < 8 bits from msg and we can get it from 177 /* special case: want < 8 bits from msg and we can get it from
179 * a single byte of the msg */ 178 * a single byte of the msg */
180 if(len == 0 && BITS_PER_BYTE - offset >= num_bits) { 179 if (len == 0 && BITS_PER_BYTE - offset >= num_bits) {
181 data[0] = msg[byte] << offset; 180 data[0] = msg[byte] << offset;
182 data[0] &= 0xff >> (BITS_PER_BYTE - num_bits); 181 data[0] &= 0xff >> (BITS_PER_BYTE - num_bits);
183 return; 182 return;
184 } 183 }
185 for(i = 0; i < len; i++) { 184 for (i = 0; i < len; i++) {
186 /* this annoying if has to be done just in case a read of 185 /* this annoying if has to be done just in case a read of
187 * msg one beyond the array causes a panic */ 186 * msg one beyond the array causes a panic */
188 if(offset != 0) { 187 if (offset != 0) {
189 data[i] = msg[byte++] << offset; 188 data[i] = msg[byte++] << offset;
190 data[i] |= msg[byte] >> (BITS_PER_BYTE - offset); 189 data[i] |= msg[byte] >> (BITS_PER_BYTE - offset);
191 } 190 } else {
192 else {
193 data[i] = msg[byte++]; 191 data[i] = msg[byte++];
194 } 192 }
195 } 193 }
196 /* do we need to truncate the final byte */ 194 /* do we need to truncate the final byte */
197 if(last_bits != 0) { 195 if (last_bits != 0) {
198 data[i-1] &= 0xff << (BITS_PER_BYTE - last_bits); 196 data[i - 1] &= 0xff << (BITS_PER_BYTE - last_bits);
199 } 197 }
200 return; 198 return;
201} 199}
202 200
203static void 201static void
204cat_build_header(__u8 *header, const __u16 len, const __u16 smallest_reg_bits, 202cat_build_header(__u8 * header, const __u16 len, const __u16 smallest_reg_bits,
205 const __u16 longest_reg_bits) 203 const __u16 longest_reg_bits)
206{ 204{
207 int i; 205 int i;
208 __u16 start_bit = (smallest_reg_bits - 1) % BITS_PER_BYTE; 206 __u16 start_bit = (smallest_reg_bits - 1) % BITS_PER_BYTE;
209 __u8 *last_byte = &header[len - 1]; 207 __u8 *last_byte = &header[len - 1];
210 208
211 if(start_bit == 0) 209 if (start_bit == 0)
212 start_bit = 1; /* must have at least one bit in the hdr */ 210 start_bit = 1; /* must have at least one bit in the hdr */
213 211
214 for(i=0; i < len; i++) 212 for (i = 0; i < len; i++)
215 header[i] = 0; 213 header[i] = 0;
216 214
217 for(i = start_bit; i > 0; i--) 215 for (i = start_bit; i > 0; i--)
218 *last_byte = ((*last_byte) << 1) + 1; 216 *last_byte = ((*last_byte) << 1) + 1;
219 217
220} 218}
221 219
222static int 220static int
223cat_sendinst(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, __u8 op) 221cat_sendinst(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg, __u8 op)
224{ 222{
225 __u8 parity, inst, inst_buf[4] = { 0 }; 223 __u8 parity, inst, inst_buf[4] = { 0 };
226 __u8 iseq[VOYAGER_MAX_SCAN_PATH], hseq[VOYAGER_MAX_REG_SIZE]; 224 __u8 iseq[VOYAGER_MAX_SCAN_PATH], hseq[VOYAGER_MAX_REG_SIZE];
227 __u16 ibytes, hbytes, padbits; 225 __u16 ibytes, hbytes, padbits;
228 int i; 226 int i;
229 227
230 /* 228 /*
231 * Parity is the parity of the register number + 1 (READ_REGISTER 229 * Parity is the parity of the register number + 1 (READ_REGISTER
232 * and WRITE_REGISTER always add '1' to the number of bits == 1) 230 * and WRITE_REGISTER always add '1' to the number of bits == 1)
233 */ 231 */
234 parity = (__u8)(1 + (reg & 0x01) + 232 parity = (__u8) (1 + (reg & 0x01) +
235 ((__u8)(reg & 0x02) >> 1) + 233 ((__u8) (reg & 0x02) >> 1) +
236 ((__u8)(reg & 0x04) >> 2) + 234 ((__u8) (reg & 0x04) >> 2) +
237 ((__u8)(reg & 0x08) >> 3)) % 2; 235 ((__u8) (reg & 0x08) >> 3)) % 2;
238 236
239 inst = ((parity << 7) | (reg << 2) | op); 237 inst = ((parity << 7) | (reg << 2) | op);
240 238
241 outb(VOYAGER_CAT_IRCYC, CAT_CMD); 239 outb(VOYAGER_CAT_IRCYC, CAT_CMD);
242 if(!modp->scan_path_connected) { 240 if (!modp->scan_path_connected) {
243 if(asicp->asic_id != VOYAGER_CAT_ID) { 241 if (asicp->asic_id != VOYAGER_CAT_ID) {
244 printk("**WARNING***: cat_sendinst has disconnected scan path not to CAT asic\n"); 242 printk
243 ("**WARNING***: cat_sendinst has disconnected scan path not to CAT asic\n");
245 return 1; 244 return 1;
246 } 245 }
247 outb(VOYAGER_CAT_HEADER, CAT_DATA); 246 outb(VOYAGER_CAT_HEADER, CAT_DATA);
248 outb(inst, CAT_DATA); 247 outb(inst, CAT_DATA);
249 if(inb(CAT_DATA) != VOYAGER_CAT_HEADER) { 248 if (inb(CAT_DATA) != VOYAGER_CAT_HEADER) {
250 CDEBUG(("VOYAGER CAT: cat_sendinst failed to get CAT_HEADER\n")); 249 CDEBUG(("VOYAGER CAT: cat_sendinst failed to get CAT_HEADER\n"));
251 return 1; 250 return 1;
252 } 251 }
253 return 0; 252 return 0;
254 } 253 }
255 ibytes = modp->inst_bits / BITS_PER_BYTE; 254 ibytes = modp->inst_bits / BITS_PER_BYTE;
256 if((padbits = modp->inst_bits % BITS_PER_BYTE) != 0) { 255 if ((padbits = modp->inst_bits % BITS_PER_BYTE) != 0) {
257 padbits = BITS_PER_BYTE - padbits; 256 padbits = BITS_PER_BYTE - padbits;
258 ibytes++; 257 ibytes++;
259 } 258 }
260 hbytes = modp->largest_reg / BITS_PER_BYTE; 259 hbytes = modp->largest_reg / BITS_PER_BYTE;
261 if(modp->largest_reg % BITS_PER_BYTE) 260 if (modp->largest_reg % BITS_PER_BYTE)
262 hbytes++; 261 hbytes++;
263 CDEBUG(("cat_sendinst: ibytes=%d, hbytes=%d\n", ibytes, hbytes)); 262 CDEBUG(("cat_sendinst: ibytes=%d, hbytes=%d\n", ibytes, hbytes));
264 /* initialise the instruction sequence to 0xff */ 263 /* initialise the instruction sequence to 0xff */
265 for(i=0; i < ibytes + hbytes; i++) 264 for (i = 0; i < ibytes + hbytes; i++)
266 iseq[i] = 0xff; 265 iseq[i] = 0xff;
267 cat_build_header(hseq, hbytes, modp->smallest_reg, modp->largest_reg); 266 cat_build_header(hseq, hbytes, modp->smallest_reg, modp->largest_reg);
268 cat_pack(iseq, modp->inst_bits, hseq, hbytes * BITS_PER_BYTE); 267 cat_pack(iseq, modp->inst_bits, hseq, hbytes * BITS_PER_BYTE);
@@ -271,11 +270,11 @@ cat_sendinst(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, __u8 op)
271 cat_pack(iseq, asicp->bit_location, inst_buf, asicp->ireg_length); 270 cat_pack(iseq, asicp->bit_location, inst_buf, asicp->ireg_length);
272#ifdef VOYAGER_CAT_DEBUG 271#ifdef VOYAGER_CAT_DEBUG
273 printk("ins = 0x%x, iseq: ", inst); 272 printk("ins = 0x%x, iseq: ", inst);
274 for(i=0; i< ibytes + hbytes; i++) 273 for (i = 0; i < ibytes + hbytes; i++)
275 printk("0x%x ", iseq[i]); 274 printk("0x%x ", iseq[i]);
276 printk("\n"); 275 printk("\n");
277#endif 276#endif
278 if(cat_shiftout(iseq, ibytes, hbytes, padbits)) { 277 if (cat_shiftout(iseq, ibytes, hbytes, padbits)) {
279 CDEBUG(("VOYAGER CAT: cat_sendinst: cat_shiftout failed\n")); 278 CDEBUG(("VOYAGER CAT: cat_sendinst: cat_shiftout failed\n"));
280 return 1; 279 return 1;
281 } 280 }
@@ -284,72 +283,74 @@ cat_sendinst(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, __u8 op)
284} 283}
285 284
286static int 285static int
287cat_getdata(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, 286cat_getdata(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg,
288 __u8 *value) 287 __u8 * value)
289{ 288{
290 if(!modp->scan_path_connected) { 289 if (!modp->scan_path_connected) {
291 if(asicp->asic_id != VOYAGER_CAT_ID) { 290 if (asicp->asic_id != VOYAGER_CAT_ID) {
292 CDEBUG(("VOYAGER CAT: ERROR: cat_getdata to CAT asic with scan path connected\n")); 291 CDEBUG(("VOYAGER CAT: ERROR: cat_getdata to CAT asic with scan path connected\n"));
293 return 1; 292 return 1;
294 } 293 }
295 if(reg > VOYAGER_SUBADDRHI) 294 if (reg > VOYAGER_SUBADDRHI)
296 outb(VOYAGER_CAT_RUN, CAT_CMD); 295 outb(VOYAGER_CAT_RUN, CAT_CMD);
297 outb(VOYAGER_CAT_DRCYC, CAT_CMD); 296 outb(VOYAGER_CAT_DRCYC, CAT_CMD);
298 outb(VOYAGER_CAT_HEADER, CAT_DATA); 297 outb(VOYAGER_CAT_HEADER, CAT_DATA);
299 *value = inb(CAT_DATA); 298 *value = inb(CAT_DATA);
300 outb(0xAA, CAT_DATA); 299 outb(0xAA, CAT_DATA);
301 if(inb(CAT_DATA) != VOYAGER_CAT_HEADER) { 300 if (inb(CAT_DATA) != VOYAGER_CAT_HEADER) {
302 CDEBUG(("cat_getdata: failed to get VOYAGER_CAT_HEADER\n")); 301 CDEBUG(("cat_getdata: failed to get VOYAGER_CAT_HEADER\n"));
303 return 1; 302 return 1;
304 } 303 }
305 return 0; 304 return 0;
306 } 305 } else {
307 else { 306 __u16 sbits = modp->num_asics - 1 + asicp->ireg_length;
308 __u16 sbits = modp->num_asics -1 + asicp->ireg_length;
309 __u16 sbytes = sbits / BITS_PER_BYTE; 307 __u16 sbytes = sbits / BITS_PER_BYTE;
310 __u16 tbytes; 308 __u16 tbytes;
311 __u8 string[VOYAGER_MAX_SCAN_PATH], trailer[VOYAGER_MAX_REG_SIZE]; 309 __u8 string[VOYAGER_MAX_SCAN_PATH],
310 trailer[VOYAGER_MAX_REG_SIZE];
312 __u8 padbits; 311 __u8 padbits;
313 int i; 312 int i;
314 313
315 outb(VOYAGER_CAT_DRCYC, CAT_CMD); 314 outb(VOYAGER_CAT_DRCYC, CAT_CMD);
316 315
317 if((padbits = sbits % BITS_PER_BYTE) != 0) { 316 if ((padbits = sbits % BITS_PER_BYTE) != 0) {
318 padbits = BITS_PER_BYTE - padbits; 317 padbits = BITS_PER_BYTE - padbits;
319 sbytes++; 318 sbytes++;
320 } 319 }
321 tbytes = asicp->ireg_length / BITS_PER_BYTE; 320 tbytes = asicp->ireg_length / BITS_PER_BYTE;
322 if(asicp->ireg_length % BITS_PER_BYTE) 321 if (asicp->ireg_length % BITS_PER_BYTE)
323 tbytes++; 322 tbytes++;
324 CDEBUG(("cat_getdata: tbytes = %d, sbytes = %d, padbits = %d\n", 323 CDEBUG(("cat_getdata: tbytes = %d, sbytes = %d, padbits = %d\n",
325 tbytes, sbytes, padbits)); 324 tbytes, sbytes, padbits));
326 cat_build_header(trailer, tbytes, 1, asicp->ireg_length); 325 cat_build_header(trailer, tbytes, 1, asicp->ireg_length);
327 326
328 327 for (i = tbytes - 1; i >= 0; i--) {
329 for(i = tbytes - 1; i >= 0; i--) {
330 outb(trailer[i], CAT_DATA); 328 outb(trailer[i], CAT_DATA);
331 string[sbytes + i] = inb(CAT_DATA); 329 string[sbytes + i] = inb(CAT_DATA);
332 } 330 }
333 331
334 for(i = sbytes - 1; i >= 0; i--) { 332 for (i = sbytes - 1; i >= 0; i--) {
335 outb(0xaa, CAT_DATA); 333 outb(0xaa, CAT_DATA);
336 string[i] = inb(CAT_DATA); 334 string[i] = inb(CAT_DATA);
337 } 335 }
338 *value = 0; 336 *value = 0;
339 cat_unpack(string, padbits + (tbytes * BITS_PER_BYTE) + asicp->asic_location, value, asicp->ireg_length); 337 cat_unpack(string,
338 padbits + (tbytes * BITS_PER_BYTE) +
339 asicp->asic_location, value, asicp->ireg_length);
340#ifdef VOYAGER_CAT_DEBUG 340#ifdef VOYAGER_CAT_DEBUG
341 printk("value=0x%x, string: ", *value); 341 printk("value=0x%x, string: ", *value);
342 for(i=0; i< tbytes+sbytes; i++) 342 for (i = 0; i < tbytes + sbytes; i++)
343 printk("0x%x ", string[i]); 343 printk("0x%x ", string[i]);
344 printk("\n"); 344 printk("\n");
345#endif 345#endif
346 346
347 /* sanity check the rest of the return */ 347 /* sanity check the rest of the return */
348 for(i=0; i < tbytes; i++) { 348 for (i = 0; i < tbytes; i++) {
349 __u8 input = 0; 349 __u8 input = 0;
350 350
351 cat_unpack(string, padbits + (i * BITS_PER_BYTE), &input, BITS_PER_BYTE); 351 cat_unpack(string, padbits + (i * BITS_PER_BYTE),
352 if(trailer[i] != input) { 352 &input, BITS_PER_BYTE);
353 if (trailer[i] != input) {
353 CDEBUG(("cat_getdata: failed to sanity check rest of ret(%d) 0x%x != 0x%x\n", i, input, trailer[i])); 354 CDEBUG(("cat_getdata: failed to sanity check rest of ret(%d) 0x%x != 0x%x\n", i, input, trailer[i]));
354 return 1; 355 return 1;
355 } 356 }
@@ -360,14 +361,14 @@ cat_getdata(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg,
360} 361}
361 362
362static int 363static int
363cat_shiftout(__u8 *data, __u16 data_bytes, __u16 header_bytes, __u8 pad_bits) 364cat_shiftout(__u8 * data, __u16 data_bytes, __u16 header_bytes, __u8 pad_bits)
364{ 365{
365 int i; 366 int i;
366 367
367 for(i = data_bytes + header_bytes - 1; i >= header_bytes; i--) 368 for (i = data_bytes + header_bytes - 1; i >= header_bytes; i--)
368 outb(data[i], CAT_DATA); 369 outb(data[i], CAT_DATA);
369 370
370 for(i = header_bytes - 1; i >= 0; i--) { 371 for (i = header_bytes - 1; i >= 0; i--) {
371 __u8 header = 0; 372 __u8 header = 0;
372 __u8 input; 373 __u8 input;
373 374
@@ -376,7 +377,7 @@ cat_shiftout(__u8 *data, __u16 data_bytes, __u16 header_bytes, __u8 pad_bits)
376 CDEBUG(("cat_shiftout: returned 0x%x\n", input)); 377 CDEBUG(("cat_shiftout: returned 0x%x\n", input));
377 cat_unpack(data, ((data_bytes + i) * BITS_PER_BYTE) - pad_bits, 378 cat_unpack(data, ((data_bytes + i) * BITS_PER_BYTE) - pad_bits,
378 &header, BITS_PER_BYTE); 379 &header, BITS_PER_BYTE);
379 if(input != header) { 380 if (input != header) {
380 CDEBUG(("VOYAGER CAT: cat_shiftout failed to return header 0x%x != 0x%x\n", input, header)); 381 CDEBUG(("VOYAGER CAT: cat_shiftout failed to return header 0x%x != 0x%x\n", input, header));
381 return 1; 382 return 1;
382 } 383 }
@@ -385,57 +386,57 @@ cat_shiftout(__u8 *data, __u16 data_bytes, __u16 header_bytes, __u8 pad_bits)
385} 386}
386 387
387static int 388static int
388cat_senddata(voyager_module_t *modp, voyager_asic_t *asicp, 389cat_senddata(voyager_module_t * modp, voyager_asic_t * asicp,
389 __u8 reg, __u8 value) 390 __u8 reg, __u8 value)
390{ 391{
391 outb(VOYAGER_CAT_DRCYC, CAT_CMD); 392 outb(VOYAGER_CAT_DRCYC, CAT_CMD);
392 if(!modp->scan_path_connected) { 393 if (!modp->scan_path_connected) {
393 if(asicp->asic_id != VOYAGER_CAT_ID) { 394 if (asicp->asic_id != VOYAGER_CAT_ID) {
394 CDEBUG(("VOYAGER CAT: ERROR: scan path disconnected when asic != CAT\n")); 395 CDEBUG(("VOYAGER CAT: ERROR: scan path disconnected when asic != CAT\n"));
395 return 1; 396 return 1;
396 } 397 }
397 outb(VOYAGER_CAT_HEADER, CAT_DATA); 398 outb(VOYAGER_CAT_HEADER, CAT_DATA);
398 outb(value, CAT_DATA); 399 outb(value, CAT_DATA);
399 if(inb(CAT_DATA) != VOYAGER_CAT_HEADER) { 400 if (inb(CAT_DATA) != VOYAGER_CAT_HEADER) {
400 CDEBUG(("cat_senddata: failed to get correct header response to sent data\n")); 401 CDEBUG(("cat_senddata: failed to get correct header response to sent data\n"));
401 return 1; 402 return 1;
402 } 403 }
403 if(reg > VOYAGER_SUBADDRHI) { 404 if (reg > VOYAGER_SUBADDRHI) {
404 outb(VOYAGER_CAT_RUN, CAT_CMD); 405 outb(VOYAGER_CAT_RUN, CAT_CMD);
405 outb(VOYAGER_CAT_END, CAT_CMD); 406 outb(VOYAGER_CAT_END, CAT_CMD);
406 outb(VOYAGER_CAT_RUN, CAT_CMD); 407 outb(VOYAGER_CAT_RUN, CAT_CMD);
407 } 408 }
408 409
409 return 0; 410 return 0;
410 } 411 } else {
411 else {
412 __u16 hbytes = asicp->ireg_length / BITS_PER_BYTE; 412 __u16 hbytes = asicp->ireg_length / BITS_PER_BYTE;
413 __u16 dbytes = (modp->num_asics - 1 + asicp->ireg_length)/BITS_PER_BYTE; 413 __u16 dbytes =
414 __u8 padbits, dseq[VOYAGER_MAX_SCAN_PATH], 414 (modp->num_asics - 1 + asicp->ireg_length) / BITS_PER_BYTE;
415 hseq[VOYAGER_MAX_REG_SIZE]; 415 __u8 padbits, dseq[VOYAGER_MAX_SCAN_PATH],
416 hseq[VOYAGER_MAX_REG_SIZE];
416 int i; 417 int i;
417 418
418 if((padbits = (modp->num_asics - 1 419 if ((padbits = (modp->num_asics - 1
419 + asicp->ireg_length) % BITS_PER_BYTE) != 0) { 420 + asicp->ireg_length) % BITS_PER_BYTE) != 0) {
420 padbits = BITS_PER_BYTE - padbits; 421 padbits = BITS_PER_BYTE - padbits;
421 dbytes++; 422 dbytes++;
422 } 423 }
423 if(asicp->ireg_length % BITS_PER_BYTE) 424 if (asicp->ireg_length % BITS_PER_BYTE)
424 hbytes++; 425 hbytes++;
425 426
426 cat_build_header(hseq, hbytes, 1, asicp->ireg_length); 427 cat_build_header(hseq, hbytes, 1, asicp->ireg_length);
427 428
428 for(i = 0; i < dbytes + hbytes; i++) 429 for (i = 0; i < dbytes + hbytes; i++)
429 dseq[i] = 0xff; 430 dseq[i] = 0xff;
430 CDEBUG(("cat_senddata: dbytes=%d, hbytes=%d, padbits=%d\n", 431 CDEBUG(("cat_senddata: dbytes=%d, hbytes=%d, padbits=%d\n",
431 dbytes, hbytes, padbits)); 432 dbytes, hbytes, padbits));
432 cat_pack(dseq, modp->num_asics - 1 + asicp->ireg_length, 433 cat_pack(dseq, modp->num_asics - 1 + asicp->ireg_length,
433 hseq, hbytes * BITS_PER_BYTE); 434 hseq, hbytes * BITS_PER_BYTE);
434 cat_pack(dseq, asicp->asic_location, &value, 435 cat_pack(dseq, asicp->asic_location, &value,
435 asicp->ireg_length); 436 asicp->ireg_length);
436#ifdef VOYAGER_CAT_DEBUG 437#ifdef VOYAGER_CAT_DEBUG
437 printk("dseq "); 438 printk("dseq ");
438 for(i=0; i<hbytes+dbytes; i++) { 439 for (i = 0; i < hbytes + dbytes; i++) {
439 printk("0x%x ", dseq[i]); 440 printk("0x%x ", dseq[i]);
440 } 441 }
441 printk("\n"); 442 printk("\n");
@@ -445,121 +446,125 @@ cat_senddata(voyager_module_t *modp, voyager_asic_t *asicp,
445} 446}
446 447
447static int 448static int
448cat_write(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, 449cat_write(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg, __u8 value)
449 __u8 value)
450{ 450{
451 if(cat_sendinst(modp, asicp, reg, VOYAGER_WRITE_CONFIG)) 451 if (cat_sendinst(modp, asicp, reg, VOYAGER_WRITE_CONFIG))
452 return 1; 452 return 1;
453 return cat_senddata(modp, asicp, reg, value); 453 return cat_senddata(modp, asicp, reg, value);
454} 454}
455 455
456static int 456static int
457cat_read(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, 457cat_read(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg,
458 __u8 *value) 458 __u8 * value)
459{ 459{
460 if(cat_sendinst(modp, asicp, reg, VOYAGER_READ_CONFIG)) 460 if (cat_sendinst(modp, asicp, reg, VOYAGER_READ_CONFIG))
461 return 1; 461 return 1;
462 return cat_getdata(modp, asicp, reg, value); 462 return cat_getdata(modp, asicp, reg, value);
463} 463}
464 464
465static int 465static int
466cat_subaddrsetup(voyager_module_t *modp, voyager_asic_t *asicp, __u16 offset, 466cat_subaddrsetup(voyager_module_t * modp, voyager_asic_t * asicp, __u16 offset,
467 __u16 len) 467 __u16 len)
468{ 468{
469 __u8 val; 469 __u8 val;
470 470
471 if(len > 1) { 471 if (len > 1) {
472 /* set auto increment */ 472 /* set auto increment */
473 __u8 newval; 473 __u8 newval;
474 474
475 if(cat_read(modp, asicp, VOYAGER_AUTO_INC_REG, &val)) { 475 if (cat_read(modp, asicp, VOYAGER_AUTO_INC_REG, &val)) {
476 CDEBUG(("cat_subaddrsetup: read of VOYAGER_AUTO_INC_REG failed\n")); 476 CDEBUG(("cat_subaddrsetup: read of VOYAGER_AUTO_INC_REG failed\n"));
477 return 1; 477 return 1;
478 } 478 }
479 CDEBUG(("cat_subaddrsetup: VOYAGER_AUTO_INC_REG = 0x%x\n", val)); 479 CDEBUG(("cat_subaddrsetup: VOYAGER_AUTO_INC_REG = 0x%x\n",
480 val));
480 newval = val | VOYAGER_AUTO_INC; 481 newval = val | VOYAGER_AUTO_INC;
481 if(newval != val) { 482 if (newval != val) {
482 if(cat_write(modp, asicp, VOYAGER_AUTO_INC_REG, val)) { 483 if (cat_write(modp, asicp, VOYAGER_AUTO_INC_REG, val)) {
483 CDEBUG(("cat_subaddrsetup: write to VOYAGER_AUTO_INC_REG failed\n")); 484 CDEBUG(("cat_subaddrsetup: write to VOYAGER_AUTO_INC_REG failed\n"));
484 return 1; 485 return 1;
485 } 486 }
486 } 487 }
487 } 488 }
488 if(cat_write(modp, asicp, VOYAGER_SUBADDRLO, (__u8)(offset &0xff))) { 489 if (cat_write(modp, asicp, VOYAGER_SUBADDRLO, (__u8) (offset & 0xff))) {
489 CDEBUG(("cat_subaddrsetup: write to SUBADDRLO failed\n")); 490 CDEBUG(("cat_subaddrsetup: write to SUBADDRLO failed\n"));
490 return 1; 491 return 1;
491 } 492 }
492 if(asicp->subaddr > VOYAGER_SUBADDR_LO) { 493 if (asicp->subaddr > VOYAGER_SUBADDR_LO) {
493 if(cat_write(modp, asicp, VOYAGER_SUBADDRHI, (__u8)(offset >> 8))) { 494 if (cat_write
495 (modp, asicp, VOYAGER_SUBADDRHI, (__u8) (offset >> 8))) {
494 CDEBUG(("cat_subaddrsetup: write to SUBADDRHI failed\n")); 496 CDEBUG(("cat_subaddrsetup: write to SUBADDRHI failed\n"));
495 return 1; 497 return 1;
496 } 498 }
497 cat_read(modp, asicp, VOYAGER_SUBADDRHI, &val); 499 cat_read(modp, asicp, VOYAGER_SUBADDRHI, &val);
498 CDEBUG(("cat_subaddrsetup: offset = %d, hi = %d\n", offset, val)); 500 CDEBUG(("cat_subaddrsetup: offset = %d, hi = %d\n", offset,
501 val));
499 } 502 }
500 cat_read(modp, asicp, VOYAGER_SUBADDRLO, &val); 503 cat_read(modp, asicp, VOYAGER_SUBADDRLO, &val);
501 CDEBUG(("cat_subaddrsetup: offset = %d, lo = %d\n", offset, val)); 504 CDEBUG(("cat_subaddrsetup: offset = %d, lo = %d\n", offset, val));
502 return 0; 505 return 0;
503} 506}
504 507
505static int 508static int
506cat_subwrite(voyager_module_t *modp, voyager_asic_t *asicp, __u16 offset, 509cat_subwrite(voyager_module_t * modp, voyager_asic_t * asicp, __u16 offset,
507 __u16 len, void *buf) 510 __u16 len, void *buf)
508{ 511{
509 int i, retval; 512 int i, retval;
510 513
511 /* FIXME: need special actions for VOYAGER_CAT_ID here */ 514 /* FIXME: need special actions for VOYAGER_CAT_ID here */
512 if(asicp->asic_id == VOYAGER_CAT_ID) { 515 if (asicp->asic_id == VOYAGER_CAT_ID) {
513 CDEBUG(("cat_subwrite: ATTEMPT TO WRITE TO CAT ASIC\n")); 516 CDEBUG(("cat_subwrite: ATTEMPT TO WRITE TO CAT ASIC\n"));
514 /* FIXME -- This is supposed to be handled better 517 /* FIXME -- This is supposed to be handled better
515 * There is a problem writing to the cat asic in the 518 * There is a problem writing to the cat asic in the
516 * PSI. The 30us delay seems to work, though */ 519 * PSI. The 30us delay seems to work, though */
517 udelay(30); 520 udelay(30);
518 } 521 }
519 522
520 if((retval = cat_subaddrsetup(modp, asicp, offset, len)) != 0) { 523 if ((retval = cat_subaddrsetup(modp, asicp, offset, len)) != 0) {
521 printk("cat_subwrite: cat_subaddrsetup FAILED\n"); 524 printk("cat_subwrite: cat_subaddrsetup FAILED\n");
522 return retval; 525 return retval;
523 } 526 }
524 527
525 if(cat_sendinst(modp, asicp, VOYAGER_SUBADDRDATA, VOYAGER_WRITE_CONFIG)) { 528 if (cat_sendinst
529 (modp, asicp, VOYAGER_SUBADDRDATA, VOYAGER_WRITE_CONFIG)) {
526 printk("cat_subwrite: cat_sendinst FAILED\n"); 530 printk("cat_subwrite: cat_sendinst FAILED\n");
527 return 1; 531 return 1;
528 } 532 }
529 for(i = 0; i < len; i++) { 533 for (i = 0; i < len; i++) {
530 if(cat_senddata(modp, asicp, 0xFF, ((__u8 *)buf)[i])) { 534 if (cat_senddata(modp, asicp, 0xFF, ((__u8 *) buf)[i])) {
531 printk("cat_subwrite: cat_sendata element at %d FAILED\n", i); 535 printk
536 ("cat_subwrite: cat_sendata element at %d FAILED\n",
537 i);
532 return 1; 538 return 1;
533 } 539 }
534 } 540 }
535 return 0; 541 return 0;
536} 542}
537static int 543static int
538cat_subread(voyager_module_t *modp, voyager_asic_t *asicp, __u16 offset, 544cat_subread(voyager_module_t * modp, voyager_asic_t * asicp, __u16 offset,
539 __u16 len, void *buf) 545 __u16 len, void *buf)
540{ 546{
541 int i, retval; 547 int i, retval;
542 548
543 if((retval = cat_subaddrsetup(modp, asicp, offset, len)) != 0) { 549 if ((retval = cat_subaddrsetup(modp, asicp, offset, len)) != 0) {
544 CDEBUG(("cat_subread: cat_subaddrsetup FAILED\n")); 550 CDEBUG(("cat_subread: cat_subaddrsetup FAILED\n"));
545 return retval; 551 return retval;
546 } 552 }
547 553
548 if(cat_sendinst(modp, asicp, VOYAGER_SUBADDRDATA, VOYAGER_READ_CONFIG)) { 554 if (cat_sendinst(modp, asicp, VOYAGER_SUBADDRDATA, VOYAGER_READ_CONFIG)) {
549 CDEBUG(("cat_subread: cat_sendinst failed\n")); 555 CDEBUG(("cat_subread: cat_sendinst failed\n"));
550 return 1; 556 return 1;
551 } 557 }
552 for(i = 0; i < len; i++) { 558 for (i = 0; i < len; i++) {
553 if(cat_getdata(modp, asicp, 0xFF, 559 if (cat_getdata(modp, asicp, 0xFF, &((__u8 *) buf)[i])) {
554 &((__u8 *)buf)[i])) { 560 CDEBUG(("cat_subread: cat_getdata element %d failed\n",
555 CDEBUG(("cat_subread: cat_getdata element %d failed\n", i)); 561 i));
556 return 1; 562 return 1;
557 } 563 }
558 } 564 }
559 return 0; 565 return 0;
560} 566}
561 567
562
563/* buffer for storing EPROM data read in during initialisation */ 568/* buffer for storing EPROM data read in during initialisation */
564static __initdata __u8 eprom_buf[0xFFFF]; 569static __initdata __u8 eprom_buf[0xFFFF];
565static voyager_module_t *voyager_initial_module; 570static voyager_module_t *voyager_initial_module;
@@ -568,8 +573,7 @@ static voyager_module_t *voyager_initial_module;
568 * boot cpu *after* all memory initialisation has been done (so we can 573 * boot cpu *after* all memory initialisation has been done (so we can
569 * use kmalloc) but before smp initialisation, so we can probe the SMP 574 * use kmalloc) but before smp initialisation, so we can probe the SMP
570 * configuration and pick up necessary information. */ 575 * configuration and pick up necessary information. */
571void __init 576void __init voyager_cat_init(void)
572voyager_cat_init(void)
573{ 577{
574 voyager_module_t **modpp = &voyager_initial_module; 578 voyager_module_t **modpp = &voyager_initial_module;
575 voyager_asic_t **asicpp; 579 voyager_asic_t **asicpp;
@@ -578,27 +582,29 @@ voyager_cat_init(void)
578 unsigned long qic_addr = 0; 582 unsigned long qic_addr = 0;
579 __u8 qabc_data[0x20]; 583 __u8 qabc_data[0x20];
580 __u8 num_submodules, val; 584 __u8 num_submodules, val;
581 voyager_eprom_hdr_t *eprom_hdr = (voyager_eprom_hdr_t *)&eprom_buf[0]; 585 voyager_eprom_hdr_t *eprom_hdr = (voyager_eprom_hdr_t *) & eprom_buf[0];
582 586
583 __u8 cmos[4]; 587 __u8 cmos[4];
584 unsigned long addr; 588 unsigned long addr;
585 589
586 /* initiallise the SUS mailbox */ 590 /* initiallise the SUS mailbox */
587 for(i=0; i<sizeof(cmos); i++) 591 for (i = 0; i < sizeof(cmos); i++)
588 cmos[i] = voyager_extended_cmos_read(VOYAGER_DUMP_LOCATION + i); 592 cmos[i] = voyager_extended_cmos_read(VOYAGER_DUMP_LOCATION + i);
589 addr = *(unsigned long *)cmos; 593 addr = *(unsigned long *)cmos;
590 if((addr & 0xff000000) != 0xff000000) { 594 if ((addr & 0xff000000) != 0xff000000) {
591 printk(KERN_ERR "Voyager failed to get SUS mailbox (addr = 0x%lx\n", addr); 595 printk(KERN_ERR
596 "Voyager failed to get SUS mailbox (addr = 0x%lx\n",
597 addr);
592 } else { 598 } else {
593 static struct resource res; 599 static struct resource res;
594 600
595 res.name = "voyager SUS"; 601 res.name = "voyager SUS";
596 res.start = addr; 602 res.start = addr;
597 res.end = addr+0x3ff; 603 res.end = addr + 0x3ff;
598 604
599 request_resource(&iomem_resource, &res); 605 request_resource(&iomem_resource, &res);
600 voyager_SUS = (struct voyager_SUS *) 606 voyager_SUS = (struct voyager_SUS *)
601 ioremap(addr, 0x400); 607 ioremap(addr, 0x400);
602 printk(KERN_NOTICE "Voyager SUS mailbox version 0x%x\n", 608 printk(KERN_NOTICE "Voyager SUS mailbox version 0x%x\n",
603 voyager_SUS->SUS_version); 609 voyager_SUS->SUS_version);
604 voyager_SUS->kernel_version = VOYAGER_MAILBOX_VERSION; 610 voyager_SUS->kernel_version = VOYAGER_MAILBOX_VERSION;
@@ -609,8 +615,6 @@ voyager_cat_init(void)
609 voyager_extended_vic_processors = 0; 615 voyager_extended_vic_processors = 0;
610 voyager_quad_processors = 0; 616 voyager_quad_processors = 0;
611 617
612
613
614 printk("VOYAGER: beginning CAT bus probe\n"); 618 printk("VOYAGER: beginning CAT bus probe\n");
615 /* set up the SuperSet Port Block which tells us where the 619 /* set up the SuperSet Port Block which tells us where the
616 * CAT communication port is */ 620 * CAT communication port is */
@@ -618,14 +622,14 @@ voyager_cat_init(void)
618 VDEBUG(("VOYAGER DEBUG: sspb = 0x%x\n", sspb)); 622 VDEBUG(("VOYAGER DEBUG: sspb = 0x%x\n", sspb));
619 623
620 /* now find out if were 8 slot or normal */ 624 /* now find out if were 8 slot or normal */
621 if((inb(VIC_PROC_WHO_AM_I) & EIGHT_SLOT_IDENTIFIER) 625 if ((inb(VIC_PROC_WHO_AM_I) & EIGHT_SLOT_IDENTIFIER)
622 == EIGHT_SLOT_IDENTIFIER) { 626 == EIGHT_SLOT_IDENTIFIER) {
623 voyager_8slot = 1; 627 voyager_8slot = 1;
624 printk(KERN_NOTICE "Voyager: Eight slot 51xx configuration detected\n"); 628 printk(KERN_NOTICE
629 "Voyager: Eight slot 51xx configuration detected\n");
625 } 630 }
626 631
627 for(i = VOYAGER_MIN_MODULE; 632 for (i = VOYAGER_MIN_MODULE; i <= VOYAGER_MAX_MODULE; i++) {
628 i <= VOYAGER_MAX_MODULE; i++) {
629 __u8 input; 633 __u8 input;
630 int asic; 634 int asic;
631 __u16 eprom_size; 635 __u16 eprom_size;
@@ -643,21 +647,21 @@ voyager_cat_init(void)
643 outb(0xAA, CAT_DATA); 647 outb(0xAA, CAT_DATA);
644 input = inb(CAT_DATA); 648 input = inb(CAT_DATA);
645 outb(VOYAGER_CAT_END, CAT_CMD); 649 outb(VOYAGER_CAT_END, CAT_CMD);
646 if(input != VOYAGER_CAT_HEADER) { 650 if (input != VOYAGER_CAT_HEADER) {
647 continue; 651 continue;
648 } 652 }
649 CDEBUG(("VOYAGER DEBUG: found module id 0x%x, %s\n", i, 653 CDEBUG(("VOYAGER DEBUG: found module id 0x%x, %s\n", i,
650 cat_module_name(i))); 654 cat_module_name(i)));
651 *modpp = kmalloc(sizeof(voyager_module_t), GFP_KERNEL); /*&voyager_module_storage[cat_count++];*/ 655 *modpp = kmalloc(sizeof(voyager_module_t), GFP_KERNEL); /*&voyager_module_storage[cat_count++]; */
652 if(*modpp == NULL) { 656 if (*modpp == NULL) {
653 printk("**WARNING** kmalloc failure in cat_init\n"); 657 printk("**WARNING** kmalloc failure in cat_init\n");
654 continue; 658 continue;
655 } 659 }
656 memset(*modpp, 0, sizeof(voyager_module_t)); 660 memset(*modpp, 0, sizeof(voyager_module_t));
657 /* need temporary asic for cat_subread. It will be 661 /* need temporary asic for cat_subread. It will be
658 * filled in correctly later */ 662 * filled in correctly later */
659 (*modpp)->asic = kmalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count];*/ 663 (*modpp)->asic = kmalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count]; */
660 if((*modpp)->asic == NULL) { 664 if ((*modpp)->asic == NULL) {
661 printk("**WARNING** kmalloc failure in cat_init\n"); 665 printk("**WARNING** kmalloc failure in cat_init\n");
662 continue; 666 continue;
663 } 667 }
@@ -666,47 +670,52 @@ voyager_cat_init(void)
666 (*modpp)->asic->subaddr = VOYAGER_SUBADDR_HI; 670 (*modpp)->asic->subaddr = VOYAGER_SUBADDR_HI;
667 (*modpp)->module_addr = i; 671 (*modpp)->module_addr = i;
668 (*modpp)->scan_path_connected = 0; 672 (*modpp)->scan_path_connected = 0;
669 if(i == VOYAGER_PSI) { 673 if (i == VOYAGER_PSI) {
670 /* Exception leg for modules with no EEPROM */ 674 /* Exception leg for modules with no EEPROM */
671 printk("Module \"%s\"\n", cat_module_name(i)); 675 printk("Module \"%s\"\n", cat_module_name(i));
672 continue; 676 continue;
673 } 677 }
674 678
675 CDEBUG(("cat_init: Reading eeprom for module 0x%x at offset %d\n", i, VOYAGER_XSUM_END_OFFSET)); 679 CDEBUG(("cat_init: Reading eeprom for module 0x%x at offset %d\n", i, VOYAGER_XSUM_END_OFFSET));
676 outb(VOYAGER_CAT_RUN, CAT_CMD); 680 outb(VOYAGER_CAT_RUN, CAT_CMD);
677 cat_disconnect(*modpp, (*modpp)->asic); 681 cat_disconnect(*modpp, (*modpp)->asic);
678 if(cat_subread(*modpp, (*modpp)->asic, 682 if (cat_subread(*modpp, (*modpp)->asic,
679 VOYAGER_XSUM_END_OFFSET, sizeof(eprom_size), 683 VOYAGER_XSUM_END_OFFSET, sizeof(eprom_size),
680 &eprom_size)) { 684 &eprom_size)) {
681 printk("**WARNING**: Voyager couldn't read EPROM size for module 0x%x\n", i); 685 printk
686 ("**WARNING**: Voyager couldn't read EPROM size for module 0x%x\n",
687 i);
682 outb(VOYAGER_CAT_END, CAT_CMD); 688 outb(VOYAGER_CAT_END, CAT_CMD);
683 continue; 689 continue;
684 } 690 }
685 if(eprom_size > sizeof(eprom_buf)) { 691 if (eprom_size > sizeof(eprom_buf)) {
686 printk("**WARNING**: Voyager insufficient size to read EPROM data, module 0x%x. Need %d\n", i, eprom_size); 692 printk
693 ("**WARNING**: Voyager insufficient size to read EPROM data, module 0x%x. Need %d\n",
694 i, eprom_size);
687 outb(VOYAGER_CAT_END, CAT_CMD); 695 outb(VOYAGER_CAT_END, CAT_CMD);
688 continue; 696 continue;
689 } 697 }
690 outb(VOYAGER_CAT_END, CAT_CMD); 698 outb(VOYAGER_CAT_END, CAT_CMD);
691 outb(VOYAGER_CAT_RUN, CAT_CMD); 699 outb(VOYAGER_CAT_RUN, CAT_CMD);
692 CDEBUG(("cat_init: module 0x%x, eeprom_size %d\n", i, eprom_size)); 700 CDEBUG(("cat_init: module 0x%x, eeprom_size %d\n", i,
693 if(cat_subread(*modpp, (*modpp)->asic, 0, 701 eprom_size));
694 eprom_size, eprom_buf)) { 702 if (cat_subread
703 (*modpp, (*modpp)->asic, 0, eprom_size, eprom_buf)) {
695 outb(VOYAGER_CAT_END, CAT_CMD); 704 outb(VOYAGER_CAT_END, CAT_CMD);
696 continue; 705 continue;
697 } 706 }
698 outb(VOYAGER_CAT_END, CAT_CMD); 707 outb(VOYAGER_CAT_END, CAT_CMD);
699 printk("Module \"%s\", version 0x%x, tracer 0x%x, asics %d\n", 708 printk("Module \"%s\", version 0x%x, tracer 0x%x, asics %d\n",
700 cat_module_name(i), eprom_hdr->version_id, 709 cat_module_name(i), eprom_hdr->version_id,
701 *((__u32 *)eprom_hdr->tracer), eprom_hdr->num_asics); 710 *((__u32 *) eprom_hdr->tracer), eprom_hdr->num_asics);
702 (*modpp)->ee_size = eprom_hdr->ee_size; 711 (*modpp)->ee_size = eprom_hdr->ee_size;
703 (*modpp)->num_asics = eprom_hdr->num_asics; 712 (*modpp)->num_asics = eprom_hdr->num_asics;
704 asicpp = &((*modpp)->asic); 713 asicpp = &((*modpp)->asic);
705 sp_offset = eprom_hdr->scan_path_offset; 714 sp_offset = eprom_hdr->scan_path_offset;
706 /* All we really care about are the Quad cards. We 715 /* All we really care about are the Quad cards. We
707 * identify them because they are in a processor slot 716 * identify them because they are in a processor slot
708 * and have only four asics */ 717 * and have only four asics */
709 if((i < 0x10 || (i>=0x14 && i < 0x1c) || i>0x1f)) { 718 if ((i < 0x10 || (i >= 0x14 && i < 0x1c) || i > 0x1f)) {
710 modpp = &((*modpp)->next); 719 modpp = &((*modpp)->next);
711 continue; 720 continue;
712 } 721 }
@@ -717,16 +726,17 @@ voyager_cat_init(void)
717 &num_submodules); 726 &num_submodules);
718 /* lowest two bits, active low */ 727 /* lowest two bits, active low */
719 num_submodules = ~(0xfc | num_submodules); 728 num_submodules = ~(0xfc | num_submodules);
720 CDEBUG(("VOYAGER CAT: %d submodules present\n", num_submodules)); 729 CDEBUG(("VOYAGER CAT: %d submodules present\n",
721 if(num_submodules == 0) { 730 num_submodules));
731 if (num_submodules == 0) {
722 /* fill in the dyadic extended processors */ 732 /* fill in the dyadic extended processors */
723 __u8 cpu = i & 0x07; 733 __u8 cpu = i & 0x07;
724 734
725 printk("Module \"%s\": Dyadic Processor Card\n", 735 printk("Module \"%s\": Dyadic Processor Card\n",
726 cat_module_name(i)); 736 cat_module_name(i));
727 voyager_extended_vic_processors |= (1<<cpu); 737 voyager_extended_vic_processors |= (1 << cpu);
728 cpu += 4; 738 cpu += 4;
729 voyager_extended_vic_processors |= (1<<cpu); 739 voyager_extended_vic_processors |= (1 << cpu);
730 outb(VOYAGER_CAT_END, CAT_CMD); 740 outb(VOYAGER_CAT_END, CAT_CMD);
731 continue; 741 continue;
732 } 742 }
@@ -740,28 +750,32 @@ voyager_cat_init(void)
740 cat_write(*modpp, (*modpp)->asic, VOYAGER_SUBMODSELECT, val); 750 cat_write(*modpp, (*modpp)->asic, VOYAGER_SUBMODSELECT, val);
741 751
742 outb(VOYAGER_CAT_END, CAT_CMD); 752 outb(VOYAGER_CAT_END, CAT_CMD);
743
744 753
745 CDEBUG(("cat_init: Reading eeprom for module 0x%x at offset %d\n", i, VOYAGER_XSUM_END_OFFSET)); 754 CDEBUG(("cat_init: Reading eeprom for module 0x%x at offset %d\n", i, VOYAGER_XSUM_END_OFFSET));
746 outb(VOYAGER_CAT_RUN, CAT_CMD); 755 outb(VOYAGER_CAT_RUN, CAT_CMD);
747 cat_disconnect(*modpp, (*modpp)->asic); 756 cat_disconnect(*modpp, (*modpp)->asic);
748 if(cat_subread(*modpp, (*modpp)->asic, 757 if (cat_subread(*modpp, (*modpp)->asic,
749 VOYAGER_XSUM_END_OFFSET, sizeof(eprom_size), 758 VOYAGER_XSUM_END_OFFSET, sizeof(eprom_size),
750 &eprom_size)) { 759 &eprom_size)) {
751 printk("**WARNING**: Voyager couldn't read EPROM size for module 0x%x\n", i); 760 printk
761 ("**WARNING**: Voyager couldn't read EPROM size for module 0x%x\n",
762 i);
752 outb(VOYAGER_CAT_END, CAT_CMD); 763 outb(VOYAGER_CAT_END, CAT_CMD);
753 continue; 764 continue;
754 } 765 }
755 if(eprom_size > sizeof(eprom_buf)) { 766 if (eprom_size > sizeof(eprom_buf)) {
756 printk("**WARNING**: Voyager insufficient size to read EPROM data, module 0x%x. Need %d\n", i, eprom_size); 767 printk
768 ("**WARNING**: Voyager insufficient size to read EPROM data, module 0x%x. Need %d\n",
769 i, eprom_size);
757 outb(VOYAGER_CAT_END, CAT_CMD); 770 outb(VOYAGER_CAT_END, CAT_CMD);
758 continue; 771 continue;
759 } 772 }
760 outb(VOYAGER_CAT_END, CAT_CMD); 773 outb(VOYAGER_CAT_END, CAT_CMD);
761 outb(VOYAGER_CAT_RUN, CAT_CMD); 774 outb(VOYAGER_CAT_RUN, CAT_CMD);
762 CDEBUG(("cat_init: module 0x%x, eeprom_size %d\n", i, eprom_size)); 775 CDEBUG(("cat_init: module 0x%x, eeprom_size %d\n", i,
763 if(cat_subread(*modpp, (*modpp)->asic, 0, 776 eprom_size));
764 eprom_size, eprom_buf)) { 777 if (cat_subread
778 (*modpp, (*modpp)->asic, 0, eprom_size, eprom_buf)) {
765 outb(VOYAGER_CAT_END, CAT_CMD); 779 outb(VOYAGER_CAT_END, CAT_CMD);
766 continue; 780 continue;
767 } 781 }
@@ -773,30 +787,35 @@ voyager_cat_init(void)
773 sp_offset = eprom_hdr->scan_path_offset; 787 sp_offset = eprom_hdr->scan_path_offset;
774 /* get rid of the dummy CAT asic and read the real one */ 788 /* get rid of the dummy CAT asic and read the real one */
775 kfree((*modpp)->asic); 789 kfree((*modpp)->asic);
776 for(asic=0; asic < (*modpp)->num_asics; asic++) { 790 for (asic = 0; asic < (*modpp)->num_asics; asic++) {
777 int j; 791 int j;
778 voyager_asic_t *asicp = *asicpp 792 voyager_asic_t *asicp = *asicpp = kzalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count++]; */
779 = kzalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count++];*/
780 voyager_sp_table_t *sp_table; 793 voyager_sp_table_t *sp_table;
781 voyager_at_t *asic_table; 794 voyager_at_t *asic_table;
782 voyager_jtt_t *jtag_table; 795 voyager_jtt_t *jtag_table;
783 796
784 if(asicp == NULL) { 797 if (asicp == NULL) {
785 printk("**WARNING** kmalloc failure in cat_init\n"); 798 printk
799 ("**WARNING** kmalloc failure in cat_init\n");
786 continue; 800 continue;
787 } 801 }
788 asicpp = &(asicp->next); 802 asicpp = &(asicp->next);
789 asicp->asic_location = asic; 803 asicp->asic_location = asic;
790 sp_table = (voyager_sp_table_t *)(eprom_buf + sp_offset); 804 sp_table =
805 (voyager_sp_table_t *) (eprom_buf + sp_offset);
791 asicp->asic_id = sp_table->asic_id; 806 asicp->asic_id = sp_table->asic_id;
792 asic_table = (voyager_at_t *)(eprom_buf + sp_table->asic_data_offset); 807 asic_table =
793 for(j=0; j<4; j++) 808 (voyager_at_t *) (eprom_buf +
809 sp_table->asic_data_offset);
810 for (j = 0; j < 4; j++)
794 asicp->jtag_id[j] = asic_table->jtag_id[j]; 811 asicp->jtag_id[j] = asic_table->jtag_id[j];
795 jtag_table = (voyager_jtt_t *)(eprom_buf + asic_table->jtag_offset); 812 jtag_table =
813 (voyager_jtt_t *) (eprom_buf +
814 asic_table->jtag_offset);
796 asicp->ireg_length = jtag_table->ireg_len; 815 asicp->ireg_length = jtag_table->ireg_len;
797 asicp->bit_location = (*modpp)->inst_bits; 816 asicp->bit_location = (*modpp)->inst_bits;
798 (*modpp)->inst_bits += asicp->ireg_length; 817 (*modpp)->inst_bits += asicp->ireg_length;
799 if(asicp->ireg_length > (*modpp)->largest_reg) 818 if (asicp->ireg_length > (*modpp)->largest_reg)
800 (*modpp)->largest_reg = asicp->ireg_length; 819 (*modpp)->largest_reg = asicp->ireg_length;
801 if (asicp->ireg_length < (*modpp)->smallest_reg || 820 if (asicp->ireg_length < (*modpp)->smallest_reg ||
802 (*modpp)->smallest_reg == 0) 821 (*modpp)->smallest_reg == 0)
@@ -804,15 +823,13 @@ voyager_cat_init(void)
804 CDEBUG(("asic 0x%x, ireg_length=%d, bit_location=%d\n", 823 CDEBUG(("asic 0x%x, ireg_length=%d, bit_location=%d\n",
805 asicp->asic_id, asicp->ireg_length, 824 asicp->asic_id, asicp->ireg_length,
806 asicp->bit_location)); 825 asicp->bit_location));
807 if(asicp->asic_id == VOYAGER_QUAD_QABC) { 826 if (asicp->asic_id == VOYAGER_QUAD_QABC) {
808 CDEBUG(("VOYAGER CAT: QABC ASIC found\n")); 827 CDEBUG(("VOYAGER CAT: QABC ASIC found\n"));
809 qabc_asic = asicp; 828 qabc_asic = asicp;
810 } 829 }
811 sp_offset += sizeof(voyager_sp_table_t); 830 sp_offset += sizeof(voyager_sp_table_t);
812 } 831 }
813 CDEBUG(("Module inst_bits = %d, largest_reg = %d, smallest_reg=%d\n", 832 CDEBUG(("Module inst_bits = %d, largest_reg = %d, smallest_reg=%d\n", (*modpp)->inst_bits, (*modpp)->largest_reg, (*modpp)->smallest_reg));
814 (*modpp)->inst_bits, (*modpp)->largest_reg,
815 (*modpp)->smallest_reg));
816 /* OK, now we have the QUAD ASICs set up, use them. 833 /* OK, now we have the QUAD ASICs set up, use them.
817 * we need to: 834 * we need to:
818 * 835 *
@@ -828,10 +845,11 @@ voyager_cat_init(void)
828 qic_addr = qabc_data[5] << 8; 845 qic_addr = qabc_data[5] << 8;
829 qic_addr = (qic_addr | qabc_data[6]) << 8; 846 qic_addr = (qic_addr | qabc_data[6]) << 8;
830 qic_addr = (qic_addr | qabc_data[7]) << 8; 847 qic_addr = (qic_addr | qabc_data[7]) << 8;
831 printk("Module \"%s\": Quad Processor Card; CPI 0x%lx, SET=0x%x\n", 848 printk
832 cat_module_name(i), qic_addr, qabc_data[8]); 849 ("Module \"%s\": Quad Processor Card; CPI 0x%lx, SET=0x%x\n",
850 cat_module_name(i), qic_addr, qabc_data[8]);
833#if 0 /* plumbing fails---FIXME */ 851#if 0 /* plumbing fails---FIXME */
834 if((qabc_data[8] & 0xf0) == 0) { 852 if ((qabc_data[8] & 0xf0) == 0) {
835 /* FIXME: 32 way 8 CPU slot monster cannot be 853 /* FIXME: 32 way 8 CPU slot monster cannot be
836 * plumbed this way---need to check for it */ 854 * plumbed this way---need to check for it */
837 855
@@ -842,94 +860,97 @@ voyager_cat_init(void)
842#ifdef VOYAGER_CAT_DEBUG 860#ifdef VOYAGER_CAT_DEBUG
843 /* verify plumbing */ 861 /* verify plumbing */
844 cat_subread(*modpp, qabc_asic, 8, 1, &qabc_data[8]); 862 cat_subread(*modpp, qabc_asic, 8, 1, &qabc_data[8]);
845 if((qabc_data[8] & 0xf0) == 0) { 863 if ((qabc_data[8] & 0xf0) == 0) {
846 CDEBUG(("PLUMBING FAILED: 0x%x\n", qabc_data[8])); 864 CDEBUG(("PLUMBING FAILED: 0x%x\n",
865 qabc_data[8]));
847 } 866 }
848#endif 867#endif
849 } 868 }
850#endif 869#endif
851 870
852 { 871 {
853 struct resource *res = kzalloc(sizeof(struct resource),GFP_KERNEL); 872 struct resource *res =
873 kzalloc(sizeof(struct resource), GFP_KERNEL);
854 res->name = kmalloc(128, GFP_KERNEL); 874 res->name = kmalloc(128, GFP_KERNEL);
855 sprintf((char *)res->name, "Voyager %s Quad CPI", cat_module_name(i)); 875 sprintf((char *)res->name, "Voyager %s Quad CPI",
876 cat_module_name(i));
856 res->start = qic_addr; 877 res->start = qic_addr;
857 res->end = qic_addr + 0x3ff; 878 res->end = qic_addr + 0x3ff;
858 request_resource(&iomem_resource, res); 879 request_resource(&iomem_resource, res);
859 } 880 }
860 881
861 qic_addr = (unsigned long)ioremap(qic_addr, 0x400); 882 qic_addr = (unsigned long)ioremap(qic_addr, 0x400);
862 883
863 for(j = 0; j < 4; j++) { 884 for (j = 0; j < 4; j++) {
864 __u8 cpu; 885 __u8 cpu;
865 886
866 if(voyager_8slot) { 887 if (voyager_8slot) {
867 /* 8 slot has a different mapping, 888 /* 8 slot has a different mapping,
868 * each slot has only one vic line, so 889 * each slot has only one vic line, so
869 * 1 cpu in each slot must be < 8 */ 890 * 1 cpu in each slot must be < 8 */
870 cpu = (i & 0x07) + j*8; 891 cpu = (i & 0x07) + j * 8;
871 } else { 892 } else {
872 cpu = (i & 0x03) + j*4; 893 cpu = (i & 0x03) + j * 4;
873 } 894 }
874 if( (qabc_data[8] & (1<<j))) { 895 if ((qabc_data[8] & (1 << j))) {
875 voyager_extended_vic_processors |= (1<<cpu); 896 voyager_extended_vic_processors |= (1 << cpu);
876 } 897 }
877 if(qabc_data[8] & (1<<(j+4)) ) { 898 if (qabc_data[8] & (1 << (j + 4))) {
878 /* Second SET register plumbed: Quad 899 /* Second SET register plumbed: Quad
879 * card has two VIC connected CPUs. 900 * card has two VIC connected CPUs.
880 * Secondary cannot be booted as a VIC 901 * Secondary cannot be booted as a VIC
881 * CPU */ 902 * CPU */
882 voyager_extended_vic_processors |= (1<<cpu); 903 voyager_extended_vic_processors |= (1 << cpu);
883 voyager_allowed_boot_processors &= (~(1<<cpu)); 904 voyager_allowed_boot_processors &=
905 (~(1 << cpu));
884 } 906 }
885 907
886 voyager_quad_processors |= (1<<cpu); 908 voyager_quad_processors |= (1 << cpu);
887 voyager_quad_cpi_addr[cpu] = (struct voyager_qic_cpi *) 909 voyager_quad_cpi_addr[cpu] = (struct voyager_qic_cpi *)
888 (qic_addr+(j<<8)); 910 (qic_addr + (j << 8));
889 CDEBUG(("CPU%d: CPI address 0x%lx\n", cpu, 911 CDEBUG(("CPU%d: CPI address 0x%lx\n", cpu,
890 (unsigned long)voyager_quad_cpi_addr[cpu])); 912 (unsigned long)voyager_quad_cpi_addr[cpu]));
891 } 913 }
892 outb(VOYAGER_CAT_END, CAT_CMD); 914 outb(VOYAGER_CAT_END, CAT_CMD);
893 915
894
895
896 *asicpp = NULL; 916 *asicpp = NULL;
897 modpp = &((*modpp)->next); 917 modpp = &((*modpp)->next);
898 } 918 }
899 *modpp = NULL; 919 *modpp = NULL;
900 printk("CAT Bus Initialisation finished: extended procs 0x%x, quad procs 0x%x, allowed vic boot = 0x%x\n", voyager_extended_vic_processors, voyager_quad_processors, voyager_allowed_boot_processors); 920 printk
921 ("CAT Bus Initialisation finished: extended procs 0x%x, quad procs 0x%x, allowed vic boot = 0x%x\n",
922 voyager_extended_vic_processors, voyager_quad_processors,
923 voyager_allowed_boot_processors);
901 request_resource(&ioport_resource, &vic_res); 924 request_resource(&ioport_resource, &vic_res);
902 if(voyager_quad_processors) 925 if (voyager_quad_processors)
903 request_resource(&ioport_resource, &qic_res); 926 request_resource(&ioport_resource, &qic_res);
904 /* set up the front power switch */ 927 /* set up the front power switch */
905} 928}
906 929
907int 930int voyager_cat_readb(__u8 module, __u8 asic, int reg)
908voyager_cat_readb(__u8 module, __u8 asic, int reg)
909{ 931{
910 return 0; 932 return 0;
911} 933}
912 934
913static int 935static int cat_disconnect(voyager_module_t * modp, voyager_asic_t * asicp)
914cat_disconnect(voyager_module_t *modp, voyager_asic_t *asicp)
915{ 936{
916 __u8 val; 937 __u8 val;
917 int err = 0; 938 int err = 0;
918 939
919 if(!modp->scan_path_connected) 940 if (!modp->scan_path_connected)
920 return 0; 941 return 0;
921 if(asicp->asic_id != VOYAGER_CAT_ID) { 942 if (asicp->asic_id != VOYAGER_CAT_ID) {
922 CDEBUG(("cat_disconnect: ASIC is not CAT\n")); 943 CDEBUG(("cat_disconnect: ASIC is not CAT\n"));
923 return 1; 944 return 1;
924 } 945 }
925 err = cat_read(modp, asicp, VOYAGER_SCANPATH, &val); 946 err = cat_read(modp, asicp, VOYAGER_SCANPATH, &val);
926 if(err) { 947 if (err) {
927 CDEBUG(("cat_disconnect: failed to read SCANPATH\n")); 948 CDEBUG(("cat_disconnect: failed to read SCANPATH\n"));
928 return err; 949 return err;
929 } 950 }
930 val &= VOYAGER_DISCONNECT_ASIC; 951 val &= VOYAGER_DISCONNECT_ASIC;
931 err = cat_write(modp, asicp, VOYAGER_SCANPATH, val); 952 err = cat_write(modp, asicp, VOYAGER_SCANPATH, val);
932 if(err) { 953 if (err) {
933 CDEBUG(("cat_disconnect: failed to write SCANPATH\n")); 954 CDEBUG(("cat_disconnect: failed to write SCANPATH\n"));
934 return err; 955 return err;
935 } 956 }
@@ -940,27 +961,26 @@ cat_disconnect(voyager_module_t *modp, voyager_asic_t *asicp)
940 return 0; 961 return 0;
941} 962}
942 963
943static int 964static int cat_connect(voyager_module_t * modp, voyager_asic_t * asicp)
944cat_connect(voyager_module_t *modp, voyager_asic_t *asicp)
945{ 965{
946 __u8 val; 966 __u8 val;
947 int err = 0; 967 int err = 0;
948 968
949 if(modp->scan_path_connected) 969 if (modp->scan_path_connected)
950 return 0; 970 return 0;
951 if(asicp->asic_id != VOYAGER_CAT_ID) { 971 if (asicp->asic_id != VOYAGER_CAT_ID) {
952 CDEBUG(("cat_connect: ASIC is not CAT\n")); 972 CDEBUG(("cat_connect: ASIC is not CAT\n"));
953 return 1; 973 return 1;
954 } 974 }
955 975
956 err = cat_read(modp, asicp, VOYAGER_SCANPATH, &val); 976 err = cat_read(modp, asicp, VOYAGER_SCANPATH, &val);
957 if(err) { 977 if (err) {
958 CDEBUG(("cat_connect: failed to read SCANPATH\n")); 978 CDEBUG(("cat_connect: failed to read SCANPATH\n"));
959 return err; 979 return err;
960 } 980 }
961 val |= VOYAGER_CONNECT_ASIC; 981 val |= VOYAGER_CONNECT_ASIC;
962 err = cat_write(modp, asicp, VOYAGER_SCANPATH, val); 982 err = cat_write(modp, asicp, VOYAGER_SCANPATH, val);
963 if(err) { 983 if (err) {
964 CDEBUG(("cat_connect: failed to write SCANPATH\n")); 984 CDEBUG(("cat_connect: failed to write SCANPATH\n"));
965 return err; 985 return err;
966 } 986 }
@@ -971,11 +991,10 @@ cat_connect(voyager_module_t *modp, voyager_asic_t *asicp)
971 return 0; 991 return 0;
972} 992}
973 993
974void 994void voyager_cat_power_off(void)
975voyager_cat_power_off(void)
976{ 995{
977 /* Power the machine off by writing to the PSI over the CAT 996 /* Power the machine off by writing to the PSI over the CAT
978 * bus */ 997 * bus */
979 __u8 data; 998 __u8 data;
980 voyager_module_t psi = { 0 }; 999 voyager_module_t psi = { 0 };
981 voyager_asic_t psi_asic = { 0 }; 1000 voyager_asic_t psi_asic = { 0 };
@@ -1009,8 +1028,7 @@ voyager_cat_power_off(void)
1009 1028
1010struct voyager_status voyager_status = { 0 }; 1029struct voyager_status voyager_status = { 0 };
1011 1030
1012void 1031void voyager_cat_psi(__u8 cmd, __u16 reg, __u8 * data)
1013voyager_cat_psi(__u8 cmd, __u16 reg, __u8 *data)
1014{ 1032{
1015 voyager_module_t psi = { 0 }; 1033 voyager_module_t psi = { 0 };
1016 voyager_asic_t psi_asic = { 0 }; 1034 voyager_asic_t psi_asic = { 0 };
@@ -1027,7 +1045,7 @@ voyager_cat_psi(__u8 cmd, __u16 reg, __u8 *data)
1027 outb(VOYAGER_PSI, VOYAGER_CAT_CONFIG_PORT); 1045 outb(VOYAGER_PSI, VOYAGER_CAT_CONFIG_PORT);
1028 outb(VOYAGER_CAT_RUN, CAT_CMD); 1046 outb(VOYAGER_CAT_RUN, CAT_CMD);
1029 cat_disconnect(&psi, &psi_asic); 1047 cat_disconnect(&psi, &psi_asic);
1030 switch(cmd) { 1048 switch (cmd) {
1031 case VOYAGER_PSI_READ: 1049 case VOYAGER_PSI_READ:
1032 cat_read(&psi, &psi_asic, reg, data); 1050 cat_read(&psi, &psi_asic, reg, data);
1033 break; 1051 break;
@@ -1047,8 +1065,7 @@ voyager_cat_psi(__u8 cmd, __u16 reg, __u8 *data)
1047 outb(VOYAGER_CAT_END, CAT_CMD); 1065 outb(VOYAGER_CAT_END, CAT_CMD);
1048} 1066}
1049 1067
1050void 1068void voyager_cat_do_common_interrupt(void)
1051voyager_cat_do_common_interrupt(void)
1052{ 1069{
1053 /* This is caused either by a memory parity error or something 1070 /* This is caused either by a memory parity error or something
1054 * in the PSI */ 1071 * in the PSI */
@@ -1057,7 +1074,7 @@ voyager_cat_do_common_interrupt(void)
1057 voyager_asic_t psi_asic = { 0 }; 1074 voyager_asic_t psi_asic = { 0 };
1058 struct voyager_psi psi_reg; 1075 struct voyager_psi psi_reg;
1059 int i; 1076 int i;
1060 re_read: 1077 re_read:
1061 psi.asic = &psi_asic; 1078 psi.asic = &psi_asic;
1062 psi.asic->asic_id = VOYAGER_CAT_ID; 1079 psi.asic->asic_id = VOYAGER_CAT_ID;
1063 psi.asic->subaddr = VOYAGER_SUBADDR_HI; 1080 psi.asic->subaddr = VOYAGER_SUBADDR_HI;
@@ -1072,43 +1089,45 @@ voyager_cat_do_common_interrupt(void)
1072 cat_disconnect(&psi, &psi_asic); 1089 cat_disconnect(&psi, &psi_asic);
1073 /* Read the status. NOTE: Need to read *all* the PSI regs here 1090 /* Read the status. NOTE: Need to read *all* the PSI regs here
1074 * otherwise the cmn int will be reasserted */ 1091 * otherwise the cmn int will be reasserted */
1075 for(i = 0; i < sizeof(psi_reg.regs); i++) { 1092 for (i = 0; i < sizeof(psi_reg.regs); i++) {
1076 cat_read(&psi, &psi_asic, i, &((__u8 *)&psi_reg.regs)[i]); 1093 cat_read(&psi, &psi_asic, i, &((__u8 *) & psi_reg.regs)[i]);
1077 } 1094 }
1078 outb(VOYAGER_CAT_END, CAT_CMD); 1095 outb(VOYAGER_CAT_END, CAT_CMD);
1079 if((psi_reg.regs.checkbit & 0x02) == 0) { 1096 if ((psi_reg.regs.checkbit & 0x02) == 0) {
1080 psi_reg.regs.checkbit |= 0x02; 1097 psi_reg.regs.checkbit |= 0x02;
1081 cat_write(&psi, &psi_asic, 5, psi_reg.regs.checkbit); 1098 cat_write(&psi, &psi_asic, 5, psi_reg.regs.checkbit);
1082 printk("VOYAGER RE-READ PSI\n"); 1099 printk("VOYAGER RE-READ PSI\n");
1083 goto re_read; 1100 goto re_read;
1084 } 1101 }
1085 outb(VOYAGER_CAT_RUN, CAT_CMD); 1102 outb(VOYAGER_CAT_RUN, CAT_CMD);
1086 for(i = 0; i < sizeof(psi_reg.subregs); i++) { 1103 for (i = 0; i < sizeof(psi_reg.subregs); i++) {
1087 /* This looks strange, but the PSI doesn't do auto increment 1104 /* This looks strange, but the PSI doesn't do auto increment
1088 * correctly */ 1105 * correctly */
1089 cat_subread(&psi, &psi_asic, VOYAGER_PSI_SUPPLY_REG + i, 1106 cat_subread(&psi, &psi_asic, VOYAGER_PSI_SUPPLY_REG + i,
1090 1, &((__u8 *)&psi_reg.subregs)[i]); 1107 1, &((__u8 *) & psi_reg.subregs)[i]);
1091 } 1108 }
1092 outb(VOYAGER_CAT_END, CAT_CMD); 1109 outb(VOYAGER_CAT_END, CAT_CMD);
1093#ifdef VOYAGER_CAT_DEBUG 1110#ifdef VOYAGER_CAT_DEBUG
1094 printk("VOYAGER PSI: "); 1111 printk("VOYAGER PSI: ");
1095 for(i=0; i<sizeof(psi_reg.regs); i++) 1112 for (i = 0; i < sizeof(psi_reg.regs); i++)
1096 printk("%02x ", ((__u8 *)&psi_reg.regs)[i]); 1113 printk("%02x ", ((__u8 *) & psi_reg.regs)[i]);
1097 printk("\n "); 1114 printk("\n ");
1098 for(i=0; i<sizeof(psi_reg.subregs); i++) 1115 for (i = 0; i < sizeof(psi_reg.subregs); i++)
1099 printk("%02x ", ((__u8 *)&psi_reg.subregs)[i]); 1116 printk("%02x ", ((__u8 *) & psi_reg.subregs)[i]);
1100 printk("\n"); 1117 printk("\n");
1101#endif 1118#endif
1102 if(psi_reg.regs.intstatus & PSI_MON) { 1119 if (psi_reg.regs.intstatus & PSI_MON) {
1103 /* switch off or power fail */ 1120 /* switch off or power fail */
1104 1121
1105 if(psi_reg.subregs.supply & PSI_SWITCH_OFF) { 1122 if (psi_reg.subregs.supply & PSI_SWITCH_OFF) {
1106 if(voyager_status.switch_off) { 1123 if (voyager_status.switch_off) {
1107 printk(KERN_ERR "Voyager front panel switch turned off again---Immediate power off!\n"); 1124 printk(KERN_ERR
1125 "Voyager front panel switch turned off again---Immediate power off!\n");
1108 voyager_cat_power_off(); 1126 voyager_cat_power_off();
1109 /* not reached */ 1127 /* not reached */
1110 } else { 1128 } else {
1111 printk(KERN_ERR "Voyager front panel switch turned off\n"); 1129 printk(KERN_ERR
1130 "Voyager front panel switch turned off\n");
1112 voyager_status.switch_off = 1; 1131 voyager_status.switch_off = 1;
1113 voyager_status.request_from_kernel = 1; 1132 voyager_status.request_from_kernel = 1;
1114 wake_up_process(voyager_thread); 1133 wake_up_process(voyager_thread);
@@ -1127,7 +1146,7 @@ voyager_cat_do_common_interrupt(void)
1127 1146
1128 VDEBUG(("Voyager ac fail reg 0x%x\n", 1147 VDEBUG(("Voyager ac fail reg 0x%x\n",
1129 psi_reg.subregs.ACfail)); 1148 psi_reg.subregs.ACfail));
1130 if((psi_reg.subregs.ACfail & AC_FAIL_STAT_CHANGE) == 0) { 1149 if ((psi_reg.subregs.ACfail & AC_FAIL_STAT_CHANGE) == 0) {
1131 /* No further update */ 1150 /* No further update */
1132 return; 1151 return;
1133 } 1152 }
@@ -1135,20 +1154,20 @@ voyager_cat_do_common_interrupt(void)
1135 /* Don't bother trying to find out who failed. 1154 /* Don't bother trying to find out who failed.
1136 * FIXME: This probably makes the code incorrect on 1155 * FIXME: This probably makes the code incorrect on
1137 * anything other than a 345x */ 1156 * anything other than a 345x */
1138 for(i=0; i< 5; i++) { 1157 for (i = 0; i < 5; i++) {
1139 if( psi_reg.subregs.ACfail &(1<<i)) { 1158 if (psi_reg.subregs.ACfail & (1 << i)) {
1140 break; 1159 break;
1141 } 1160 }
1142 } 1161 }
1143 printk(KERN_NOTICE "AC FAIL IN SUPPLY %d\n", i); 1162 printk(KERN_NOTICE "AC FAIL IN SUPPLY %d\n", i);
1144#endif 1163#endif
1145 /* DON'T do this: it shuts down the AC PSI 1164 /* DON'T do this: it shuts down the AC PSI
1146 outb(VOYAGER_CAT_RUN, CAT_CMD); 1165 outb(VOYAGER_CAT_RUN, CAT_CMD);
1147 data = PSI_MASK_MASK | i; 1166 data = PSI_MASK_MASK | i;
1148 cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_MASK, 1167 cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_MASK,
1149 1, &data); 1168 1, &data);
1150 outb(VOYAGER_CAT_END, CAT_CMD); 1169 outb(VOYAGER_CAT_END, CAT_CMD);
1151 */ 1170 */
1152 printk(KERN_ERR "Voyager AC power failure\n"); 1171 printk(KERN_ERR "Voyager AC power failure\n");
1153 outb(VOYAGER_CAT_RUN, CAT_CMD); 1172 outb(VOYAGER_CAT_RUN, CAT_CMD);
1154 data = PSI_COLD_START; 1173 data = PSI_COLD_START;
@@ -1159,16 +1178,16 @@ voyager_cat_do_common_interrupt(void)
1159 voyager_status.request_from_kernel = 1; 1178 voyager_status.request_from_kernel = 1;
1160 wake_up_process(voyager_thread); 1179 wake_up_process(voyager_thread);
1161 } 1180 }
1162 1181
1163 1182 } else if (psi_reg.regs.intstatus & PSI_FAULT) {
1164 } else if(psi_reg.regs.intstatus & PSI_FAULT) {
1165 /* Major fault! */ 1183 /* Major fault! */
1166 printk(KERN_ERR "Voyager PSI Detected major fault, immediate power off!\n"); 1184 printk(KERN_ERR
1185 "Voyager PSI Detected major fault, immediate power off!\n");
1167 voyager_cat_power_off(); 1186 voyager_cat_power_off();
1168 /* not reached */ 1187 /* not reached */
1169 } else if(psi_reg.regs.intstatus & (PSI_DC_FAIL | PSI_ALARM 1188 } else if (psi_reg.regs.intstatus & (PSI_DC_FAIL | PSI_ALARM
1170 | PSI_CURRENT | PSI_DVM 1189 | PSI_CURRENT | PSI_DVM
1171 | PSI_PSCFAULT | PSI_STAT_CHG)) { 1190 | PSI_PSCFAULT | PSI_STAT_CHG)) {
1172 /* other psi fault */ 1191 /* other psi fault */
1173 1192
1174 printk(KERN_WARNING "Voyager PSI status 0x%x\n", data); 1193 printk(KERN_WARNING "Voyager PSI status 0x%x\n", data);
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index 88124dd35406..dffa786f61fe 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -32,7 +32,8 @@
32DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { &init_mm, 0 }; 32DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { &init_mm, 0 };
33 33
34/* CPU IRQ affinity -- set to all ones initially */ 34/* CPU IRQ affinity -- set to all ones initially */
35static unsigned long cpu_irq_affinity[NR_CPUS] __cacheline_aligned = { [0 ... NR_CPUS-1] = ~0UL }; 35static unsigned long cpu_irq_affinity[NR_CPUS] __cacheline_aligned =
36 {[0 ... NR_CPUS-1] = ~0UL };
36 37
37/* per CPU data structure (for /proc/cpuinfo et al), visible externally 38/* per CPU data structure (for /proc/cpuinfo et al), visible externally
38 * indexed physically */ 39 * indexed physically */
@@ -76,7 +77,6 @@ EXPORT_SYMBOL(cpu_online_map);
76 * by scheduler but indexed physically */ 77 * by scheduler but indexed physically */
77cpumask_t phys_cpu_present_map = CPU_MASK_NONE; 78cpumask_t phys_cpu_present_map = CPU_MASK_NONE;
78 79
79
80/* The internal functions */ 80/* The internal functions */
81static void send_CPI(__u32 cpuset, __u8 cpi); 81static void send_CPI(__u32 cpuset, __u8 cpi);
82static void ack_CPI(__u8 cpi); 82static void ack_CPI(__u8 cpi);
@@ -101,94 +101,86 @@ int hard_smp_processor_id(void);
101int safe_smp_processor_id(void); 101int safe_smp_processor_id(void);
102 102
103/* Inline functions */ 103/* Inline functions */
104static inline void 104static inline void send_one_QIC_CPI(__u8 cpu, __u8 cpi)
105send_one_QIC_CPI(__u8 cpu, __u8 cpi)
106{ 105{
107 voyager_quad_cpi_addr[cpu]->qic_cpi[cpi].cpi = 106 voyager_quad_cpi_addr[cpu]->qic_cpi[cpi].cpi =
108 (smp_processor_id() << 16) + cpi; 107 (smp_processor_id() << 16) + cpi;
109} 108}
110 109
111static inline void 110static inline void send_QIC_CPI(__u32 cpuset, __u8 cpi)
112send_QIC_CPI(__u32 cpuset, __u8 cpi)
113{ 111{
114 int cpu; 112 int cpu;
115 113
116 for_each_online_cpu(cpu) { 114 for_each_online_cpu(cpu) {
117 if(cpuset & (1<<cpu)) { 115 if (cpuset & (1 << cpu)) {
118#ifdef VOYAGER_DEBUG 116#ifdef VOYAGER_DEBUG
119 if(!cpu_isset(cpu, cpu_online_map)) 117 if (!cpu_isset(cpu, cpu_online_map))
120 VDEBUG(("CPU%d sending cpi %d to CPU%d not in cpu_online_map\n", hard_smp_processor_id(), cpi, cpu)); 118 VDEBUG(("CPU%d sending cpi %d to CPU%d not in "
119 "cpu_online_map\n",
120 hard_smp_processor_id(), cpi, cpu));
121#endif 121#endif
122 send_one_QIC_CPI(cpu, cpi - QIC_CPI_OFFSET); 122 send_one_QIC_CPI(cpu, cpi - QIC_CPI_OFFSET);
123 } 123 }
124 } 124 }
125} 125}
126 126
127static inline void 127static inline void wrapper_smp_local_timer_interrupt(void)
128wrapper_smp_local_timer_interrupt(void)
129{ 128{
130 irq_enter(); 129 irq_enter();
131 smp_local_timer_interrupt(); 130 smp_local_timer_interrupt();
132 irq_exit(); 131 irq_exit();
133} 132}
134 133
135static inline void 134static inline void send_one_CPI(__u8 cpu, __u8 cpi)
136send_one_CPI(__u8 cpu, __u8 cpi)
137{ 135{
138 if(voyager_quad_processors & (1<<cpu)) 136 if (voyager_quad_processors & (1 << cpu))
139 send_one_QIC_CPI(cpu, cpi - QIC_CPI_OFFSET); 137 send_one_QIC_CPI(cpu, cpi - QIC_CPI_OFFSET);
140 else 138 else
141 send_CPI(1<<cpu, cpi); 139 send_CPI(1 << cpu, cpi);
142} 140}
143 141
144static inline void 142static inline void send_CPI_allbutself(__u8 cpi)
145send_CPI_allbutself(__u8 cpi)
146{ 143{
147 __u8 cpu = smp_processor_id(); 144 __u8 cpu = smp_processor_id();
148 __u32 mask = cpus_addr(cpu_online_map)[0] & ~(1 << cpu); 145 __u32 mask = cpus_addr(cpu_online_map)[0] & ~(1 << cpu);
149 send_CPI(mask, cpi); 146 send_CPI(mask, cpi);
150} 147}
151 148
152static inline int 149static inline int is_cpu_quad(void)
153is_cpu_quad(void)
154{ 150{
155 __u8 cpumask = inb(VIC_PROC_WHO_AM_I); 151 __u8 cpumask = inb(VIC_PROC_WHO_AM_I);
156 return ((cpumask & QUAD_IDENTIFIER) == QUAD_IDENTIFIER); 152 return ((cpumask & QUAD_IDENTIFIER) == QUAD_IDENTIFIER);
157} 153}
158 154
159static inline int 155static inline int is_cpu_extended(void)
160is_cpu_extended(void)
161{ 156{
162 __u8 cpu = hard_smp_processor_id(); 157 __u8 cpu = hard_smp_processor_id();
163 158
164 return(voyager_extended_vic_processors & (1<<cpu)); 159 return (voyager_extended_vic_processors & (1 << cpu));
165} 160}
166 161
167static inline int 162static inline int is_cpu_vic_boot(void)
168is_cpu_vic_boot(void)
169{ 163{
170 __u8 cpu = hard_smp_processor_id(); 164 __u8 cpu = hard_smp_processor_id();
171 165
172 return(voyager_extended_vic_processors 166 return (voyager_extended_vic_processors
173 & voyager_allowed_boot_processors & (1<<cpu)); 167 & voyager_allowed_boot_processors & (1 << cpu));
174} 168}
175 169
-
-static inline void
-ack_CPI(__u8 cpi)
+static inline void ack_CPI(__u8 cpi)
 {
180 switch(cpi) { 172 switch (cpi) {
181 case VIC_CPU_BOOT_CPI: 173 case VIC_CPU_BOOT_CPI:
182 if(is_cpu_quad() && !is_cpu_vic_boot()) 174 if (is_cpu_quad() && !is_cpu_vic_boot())
183 ack_QIC_CPI(cpi); 175 ack_QIC_CPI(cpi);
184 else 176 else
185 ack_VIC_CPI(cpi); 177 ack_VIC_CPI(cpi);
186 break; 178 break;
187 case VIC_SYS_INT: 179 case VIC_SYS_INT:
188 case VIC_CMN_INT: 180 case VIC_CMN_INT:
189 /* These are slightly strange. Even on the Quad card, 181 /* These are slightly strange. Even on the Quad card,
190 * They are vectored as VIC CPIs */ 182 * They are vectored as VIC CPIs */
191 if(is_cpu_quad()) 183 if (is_cpu_quad())
192 ack_special_QIC_CPI(cpi); 184 ack_special_QIC_CPI(cpi);
193 else 185 else
194 ack_VIC_CPI(cpi); 186 ack_VIC_CPI(cpi);
@@ -205,11 +197,11 @@ ack_CPI(__u8 cpi)
205 * 8259 IRQs except that masks and things must be kept per processor 197 * 8259 IRQs except that masks and things must be kept per processor
206 */ 198 */
207static struct irq_chip vic_chip = { 199static struct irq_chip vic_chip = {
208 .name = "VIC", 200 .name = "VIC",
209 .startup = startup_vic_irq, 201 .startup = startup_vic_irq,
210 .mask = mask_vic_irq, 202 .mask = mask_vic_irq,
211 .unmask = unmask_vic_irq, 203 .unmask = unmask_vic_irq,
212 .set_affinity = set_vic_irq_affinity, 204 .set_affinity = set_vic_irq_affinity,
213}; 205};
214 206
215/* used to count up as CPUs are brought on line (starts at 0) */ 207/* used to count up as CPUs are brought on line (starts at 0) */
@@ -223,7 +215,7 @@ static __u32 trampoline_base;
223/* The per cpu profile stuff - used in smp_local_timer_interrupt */ 215/* The per cpu profile stuff - used in smp_local_timer_interrupt */
224static DEFINE_PER_CPU(int, prof_multiplier) = 1; 216static DEFINE_PER_CPU(int, prof_multiplier) = 1;
225static DEFINE_PER_CPU(int, prof_old_multiplier) = 1; 217static DEFINE_PER_CPU(int, prof_old_multiplier) = 1;
226static DEFINE_PER_CPU(int, prof_counter) = 1; 218static DEFINE_PER_CPU(int, prof_counter) = 1;
227 219
228/* the map used to check if a CPU has booted */ 220/* the map used to check if a CPU has booted */
229static __u32 cpu_booted_map; 221static __u32 cpu_booted_map;
@@ -235,7 +227,6 @@ static cpumask_t smp_commenced_mask = CPU_MASK_NONE;
235/* This is for the new dynamic CPU boot code */ 227/* This is for the new dynamic CPU boot code */
236cpumask_t cpu_callin_map = CPU_MASK_NONE; 228cpumask_t cpu_callin_map = CPU_MASK_NONE;
237cpumask_t cpu_callout_map = CPU_MASK_NONE; 229cpumask_t cpu_callout_map = CPU_MASK_NONE;
-EXPORT_SYMBOL(cpu_callout_map);
239cpumask_t cpu_possible_map = CPU_MASK_NONE; 230cpumask_t cpu_possible_map = CPU_MASK_NONE;
240EXPORT_SYMBOL(cpu_possible_map); 231EXPORT_SYMBOL(cpu_possible_map);
241 232
@@ -246,9 +237,9 @@ static __u16 vic_irq_mask[NR_CPUS] __cacheline_aligned;
246static __u16 vic_irq_enable_mask[NR_CPUS] __cacheline_aligned = { 0 }; 237static __u16 vic_irq_enable_mask[NR_CPUS] __cacheline_aligned = { 0 };
247 238
248/* Lock for enable/disable of VIC interrupts */ 239/* Lock for enable/disable of VIC interrupts */
249static __cacheline_aligned DEFINE_SPINLOCK(vic_irq_lock); 240static __cacheline_aligned DEFINE_SPINLOCK(vic_irq_lock);
250 241
251/* The boot processor is correctly set up in PC mode when it 242/* The boot processor is correctly set up in PC mode when it
252 * comes up, but the secondaries need their master/slave 8259 243 * comes up, but the secondaries need their master/slave 8259
253 * pairs initializing correctly */ 244 * pairs initializing correctly */
254 245
@@ -262,8 +253,7 @@ static unsigned long vic_tick[NR_CPUS] __cacheline_aligned = { 0 };
262static unsigned long vic_cpi_mailbox[NR_CPUS] __cacheline_aligned; 253static unsigned long vic_cpi_mailbox[NR_CPUS] __cacheline_aligned;
263 254
264/* debugging routine to read the isr of the cpu's pic */ 255/* debugging routine to read the isr of the cpu's pic */
265static inline __u16 256static inline __u16 vic_read_isr(void)
266vic_read_isr(void)
267{ 257{
268 __u16 isr; 258 __u16 isr;
269 259
@@ -275,17 +265,16 @@ vic_read_isr(void)
275 return isr; 265 return isr;
276} 266}
277 267
278static __init void 268static __init void qic_setup(void)
279qic_setup(void)
280{ 269{
281 if(!is_cpu_quad()) { 270 if (!is_cpu_quad()) {
282 /* not a quad, no setup */ 271 /* not a quad, no setup */
283 return; 272 return;
284 } 273 }
285 outb(QIC_DEFAULT_MASK0, QIC_MASK_REGISTER0); 274 outb(QIC_DEFAULT_MASK0, QIC_MASK_REGISTER0);
286 outb(QIC_CPI_ENABLE, QIC_MASK_REGISTER1); 275 outb(QIC_CPI_ENABLE, QIC_MASK_REGISTER1);
287 276
288 if(is_cpu_extended()) { 277 if (is_cpu_extended()) {
289 /* the QIC duplicate of the VIC base register */ 278 /* the QIC duplicate of the VIC base register */
290 outb(VIC_DEFAULT_CPI_BASE, QIC_VIC_CPI_BASE_REGISTER); 279 outb(VIC_DEFAULT_CPI_BASE, QIC_VIC_CPI_BASE_REGISTER);
291 outb(QIC_DEFAULT_CPI_BASE, QIC_CPI_BASE_REGISTER); 280 outb(QIC_DEFAULT_CPI_BASE, QIC_CPI_BASE_REGISTER);
@@ -295,8 +284,7 @@ qic_setup(void)
295 } 284 }
296} 285}
297 286
298static __init void 287static __init void vic_setup_pic(void)
299vic_setup_pic(void)
300{ 288{
301 outb(1, VIC_REDIRECT_REGISTER_1); 289 outb(1, VIC_REDIRECT_REGISTER_1);
302 /* clear the claim registers for dynamic routing */ 290 /* clear the claim registers for dynamic routing */
@@ -333,7 +321,7 @@ vic_setup_pic(void)
333 321
334 /* ICW2: slave vector base */ 322 /* ICW2: slave vector base */
335 outb(FIRST_EXTERNAL_VECTOR + 8, 0xA1); 323 outb(FIRST_EXTERNAL_VECTOR + 8, 0xA1);
336 324
337 /* ICW3: slave ID */ 325 /* ICW3: slave ID */
338 outb(0x02, 0xA1); 326 outb(0x02, 0xA1);
339 327
@@ -341,19 +329,18 @@ vic_setup_pic(void)
341 outb(0x01, 0xA1); 329 outb(0x01, 0xA1);
342} 330}
343 331
344static void 332static void do_quad_bootstrap(void)
345do_quad_bootstrap(void)
346{ 333{
347 if(is_cpu_quad() && is_cpu_vic_boot()) { 334 if (is_cpu_quad() && is_cpu_vic_boot()) {
348 int i; 335 int i;
349 unsigned long flags; 336 unsigned long flags;
350 __u8 cpuid = hard_smp_processor_id(); 337 __u8 cpuid = hard_smp_processor_id();
351 338
352 local_irq_save(flags); 339 local_irq_save(flags);
353 340
354 for(i = 0; i<4; i++) { 341 for (i = 0; i < 4; i++) {
355 /* FIXME: this would be >>3 &0x7 on the 32 way */ 342 /* FIXME: this would be >>3 &0x7 on the 32 way */
356 if(((cpuid >> 2) & 0x03) == i) 343 if (((cpuid >> 2) & 0x03) == i)
357 /* don't lower our own mask! */ 344 /* don't lower our own mask! */
358 continue; 345 continue;
359 346
@@ -368,12 +355,10 @@ do_quad_bootstrap(void)
368 } 355 }
369} 356}
370 357
-
 /* Set up all the basic stuff: read the SMP config and make all the
  * SMP information reflect only the boot cpu. All others will be
  * brought on-line later. */
-void __init
-find_smp_config(void)
+void __init find_smp_config(void)
 {
378 int i; 363 int i;
379 364
@@ -382,24 +367,31 @@ find_smp_config(void)
382 printk("VOYAGER SMP: Boot cpu is %d\n", boot_cpu_id); 367 printk("VOYAGER SMP: Boot cpu is %d\n", boot_cpu_id);
383 368
384 /* initialize the CPU structures (moved from smp_boot_cpus) */ 369 /* initialize the CPU structures (moved from smp_boot_cpus) */
385 for(i=0; i<NR_CPUS; i++) { 370 for (i = 0; i < NR_CPUS; i++) {
386 cpu_irq_affinity[i] = ~0; 371 cpu_irq_affinity[i] = ~0;
387 } 372 }
388 cpu_online_map = cpumask_of_cpu(boot_cpu_id); 373 cpu_online_map = cpumask_of_cpu(boot_cpu_id);
389 374
390 /* The boot CPU must be extended */ 375 /* The boot CPU must be extended */
391 voyager_extended_vic_processors = 1<<boot_cpu_id; 376 voyager_extended_vic_processors = 1 << boot_cpu_id;
392 /* initially, all of the first 8 CPUs can boot */ 377 /* initially, all of the first 8 CPUs can boot */
393 voyager_allowed_boot_processors = 0xff; 378 voyager_allowed_boot_processors = 0xff;
394 /* set up everything for just this CPU, we can alter 379 /* set up everything for just this CPU, we can alter
395 * this as we start the other CPUs later */ 380 * this as we start the other CPUs later */
396 /* now get the CPU disposition from the extended CMOS */ 381 /* now get the CPU disposition from the extended CMOS */
-	cpus_addr(phys_cpu_present_map)[0] = voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK);
-	cpus_addr(phys_cpu_present_map)[0] |= voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + 1) << 8;
-	cpus_addr(phys_cpu_present_map)[0] |= voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + 2) << 16;
-	cpus_addr(phys_cpu_present_map)[0] |= voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + 3) << 24;
+	cpus_addr(phys_cpu_present_map)[0] =
+	    voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK);
+	cpus_addr(phys_cpu_present_map)[0] |=
+	    voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + 1) << 8;
+	cpus_addr(phys_cpu_present_map)[0] |=
+	    voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK +
+				       2) << 16;
+	cpus_addr(phys_cpu_present_map)[0] |=
+	    voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK +
+				       3) << 24;
 	cpu_possible_map = phys_cpu_present_map;
-	printk("VOYAGER SMP: phys_cpu_present_map = 0x%lx\n", cpus_addr(phys_cpu_present_map)[0]);
+	printk("VOYAGER SMP: phys_cpu_present_map = 0x%lx\n",
+	       cpus_addr(phys_cpu_present_map)[0]);
403 /* Here we set up the VIC to enable SMP */ 395 /* Here we set up the VIC to enable SMP */
404 /* enable the CPIs by writing the base vector to their register */ 396 /* enable the CPIs by writing the base vector to their register */
405 outb(VIC_DEFAULT_CPI_BASE, VIC_CPI_BASE_REGISTER); 397 outb(VIC_DEFAULT_CPI_BASE, VIC_CPI_BASE_REGISTER);
@@ -427,8 +419,7 @@ find_smp_config(void)
427/* 419/*
428 * The bootstrap kernel entry code has set these up. Save them 420 * The bootstrap kernel entry code has set these up. Save them
429 * for a given CPU, id is physical */ 421 * for a given CPU, id is physical */
430void __init 422void __init smp_store_cpu_info(int id)
431smp_store_cpu_info(int id)
432{ 423{
433 struct cpuinfo_x86 *c = &cpu_data(id); 424 struct cpuinfo_x86 *c = &cpu_data(id);
434 425
@@ -438,21 +429,19 @@ smp_store_cpu_info(int id)
438} 429}
439 430
440/* set up the trampoline and return the physical address of the code */ 431/* set up the trampoline and return the physical address of the code */
441static __u32 __init 432static __u32 __init setup_trampoline(void)
442setup_trampoline(void)
443{ 433{
444 /* these two are global symbols in trampoline.S */ 434 /* these two are global symbols in trampoline.S */
445 extern const __u8 trampoline_end[]; 435 extern const __u8 trampoline_end[];
446 extern const __u8 trampoline_data[]; 436 extern const __u8 trampoline_data[];
447 437
448 memcpy((__u8 *)trampoline_base, trampoline_data, 438 memcpy((__u8 *) trampoline_base, trampoline_data,
449 trampoline_end - trampoline_data); 439 trampoline_end - trampoline_data);
450 return virt_to_phys((__u8 *)trampoline_base); 440 return virt_to_phys((__u8 *) trampoline_base);
451} 441}
452 442
453/* Routine initially called when a non-boot CPU is brought online */ 443/* Routine initially called when a non-boot CPU is brought online */
454static void __init 444static void __init start_secondary(void *unused)
455start_secondary(void *unused)
456{ 445{
457 __u8 cpuid = hard_smp_processor_id(); 446 __u8 cpuid = hard_smp_processor_id();
458 /* external functions not defined in the headers */ 447 /* external functions not defined in the headers */
@@ -464,17 +453,18 @@ start_secondary(void *unused)
464 ack_CPI(VIC_CPU_BOOT_CPI); 453 ack_CPI(VIC_CPU_BOOT_CPI);
465 454
466 /* setup the 8259 master slave pair belonging to this CPU --- 455 /* setup the 8259 master slave pair belonging to this CPU ---
467 * we won't actually receive any until the boot CPU 456 * we won't actually receive any until the boot CPU
468 * relinquishes it's static routing mask */ 457 * relinquishes it's static routing mask */
469 vic_setup_pic(); 458 vic_setup_pic();
470 459
471 qic_setup(); 460 qic_setup();
472 461
473 if(is_cpu_quad() && !is_cpu_vic_boot()) { 462 if (is_cpu_quad() && !is_cpu_vic_boot()) {
474 /* clear the boot CPI */ 463 /* clear the boot CPI */
475 __u8 dummy; 464 __u8 dummy;
476 465
-		dummy = voyager_quad_cpi_addr[cpuid]->qic_cpi[VIC_CPU_BOOT_CPI].cpi;
+		dummy =
+		    voyager_quad_cpi_addr[cpuid]->qic_cpi[VIC_CPU_BOOT_CPI].cpi;
478 printk("read dummy %d\n", dummy); 468 printk("read dummy %d\n", dummy);
479 } 469 }
480 470
@@ -516,7 +506,6 @@ start_secondary(void *unused)
516 cpu_idle(); 506 cpu_idle();
517} 507}
518 508
519
520/* Routine to kick start the given CPU and wait for it to report ready 509/* Routine to kick start the given CPU and wait for it to report ready
521 * (or timeout in startup). When this routine returns, the requested 510 * (or timeout in startup). When this routine returns, the requested
522 * CPU is either fully running and configured or known to be dead. 511 * CPU is either fully running and configured or known to be dead.
@@ -524,29 +513,28 @@ start_secondary(void *unused)
524 * We call this routine sequentially 1 CPU at a time, so no need for 513 * We call this routine sequentially 1 CPU at a time, so no need for
525 * locking */ 514 * locking */
526 515
527static void __init 516static void __init do_boot_cpu(__u8 cpu)
528do_boot_cpu(__u8 cpu)
529{ 517{
530 struct task_struct *idle; 518 struct task_struct *idle;
531 int timeout; 519 int timeout;
532 unsigned long flags; 520 unsigned long flags;
533 int quad_boot = (1<<cpu) & voyager_quad_processors 521 int quad_boot = (1 << cpu) & voyager_quad_processors
534 & ~( voyager_extended_vic_processors 522 & ~(voyager_extended_vic_processors
535 & voyager_allowed_boot_processors); 523 & voyager_allowed_boot_processors);
536 524
537 /* This is an area in head.S which was used to set up the 525 /* This is an area in head.S which was used to set up the
538 * initial kernel stack. We need to alter this to give the 526 * initial kernel stack. We need to alter this to give the
539 * booting CPU a new stack (taken from its idle process) */ 527 * booting CPU a new stack (taken from its idle process) */
540 extern struct { 528 extern struct {
541 __u8 *esp; 529 __u8 *sp;
542 unsigned short ss; 530 unsigned short ss;
543 } stack_start; 531 } stack_start;
544 /* This is the format of the CPI IDT gate (in real mode) which 532 /* This is the format of the CPI IDT gate (in real mode) which
545 * we're hijacking to boot the CPU */ 533 * we're hijacking to boot the CPU */
546 union IDTFormat { 534 union IDTFormat {
547 struct seg { 535 struct seg {
548 __u16 Offset; 536 __u16 Offset;
549 __u16 Segment; 537 __u16 Segment;
550 } idt; 538 } idt;
551 __u32 val; 539 __u32 val;
552 } hijack_source; 540 } hijack_source;
@@ -565,37 +553,44 @@ do_boot_cpu(__u8 cpu)
565 alternatives_smp_switch(1); 553 alternatives_smp_switch(1);
566 554
567 idle = fork_idle(cpu); 555 idle = fork_idle(cpu);
568 if(IS_ERR(idle)) 556 if (IS_ERR(idle))
569 panic("failed fork for CPU%d", cpu); 557 panic("failed fork for CPU%d", cpu);
570 idle->thread.eip = (unsigned long) start_secondary; 558 idle->thread.ip = (unsigned long)start_secondary;
571 /* init_tasks (in sched.c) is indexed logically */ 559 /* init_tasks (in sched.c) is indexed logically */
572 stack_start.esp = (void *) idle->thread.esp; 560 stack_start.sp = (void *)idle->thread.sp;
573 561
574 init_gdt(cpu); 562 init_gdt(cpu);
575 per_cpu(current_task, cpu) = idle; 563 per_cpu(current_task, cpu) = idle;
576 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); 564 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
577 irq_ctx_init(cpu); 565 irq_ctx_init(cpu);
578 566
579 /* Note: Don't modify initial ss override */ 567 /* Note: Don't modify initial ss override */
580 VDEBUG(("VOYAGER SMP: Booting CPU%d at 0x%lx[%x:%x], stack %p\n", cpu, 568 VDEBUG(("VOYAGER SMP: Booting CPU%d at 0x%lx[%x:%x], stack %p\n", cpu,
581 (unsigned long)hijack_source.val, hijack_source.idt.Segment, 569 (unsigned long)hijack_source.val, hijack_source.idt.Segment,
582 hijack_source.idt.Offset, stack_start.esp)); 570 hijack_source.idt.Offset, stack_start.sp));
583 571
584 /* init lowmem identity mapping */ 572 /* init lowmem identity mapping */
585 clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, 573 clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
586 min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS)); 574 min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS));
587 flush_tlb_all(); 575 flush_tlb_all();
588 576
589 if(quad_boot) { 577 if (quad_boot) {
590 printk("CPU %d: non extended Quad boot\n", cpu); 578 printk("CPU %d: non extended Quad boot\n", cpu);
-		hijack_vector = (__u32 *)phys_to_virt((VIC_CPU_BOOT_CPI + QIC_DEFAULT_CPI_BASE)*4);
+		hijack_vector =
+		    (__u32 *)
+		    phys_to_virt((VIC_CPU_BOOT_CPI + QIC_DEFAULT_CPI_BASE) * 4);
 		*hijack_vector = hijack_source.val;
 	} else {
 		printk("CPU%d: extended VIC boot\n", cpu);
-		hijack_vector = (__u32 *)phys_to_virt((VIC_CPU_BOOT_CPI + VIC_DEFAULT_CPI_BASE)*4);
+		hijack_vector =
+		    (__u32 *)
+		    phys_to_virt((VIC_CPU_BOOT_CPI + VIC_DEFAULT_CPI_BASE) * 4);
 		*hijack_vector = hijack_source.val;
 		/* VIC errata, may also receive interrupt at this address */
-		hijack_vector = (__u32 *)phys_to_virt((VIC_CPU_BOOT_ERRATA_CPI + VIC_DEFAULT_CPI_BASE)*4);
+		hijack_vector =
+		    (__u32 *)
+		    phys_to_virt((VIC_CPU_BOOT_ERRATA_CPI +
+				  VIC_DEFAULT_CPI_BASE) * 4);
599 *hijack_vector = hijack_source.val; 594 *hijack_vector = hijack_source.val;
600 } 595 }
601 /* All non-boot CPUs start with interrupts fully masked. Need 596 /* All non-boot CPUs start with interrupts fully masked. Need
@@ -603,73 +598,76 @@ do_boot_cpu(__u8 cpu)
603 * this in the VIC by masquerading as the processor we're 598 * this in the VIC by masquerading as the processor we're
604 * about to boot and lowering its interrupt mask */ 599 * about to boot and lowering its interrupt mask */
605 local_irq_save(flags); 600 local_irq_save(flags);
606 if(quad_boot) { 601 if (quad_boot) {
607 send_one_QIC_CPI(cpu, VIC_CPU_BOOT_CPI); 602 send_one_QIC_CPI(cpu, VIC_CPU_BOOT_CPI);
608 } else { 603 } else {
609 outb(VIC_CPU_MASQUERADE_ENABLE | cpu, VIC_PROCESSOR_ID); 604 outb(VIC_CPU_MASQUERADE_ENABLE | cpu, VIC_PROCESSOR_ID);
610 /* here we're altering registers belonging to `cpu' */ 605 /* here we're altering registers belonging to `cpu' */
611 606
612 outb(VIC_BOOT_INTERRUPT_MASK, 0x21); 607 outb(VIC_BOOT_INTERRUPT_MASK, 0x21);
613 /* now go back to our original identity */ 608 /* now go back to our original identity */
614 outb(boot_cpu_id, VIC_PROCESSOR_ID); 609 outb(boot_cpu_id, VIC_PROCESSOR_ID);
615 610
616 /* and boot the CPU */ 611 /* and boot the CPU */
617 612
618 send_CPI((1<<cpu), VIC_CPU_BOOT_CPI); 613 send_CPI((1 << cpu), VIC_CPU_BOOT_CPI);
619 } 614 }
620 cpu_booted_map = 0; 615 cpu_booted_map = 0;
621 local_irq_restore(flags); 616 local_irq_restore(flags);
622 617
623 /* now wait for it to become ready (or timeout) */ 618 /* now wait for it to become ready (or timeout) */
624 for(timeout = 0; timeout < 50000; timeout++) { 619 for (timeout = 0; timeout < 50000; timeout++) {
625 if(cpu_booted_map) 620 if (cpu_booted_map)
626 break; 621 break;
627 udelay(100); 622 udelay(100);
628 } 623 }
629 /* reset the page table */ 624 /* reset the page table */
630 zap_low_mappings(); 625 zap_low_mappings();
631 626
632 if (cpu_booted_map) { 627 if (cpu_booted_map) {
633 VDEBUG(("CPU%d: Booted successfully, back in CPU %d\n", 628 VDEBUG(("CPU%d: Booted successfully, back in CPU %d\n",
634 cpu, smp_processor_id())); 629 cpu, smp_processor_id()));
635 630
636 printk("CPU%d: ", cpu); 631 printk("CPU%d: ", cpu);
637 print_cpu_info(&cpu_data(cpu)); 632 print_cpu_info(&cpu_data(cpu));
638 wmb(); 633 wmb();
639 cpu_set(cpu, cpu_callout_map); 634 cpu_set(cpu, cpu_callout_map);
640 cpu_set(cpu, cpu_present_map); 635 cpu_set(cpu, cpu_present_map);
-	}
-	else {
+	} else {
 		printk("CPU%d FAILED TO BOOT: ", cpu);
-		if (*((volatile unsigned char *)phys_to_virt(start_phys_address))==0xA5)
+		if (*
+		    ((volatile unsigned char *)phys_to_virt(start_phys_address))
+		    == 0xA5)
645 printk("Stuck.\n"); 641 printk("Stuck.\n");
646 else 642 else
647 printk("Not responding.\n"); 643 printk("Not responding.\n");
648 644
649 cpucount--; 645 cpucount--;
650 } 646 }
651} 647}
652 648
653void __init 649void __init smp_boot_cpus(void)
654smp_boot_cpus(void)
655{ 650{
656 int i; 651 int i;
657 652
658 /* CAT BUS initialisation must be done after the memory */ 653 /* CAT BUS initialisation must be done after the memory */
659 /* FIXME: The L4 has a catbus too, it just needs to be 654 /* FIXME: The L4 has a catbus too, it just needs to be
660 * accessed in a totally different way */ 655 * accessed in a totally different way */
661 if(voyager_level == 5) { 656 if (voyager_level == 5) {
662 voyager_cat_init(); 657 voyager_cat_init();
663 658
664 /* now that the cat has probed the Voyager System Bus, sanity 659 /* now that the cat has probed the Voyager System Bus, sanity
665 * check the cpu map */ 660 * check the cpu map */
-		if( ((voyager_quad_processors | voyager_extended_vic_processors)
-		     & cpus_addr(phys_cpu_present_map)[0]) != cpus_addr(phys_cpu_present_map)[0]) {
+		if (((voyager_quad_processors | voyager_extended_vic_processors)
+		     & cpus_addr(phys_cpu_present_map)[0]) !=
+		    cpus_addr(phys_cpu_present_map)[0]) {
 			/* should panic */
-			printk("\n\n***WARNING*** Sanity check of CPU present map FAILED\n");
+			printk("\n\n***WARNING*** "
+			       "Sanity check of CPU present map FAILED\n");
 		}
-	} else if(voyager_level == 4)
-		voyager_extended_vic_processors = cpus_addr(phys_cpu_present_map)[0];
+	} else if (voyager_level == 4)
+		voyager_extended_vic_processors =
+		    cpus_addr(phys_cpu_present_map)[0];
673 671
674 /* this sets up the idle task to run on the current cpu */ 672 /* this sets up the idle task to run on the current cpu */
675 voyager_extended_cpus = 1; 673 voyager_extended_cpus = 1;
@@ -678,14 +676,14 @@ smp_boot_cpus(void)
678 //global_irq_holder = boot_cpu_id; 676 //global_irq_holder = boot_cpu_id;
679 677
680 /* FIXME: Need to do something about this but currently only works 678 /* FIXME: Need to do something about this but currently only works
681 * on CPUs with a tsc which none of mine have. 679 * on CPUs with a tsc which none of mine have.
682 smp_tune_scheduling(); 680 smp_tune_scheduling();
683 */ 681 */
684 smp_store_cpu_info(boot_cpu_id); 682 smp_store_cpu_info(boot_cpu_id);
685 printk("CPU%d: ", boot_cpu_id); 683 printk("CPU%d: ", boot_cpu_id);
686 print_cpu_info(&cpu_data(boot_cpu_id)); 684 print_cpu_info(&cpu_data(boot_cpu_id));
687 685
688 if(is_cpu_quad()) { 686 if (is_cpu_quad()) {
689 /* booting on a Quad CPU */ 687 /* booting on a Quad CPU */
690 printk("VOYAGER SMP: Boot CPU is Quad\n"); 688 printk("VOYAGER SMP: Boot CPU is Quad\n");
691 qic_setup(); 689 qic_setup();
@@ -697,11 +695,11 @@ smp_boot_cpus(void)
697 695
698 cpu_set(boot_cpu_id, cpu_online_map); 696 cpu_set(boot_cpu_id, cpu_online_map);
699 cpu_set(boot_cpu_id, cpu_callout_map); 697 cpu_set(boot_cpu_id, cpu_callout_map);
700 698
701 /* loop over all the extended VIC CPUs and boot them. The 699 /* loop over all the extended VIC CPUs and boot them. The
702 * Quad CPUs must be bootstrapped by their extended VIC cpu */ 700 * Quad CPUs must be bootstrapped by their extended VIC cpu */
703 for(i = 0; i < NR_CPUS; i++) { 701 for (i = 0; i < NR_CPUS; i++) {
704 if(i == boot_cpu_id || !cpu_isset(i, phys_cpu_present_map)) 702 if (i == boot_cpu_id || !cpu_isset(i, phys_cpu_present_map))
705 continue; 703 continue;
706 do_boot_cpu(i); 704 do_boot_cpu(i);
707 /* This udelay seems to be needed for the Quad boots 705 /* This udelay seems to be needed for the Quad boots
@@ -715,25 +713,26 @@ smp_boot_cpus(void)
715 for (i = 0; i < NR_CPUS; i++) 713 for (i = 0; i < NR_CPUS; i++)
716 if (cpu_isset(i, cpu_online_map)) 714 if (cpu_isset(i, cpu_online_map))
717 bogosum += cpu_data(i).loops_per_jiffy; 715 bogosum += cpu_data(i).loops_per_jiffy;
-		printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
-		       cpucount+1,
-		       bogosum/(500000/HZ),
-		       (bogosum/(5000/HZ))%100);
+		printk(KERN_INFO "Total of %d processors activated "
+		       "(%lu.%02lu BogoMIPS).\n",
+		       cpucount + 1, bogosum / (500000 / HZ),
+		       (bogosum / (5000 / HZ)) % 100);
 	}
 	voyager_extended_cpus = hweight32(voyager_extended_vic_processors);
-	printk("VOYAGER: Extended (interrupt handling CPUs): %d, non-extended: %d\n", voyager_extended_cpus, num_booting_cpus() - voyager_extended_cpus);
+	printk("VOYAGER: Extended (interrupt handling CPUs): "
+	       "%d, non-extended: %d\n", voyager_extended_cpus,
+	       num_booting_cpus() - voyager_extended_cpus);
725 /* that's it, switch to symmetric mode */ 725 /* that's it, switch to symmetric mode */
726 outb(0, VIC_PRIORITY_REGISTER); 726 outb(0, VIC_PRIORITY_REGISTER);
727 outb(0, VIC_CLAIM_REGISTER_0); 727 outb(0, VIC_CLAIM_REGISTER_0);
728 outb(0, VIC_CLAIM_REGISTER_1); 728 outb(0, VIC_CLAIM_REGISTER_1);
729 729
730 VDEBUG(("VOYAGER SMP: Booted with %d CPUs\n", num_booting_cpus())); 730 VDEBUG(("VOYAGER SMP: Booted with %d CPUs\n", num_booting_cpus()));
731} 731}
732 732
733/* Reload the secondary CPUs task structure (this function does not 733/* Reload the secondary CPUs task structure (this function does not
734 * return ) */ 734 * return ) */
735void __init 735void __init initialize_secondary(void)
736initialize_secondary(void)
737{ 736{
738#if 0 737#if 0
739 // AC kernels only 738 // AC kernels only
@@ -745,11 +744,9 @@ initialize_secondary(void)
745 * basically just the stack pointer and the eip. 744 * basically just the stack pointer and the eip.
746 */ 745 */
747 746
-	asm volatile(
-		"movl %0,%%esp\n\t"
-		"jmp *%1"
-		:
-		:"r" (current->thread.esp),"r" (current->thread.eip));
+	asm volatile ("movl %0,%%esp\n\t"
+		      "jmp *%1"::"r" (current->thread.sp),
+		      "r"(current->thread.ip));
753} 750}
754 751
755/* handle a Voyager SYS_INT -- If we don't, the base board will 752/* handle a Voyager SYS_INT -- If we don't, the base board will
@@ -758,25 +755,23 @@ initialize_secondary(void)
758 * System interrupts occur because some problem was detected on the 755 * System interrupts occur because some problem was detected on the
759 * various busses. To find out what you have to probe all the 756 * various busses. To find out what you have to probe all the
760 * hardware via the CAT bus. FIXME: At the moment we do nothing. */ 757 * hardware via the CAT bus. FIXME: At the moment we do nothing. */
-fastcall void
-smp_vic_sys_interrupt(struct pt_regs *regs)
+void smp_vic_sys_interrupt(struct pt_regs *regs)
 {
764 ack_CPI(VIC_SYS_INT); 760 ack_CPI(VIC_SYS_INT);
765 printk("Voyager SYSTEM INTERRUPT\n"); 761 printk("Voyager SYSTEM INTERRUPT\n");
766} 762}
767 763
768/* Handle a voyager CMN_INT; These interrupts occur either because of 764/* Handle a voyager CMN_INT; These interrupts occur either because of
769 * a system status change or because a single bit memory error 765 * a system status change or because a single bit memory error
770 * occurred. FIXME: At the moment, ignore all this. */ 766 * occurred. FIXME: At the moment, ignore all this. */
771fastcall void 767void smp_vic_cmn_interrupt(struct pt_regs *regs)
772smp_vic_cmn_interrupt(struct pt_regs *regs)
773{ 768{
774 static __u8 in_cmn_int = 0; 769 static __u8 in_cmn_int = 0;
775 static DEFINE_SPINLOCK(cmn_int_lock); 770 static DEFINE_SPINLOCK(cmn_int_lock);
776 771
777 /* common ints are broadcast, so make sure we only do this once */ 772 /* common ints are broadcast, so make sure we only do this once */
778 _raw_spin_lock(&cmn_int_lock); 773 _raw_spin_lock(&cmn_int_lock);
779 if(in_cmn_int) 774 if (in_cmn_int)
780 goto unlock_end; 775 goto unlock_end;
781 776
782 in_cmn_int++; 777 in_cmn_int++;
@@ -784,12 +779,12 @@ smp_vic_cmn_interrupt(struct pt_regs *regs)
784 779
785 VDEBUG(("Voyager COMMON INTERRUPT\n")); 780 VDEBUG(("Voyager COMMON INTERRUPT\n"));
786 781
787 if(voyager_level == 5) 782 if (voyager_level == 5)
788 voyager_cat_do_common_interrupt(); 783 voyager_cat_do_common_interrupt();
789 784
790 _raw_spin_lock(&cmn_int_lock); 785 _raw_spin_lock(&cmn_int_lock);
791 in_cmn_int = 0; 786 in_cmn_int = 0;
792 unlock_end: 787 unlock_end:
793 _raw_spin_unlock(&cmn_int_lock); 788 _raw_spin_unlock(&cmn_int_lock);
794 ack_CPI(VIC_CMN_INT); 789 ack_CPI(VIC_CMN_INT);
795} 790}
@@ -797,26 +792,23 @@ smp_vic_cmn_interrupt(struct pt_regs *regs)
797/* 792/*
798 * Reschedule call back. Nothing to do, all the work is done 793 * Reschedule call back. Nothing to do, all the work is done
799 * automatically when we return from the interrupt. */ 794 * automatically when we return from the interrupt. */
800static void 795static void smp_reschedule_interrupt(void)
801smp_reschedule_interrupt(void)
802{ 796{
803 /* do nothing */ 797 /* do nothing */
804} 798}
805 799
806static struct mm_struct * flush_mm; 800static struct mm_struct *flush_mm;
807static unsigned long flush_va; 801static unsigned long flush_va;
808static DEFINE_SPINLOCK(tlbstate_lock); 802static DEFINE_SPINLOCK(tlbstate_lock);
-#define FLUSH_ALL 0xffffffff
810 803
811/* 804/*
812 * We cannot call mmdrop() because we are in interrupt context, 805 * We cannot call mmdrop() because we are in interrupt context,
813 * instead update mm->cpu_vm_mask. 806 * instead update mm->cpu_vm_mask.
814 * 807 *
815 * We need to reload %cr3 since the page tables may be going 808 * We need to reload %cr3 since the page tables may be going
816 * away from under us.. 809 * away from under us..
817 */ 810 */
-static inline void
-leave_mm (unsigned long cpu)
+static inline void voyager_leave_mm(unsigned long cpu)
 {
821 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) 813 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
822 BUG(); 814 BUG();
@@ -824,12 +816,10 @@ leave_mm (unsigned long cpu)
824 load_cr3(swapper_pg_dir); 816 load_cr3(swapper_pg_dir);
825} 817}
826 818
827
828/* 819/*
829 * Invalidate call-back 820 * Invalidate call-back
830 */ 821 */
831static void 822static void smp_invalidate_interrupt(void)
832smp_invalidate_interrupt(void)
833{ 823{
834 __u8 cpu = smp_processor_id(); 824 __u8 cpu = smp_processor_id();
835 825
@@ -837,18 +827,18 @@ smp_invalidate_interrupt(void)
837 return; 827 return;
838 /* This will flood messages. Don't uncomment unless you see 828 /* This will flood messages. Don't uncomment unless you see
839 * Problems with cross cpu invalidation 829 * Problems with cross cpu invalidation
840 VDEBUG(("VOYAGER SMP: CPU%d received INVALIDATE_CPI\n", 830 VDEBUG(("VOYAGER SMP: CPU%d received INVALIDATE_CPI\n",
841 smp_processor_id())); 831 smp_processor_id()));
842 */ 832 */
843 833
844 if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) { 834 if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
845 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) { 835 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
846 if (flush_va == FLUSH_ALL) 836 if (flush_va == TLB_FLUSH_ALL)
847 local_flush_tlb(); 837 local_flush_tlb();
848 else 838 else
849 __flush_tlb_one(flush_va); 839 __flush_tlb_one(flush_va);
850 } else 840 } else
851 leave_mm(cpu); 841 voyager_leave_mm(cpu);
852 } 842 }
853 smp_mb__before_clear_bit(); 843 smp_mb__before_clear_bit();
854 clear_bit(cpu, &smp_invalidate_needed); 844 clear_bit(cpu, &smp_invalidate_needed);
@@ -857,11 +847,10 @@ smp_invalidate_interrupt(void)
857 847
858/* All the new flush operations for 2.4 */ 848/* All the new flush operations for 2.4 */
859 849
860
861/* This routine is called with a physical cpu mask */ 850/* This routine is called with a physical cpu mask */
862static void 851static void
863voyager_flush_tlb_others (unsigned long cpumask, struct mm_struct *mm, 852voyager_flush_tlb_others(unsigned long cpumask, struct mm_struct *mm,
864 unsigned long va) 853 unsigned long va)
865{ 854{
866 int stuck = 50000; 855 int stuck = 50000;
867 856
@@ -875,7 +864,7 @@ voyager_flush_tlb_others (unsigned long cpumask, struct mm_struct *mm,
875 BUG(); 864 BUG();
876 865
877 spin_lock(&tlbstate_lock); 866 spin_lock(&tlbstate_lock);
878 867
879 flush_mm = mm; 868 flush_mm = mm;
880 flush_va = va; 869 flush_va = va;
881 atomic_set_mask(cpumask, &smp_invalidate_needed); 870 atomic_set_mask(cpumask, &smp_invalidate_needed);
@@ -887,23 +876,23 @@ voyager_flush_tlb_others (unsigned long cpumask, struct mm_struct *mm,
887 876
888 while (smp_invalidate_needed) { 877 while (smp_invalidate_needed) {
889 mb(); 878 mb();
890 if(--stuck == 0) { 879 if (--stuck == 0) {
-			printk("***WARNING*** Stuck doing invalidate CPI (CPU%d)\n", smp_processor_id());
+			printk("***WARNING*** Stuck doing invalidate CPI "
+			       "(CPU%d)\n", smp_processor_id());
892 break; 882 break;
893 } 883 }
894 } 884 }
895 885
896 /* Uncomment only to debug invalidation problems 886 /* Uncomment only to debug invalidation problems
897 VDEBUG(("VOYAGER SMP: Completed invalidate CPI (CPU%d)\n", cpu)); 887 VDEBUG(("VOYAGER SMP: Completed invalidate CPI (CPU%d)\n", cpu));
898 */ 888 */
899 889
900 flush_mm = NULL; 890 flush_mm = NULL;
901 flush_va = 0; 891 flush_va = 0;
902 spin_unlock(&tlbstate_lock); 892 spin_unlock(&tlbstate_lock);
903} 893}
904 894
905void 895void flush_tlb_current_task(void)
906flush_tlb_current_task(void)
907{ 896{
908 struct mm_struct *mm = current->mm; 897 struct mm_struct *mm = current->mm;
909 unsigned long cpu_mask; 898 unsigned long cpu_mask;
@@ -913,14 +902,12 @@ flush_tlb_current_task(void)
913 cpu_mask = cpus_addr(mm->cpu_vm_mask)[0] & ~(1 << smp_processor_id()); 902 cpu_mask = cpus_addr(mm->cpu_vm_mask)[0] & ~(1 << smp_processor_id());
914 local_flush_tlb(); 903 local_flush_tlb();
915 if (cpu_mask) 904 if (cpu_mask)
916 voyager_flush_tlb_others(cpu_mask, mm, FLUSH_ALL); 905 voyager_flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
917 906
918 preempt_enable(); 907 preempt_enable();
919} 908}
920 909
-
-void
-flush_tlb_mm (struct mm_struct * mm)
+void flush_tlb_mm(struct mm_struct *mm)
 {
925 unsigned long cpu_mask; 912 unsigned long cpu_mask;
926 913
@@ -932,15 +919,15 @@ flush_tlb_mm (struct mm_struct * mm)
932 if (current->mm) 919 if (current->mm)
933 local_flush_tlb(); 920 local_flush_tlb();
934 else 921 else
935 leave_mm(smp_processor_id()); 922 voyager_leave_mm(smp_processor_id());
936 } 923 }
937 if (cpu_mask) 924 if (cpu_mask)
938 voyager_flush_tlb_others(cpu_mask, mm, FLUSH_ALL); 925 voyager_flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
939 926
940 preempt_enable(); 927 preempt_enable();
941} 928}
942 929
943void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) 930void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
944{ 931{
945 struct mm_struct *mm = vma->vm_mm; 932 struct mm_struct *mm = vma->vm_mm;
946 unsigned long cpu_mask; 933 unsigned long cpu_mask;
@@ -949,10 +936,10 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
949 936
950 cpu_mask = cpus_addr(mm->cpu_vm_mask)[0] & ~(1 << smp_processor_id()); 937 cpu_mask = cpus_addr(mm->cpu_vm_mask)[0] & ~(1 << smp_processor_id());
951 if (current->active_mm == mm) { 938 if (current->active_mm == mm) {
952 if(current->mm) 939 if (current->mm)
953 __flush_tlb_one(va); 940 __flush_tlb_one(va);
954 else 941 else
955 leave_mm(smp_processor_id()); 942 voyager_leave_mm(smp_processor_id());
956 } 943 }
957 944
958 if (cpu_mask) 945 if (cpu_mask)
@@ -960,21 +947,21 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
960 947
961 preempt_enable(); 948 preempt_enable();
962} 949}
+
963EXPORT_SYMBOL(flush_tlb_page); 951EXPORT_SYMBOL(flush_tlb_page);
964 952
965/* enable the requested IRQs */ 953/* enable the requested IRQs */
966static void 954static void smp_enable_irq_interrupt(void)
967smp_enable_irq_interrupt(void)
968{ 955{
969 __u8 irq; 956 __u8 irq;
970 __u8 cpu = get_cpu(); 957 __u8 cpu = get_cpu();
971 958
972 VDEBUG(("VOYAGER SMP: CPU%d enabling irq mask 0x%x\n", cpu, 959 VDEBUG(("VOYAGER SMP: CPU%d enabling irq mask 0x%x\n", cpu,
973 vic_irq_enable_mask[cpu])); 960 vic_irq_enable_mask[cpu]));
974 961
975 spin_lock(&vic_irq_lock); 962 spin_lock(&vic_irq_lock);
976 for(irq = 0; irq < 16; irq++) { 963 for (irq = 0; irq < 16; irq++) {
977 if(vic_irq_enable_mask[cpu] & (1<<irq)) 964 if (vic_irq_enable_mask[cpu] & (1 << irq))
978 enable_local_vic_irq(irq); 965 enable_local_vic_irq(irq);
979 } 966 }
980 vic_irq_enable_mask[cpu] = 0; 967 vic_irq_enable_mask[cpu] = 0;
@@ -982,17 +969,16 @@ smp_enable_irq_interrupt(void)
982 969
983 put_cpu_no_resched(); 970 put_cpu_no_resched();
984} 971}
985 972
986/* 973/*
987 * CPU halt call-back 974 * CPU halt call-back
988 */ 975 */
989static void 976static void smp_stop_cpu_function(void *dummy)
990smp_stop_cpu_function(void *dummy)
991{ 977{
992 VDEBUG(("VOYAGER SMP: CPU%d is STOPPING\n", smp_processor_id())); 978 VDEBUG(("VOYAGER SMP: CPU%d is STOPPING\n", smp_processor_id()));
993 cpu_clear(smp_processor_id(), cpu_online_map); 979 cpu_clear(smp_processor_id(), cpu_online_map);
994 local_irq_disable(); 980 local_irq_disable();
995 for(;;) 981 for (;;)
996 halt(); 982 halt();
997} 983}
998 984
@@ -1006,14 +992,13 @@ struct call_data_struct {
1006 int wait; 992 int wait;
1007}; 993};
1008 994
1009static struct call_data_struct * call_data; 995static struct call_data_struct *call_data;
1010 996
1011/* execute a thread on a new CPU. The function to be called must be 997/* execute a thread on a new CPU. The function to be called must be
1012 * previously set up. This is used to schedule a function for 998 * previously set up. This is used to schedule a function for
1013 * execution on all CPUs - set up the function then broadcast a 999 * execution on all CPUs - set up the function then broadcast a
1014 * function_interrupt CPI to come here on each CPU */ 1000 * function_interrupt CPI to come here on each CPU */
1015static void 1001static void smp_call_function_interrupt(void)
1016smp_call_function_interrupt(void)
1017{ 1002{
1018 void (*func) (void *info) = call_data->func; 1003 void (*func) (void *info) = call_data->func;
1019 void *info = call_data->info; 1004 void *info = call_data->info;
@@ -1027,16 +1012,17 @@ smp_call_function_interrupt(void)
1027 * about to execute the function 1012 * about to execute the function
1028 */ 1013 */
1029 mb(); 1014 mb();
1030 if(!test_and_clear_bit(cpu, &call_data->started)) { 1015 if (!test_and_clear_bit(cpu, &call_data->started)) {
1031 /* If the bit wasn't set, this could be a replay */ 1016 /* If the bit wasn't set, this could be a replay */
-		printk(KERN_WARNING "VOYAGER SMP: CPU %d received call funtion with no call pending\n", cpu);
+		printk(KERN_WARNING "VOYAGER SMP: CPU %d received call funtion"
+		       " with no call pending\n", cpu);
1033 return; 1019 return;
1034 } 1020 }
1035 /* 1021 /*
1036 * At this point the info structure may be out of scope unless wait==1 1022 * At this point the info structure may be out of scope unless wait==1
1037 */ 1023 */
1038 irq_enter(); 1024 irq_enter();
1039 (*func)(info); 1025 (*func) (info);
1040 __get_cpu_var(irq_stat).irq_call_count++; 1026 __get_cpu_var(irq_stat).irq_call_count++;
1041 irq_exit(); 1027 irq_exit();
1042 if (wait) { 1028 if (wait) {
@@ -1046,14 +1032,13 @@ smp_call_function_interrupt(void)
1046} 1032}
1047 1033
1048static int 1034static int
-voyager_smp_call_function_mask (cpumask_t cpumask,
-				void (*func) (void *info), void *info,
-				int wait)
+voyager_smp_call_function_mask(cpumask_t cpumask,
+			       void (*func) (void *info), void *info, int wait)
1052{ 1037{
1053 struct call_data_struct data; 1038 struct call_data_struct data;
1054 u32 mask = cpus_addr(cpumask)[0]; 1039 u32 mask = cpus_addr(cpumask)[0];
1055 1040
1056 mask &= ~(1<<smp_processor_id()); 1041 mask &= ~(1 << smp_processor_id());
1057 1042
1058 if (!mask) 1043 if (!mask)
1059 return 0; 1044 return 0;
@@ -1093,7 +1078,7 @@ voyager_smp_call_function_mask (cpumask_t cpumask,
1093 * so we use the system clock to interrupt one processor, which in 1078 * so we use the system clock to interrupt one processor, which in
1094 * turn, broadcasts a timer CPI to all the others --- we receive that 1079 * turn, broadcasts a timer CPI to all the others --- we receive that
1095 * CPI here. We don't use this actually for counting so losing 1080 * CPI here. We don't use this actually for counting so losing
1096 * ticks doesn't matter 1081 * ticks doesn't matter
1097 * 1082 *
1098 * FIXME: For those CPUs which actually have a local APIC, we could 1083 * FIXME: For those CPUs which actually have a local APIC, we could
1099 * try to use it to trigger this interrupt instead of having to 1084 * try to use it to trigger this interrupt instead of having to
@@ -1101,8 +1086,7 @@ voyager_smp_call_function_mask (cpumask_t cpumask,
1101 * no local APIC, so I can't do this 1086 * no local APIC, so I can't do this
1102 * 1087 *
1103 * This function is currently a placeholder and is unused in the code */ 1088 * This function is currently a placeholder and is unused in the code */
1104fastcall void 1089void smp_apic_timer_interrupt(struct pt_regs *regs)
1105smp_apic_timer_interrupt(struct pt_regs *regs)
1106{ 1090{
1107 struct pt_regs *old_regs = set_irq_regs(regs); 1091 struct pt_regs *old_regs = set_irq_regs(regs);
1108 wrapper_smp_local_timer_interrupt(); 1092 wrapper_smp_local_timer_interrupt();
@@ -1110,8 +1094,7 @@ smp_apic_timer_interrupt(struct pt_regs *regs)
1110} 1094}
1111 1095
1112/* All of the QUAD interrupt GATES */ 1096/* All of the QUAD interrupt GATES */
1113fastcall void 1097void smp_qic_timer_interrupt(struct pt_regs *regs)
1114smp_qic_timer_interrupt(struct pt_regs *regs)
1115{ 1098{
1116 struct pt_regs *old_regs = set_irq_regs(regs); 1099 struct pt_regs *old_regs = set_irq_regs(regs);
1117 ack_QIC_CPI(QIC_TIMER_CPI); 1100 ack_QIC_CPI(QIC_TIMER_CPI);
@@ -1119,127 +1102,112 @@ smp_qic_timer_interrupt(struct pt_regs *regs)
1119 set_irq_regs(old_regs); 1102 set_irq_regs(old_regs);
1120} 1103}
1121 1104
1122fastcall void 1105void smp_qic_invalidate_interrupt(struct pt_regs *regs)
1123smp_qic_invalidate_interrupt(struct pt_regs *regs)
1124{ 1106{
1125 ack_QIC_CPI(QIC_INVALIDATE_CPI); 1107 ack_QIC_CPI(QIC_INVALIDATE_CPI);
1126 smp_invalidate_interrupt(); 1108 smp_invalidate_interrupt();
1127} 1109}
1128 1110
1129fastcall void 1111void smp_qic_reschedule_interrupt(struct pt_regs *regs)
1130smp_qic_reschedule_interrupt(struct pt_regs *regs)
1131{ 1112{
1132 ack_QIC_CPI(QIC_RESCHEDULE_CPI); 1113 ack_QIC_CPI(QIC_RESCHEDULE_CPI);
1133 smp_reschedule_interrupt(); 1114 smp_reschedule_interrupt();
1134} 1115}
1135 1116
1136fastcall void 1117void smp_qic_enable_irq_interrupt(struct pt_regs *regs)
1137smp_qic_enable_irq_interrupt(struct pt_regs *regs)
1138{ 1118{
1139 ack_QIC_CPI(QIC_ENABLE_IRQ_CPI); 1119 ack_QIC_CPI(QIC_ENABLE_IRQ_CPI);
1140 smp_enable_irq_interrupt(); 1120 smp_enable_irq_interrupt();
1141} 1121}
1142 1122
1143fastcall void 1123void smp_qic_call_function_interrupt(struct pt_regs *regs)
1144smp_qic_call_function_interrupt(struct pt_regs *regs)
1145{ 1124{
1146 ack_QIC_CPI(QIC_CALL_FUNCTION_CPI); 1125 ack_QIC_CPI(QIC_CALL_FUNCTION_CPI);
1147 smp_call_function_interrupt(); 1126 smp_call_function_interrupt();
1148} 1127}
1149 1128
1150fastcall void 1129void smp_vic_cpi_interrupt(struct pt_regs *regs)
1151smp_vic_cpi_interrupt(struct pt_regs *regs)
1152{ 1130{
1153 struct pt_regs *old_regs = set_irq_regs(regs); 1131 struct pt_regs *old_regs = set_irq_regs(regs);
1154 __u8 cpu = smp_processor_id(); 1132 __u8 cpu = smp_processor_id();
1155 1133
1156 if(is_cpu_quad()) 1134 if (is_cpu_quad())
1157 ack_QIC_CPI(VIC_CPI_LEVEL0); 1135 ack_QIC_CPI(VIC_CPI_LEVEL0);
1158 else 1136 else
1159 ack_VIC_CPI(VIC_CPI_LEVEL0); 1137 ack_VIC_CPI(VIC_CPI_LEVEL0);
1160 1138
1161 if(test_and_clear_bit(VIC_TIMER_CPI, &vic_cpi_mailbox[cpu])) 1139 if (test_and_clear_bit(VIC_TIMER_CPI, &vic_cpi_mailbox[cpu]))
1162 wrapper_smp_local_timer_interrupt(); 1140 wrapper_smp_local_timer_interrupt();
1163 if(test_and_clear_bit(VIC_INVALIDATE_CPI, &vic_cpi_mailbox[cpu])) 1141 if (test_and_clear_bit(VIC_INVALIDATE_CPI, &vic_cpi_mailbox[cpu]))
1164 smp_invalidate_interrupt(); 1142 smp_invalidate_interrupt();
1165 if(test_and_clear_bit(VIC_RESCHEDULE_CPI, &vic_cpi_mailbox[cpu])) 1143 if (test_and_clear_bit(VIC_RESCHEDULE_CPI, &vic_cpi_mailbox[cpu]))
1166 smp_reschedule_interrupt(); 1144 smp_reschedule_interrupt();
1167 if(test_and_clear_bit(VIC_ENABLE_IRQ_CPI, &vic_cpi_mailbox[cpu])) 1145 if (test_and_clear_bit(VIC_ENABLE_IRQ_CPI, &vic_cpi_mailbox[cpu]))
1168 smp_enable_irq_interrupt(); 1146 smp_enable_irq_interrupt();
1169 if(test_and_clear_bit(VIC_CALL_FUNCTION_CPI, &vic_cpi_mailbox[cpu])) 1147 if (test_and_clear_bit(VIC_CALL_FUNCTION_CPI, &vic_cpi_mailbox[cpu]))
1170 smp_call_function_interrupt(); 1148 smp_call_function_interrupt();
1171 set_irq_regs(old_regs); 1149 set_irq_regs(old_regs);
1172} 1150}
1173 1151
1174static void 1152static void do_flush_tlb_all(void *info)
1175do_flush_tlb_all(void* info)
1176{ 1153{
1177 unsigned long cpu = smp_processor_id(); 1154 unsigned long cpu = smp_processor_id();
1178 1155
1179 __flush_tlb_all(); 1156 __flush_tlb_all();
1180 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY) 1157 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
1181 leave_mm(cpu); 1158 voyager_leave_mm(cpu);
1182} 1159}
1183 1160
1184
1185/* flush the TLB of every active CPU in the system */ 1161/* flush the TLB of every active CPU in the system */
1186void 1162void flush_tlb_all(void)
1187flush_tlb_all(void)
1188{ 1163{
1189 on_each_cpu(do_flush_tlb_all, 0, 1, 1); 1164 on_each_cpu(do_flush_tlb_all, 0, 1, 1);
1190} 1165}
1191 1166
1192/* used to set up the trampoline for other CPUs when the memory manager 1167/* used to set up the trampoline for other CPUs when the memory manager
1193 * is sorted out */ 1168 * is sorted out */
1194void __init 1169void __init smp_alloc_memory(void)
1195smp_alloc_memory(void)
1196{ 1170{
1197 trampoline_base = (__u32)alloc_bootmem_low_pages(PAGE_SIZE); 1171 trampoline_base = (__u32) alloc_bootmem_low_pages(PAGE_SIZE);
1198 if(__pa(trampoline_base) >= 0x93000) 1172 if (__pa(trampoline_base) >= 0x93000)
1199 BUG(); 1173 BUG();
1200} 1174}
1201 1175
1202/* send a reschedule CPI to one CPU by physical CPU number*/ 1176/* send a reschedule CPI to one CPU by physical CPU number*/
1203static void 1177static void voyager_smp_send_reschedule(int cpu)
1204voyager_smp_send_reschedule(int cpu)
1205{ 1178{
1206 send_one_CPI(cpu, VIC_RESCHEDULE_CPI); 1179 send_one_CPI(cpu, VIC_RESCHEDULE_CPI);
1207} 1180}
1208 1181
-
-int
-hard_smp_processor_id(void)
+int hard_smp_processor_id(void)
 {
1213 __u8 i; 1184 __u8 i;
1214 __u8 cpumask = inb(VIC_PROC_WHO_AM_I); 1185 __u8 cpumask = inb(VIC_PROC_WHO_AM_I);
1215 if((cpumask & QUAD_IDENTIFIER) == QUAD_IDENTIFIER) 1186 if ((cpumask & QUAD_IDENTIFIER) == QUAD_IDENTIFIER)
1216 return cpumask & 0x1F; 1187 return cpumask & 0x1F;
1217 1188
1218 for(i = 0; i < 8; i++) { 1189 for (i = 0; i < 8; i++) {
1219 if(cpumask & (1<<i)) 1190 if (cpumask & (1 << i))
1220 return i; 1191 return i;
1221 } 1192 }
1222 printk("** WARNING ** Illegal cpuid returned by VIC: %d", cpumask); 1193 printk("** WARNING ** Illegal cpuid returned by VIC: %d", cpumask);
1223 return 0; 1194 return 0;
1224} 1195}
1225 1196
1226int 1197int safe_smp_processor_id(void)
1227safe_smp_processor_id(void)
1228{ 1198{
1229 return hard_smp_processor_id(); 1199 return hard_smp_processor_id();
1230} 1200}
1231 1201
1232/* broadcast a halt to all other CPUs */ 1202/* broadcast a halt to all other CPUs */
1233static void 1203static void voyager_smp_send_stop(void)
1234voyager_smp_send_stop(void)
1235{ 1204{
1236 smp_call_function(smp_stop_cpu_function, NULL, 1, 1); 1205 smp_call_function(smp_stop_cpu_function, NULL, 1, 1);
1237} 1206}
1238 1207
1239/* this function is triggered in time.c when a clock tick fires 1208/* this function is triggered in time.c when a clock tick fires
1240 * we need to re-broadcast the tick to all CPUs */ 1209 * we need to re-broadcast the tick to all CPUs */
1241void 1210void smp_vic_timer_interrupt(void)
1242smp_vic_timer_interrupt(void)
1243{ 1211{
1244 send_CPI_allbutself(VIC_TIMER_CPI); 1212 send_CPI_allbutself(VIC_TIMER_CPI);
1245 smp_local_timer_interrupt(); 1213 smp_local_timer_interrupt();
@@ -1253,8 +1221,7 @@ smp_vic_timer_interrupt(void)
1253 * multiplier is 1 and it can be changed by writing the new multiplier 1221 * multiplier is 1 and it can be changed by writing the new multiplier
1254 * value into /proc/profile. 1222 * value into /proc/profile.
1255 */ 1223 */
1256void 1224void smp_local_timer_interrupt(void)
1257smp_local_timer_interrupt(void)
1258{ 1225{
1259 int cpu = smp_processor_id(); 1226 int cpu = smp_processor_id();
1260 long weight; 1227 long weight;
@@ -1269,18 +1236,18 @@ smp_local_timer_interrupt(void)
1269 * 1236 *
1270 * Interrupts are already masked off at this point. 1237 * Interrupts are already masked off at this point.
1271 */ 1238 */
1272 per_cpu(prof_counter,cpu) = per_cpu(prof_multiplier, cpu); 1239 per_cpu(prof_counter, cpu) = per_cpu(prof_multiplier, cpu);
1273 if (per_cpu(prof_counter, cpu) != 1240 if (per_cpu(prof_counter, cpu) !=
1274 per_cpu(prof_old_multiplier, cpu)) { 1241 per_cpu(prof_old_multiplier, cpu)) {
1275 /* FIXME: need to update the vic timer tick here */ 1242 /* FIXME: need to update the vic timer tick here */
1276 per_cpu(prof_old_multiplier, cpu) = 1243 per_cpu(prof_old_multiplier, cpu) =
1277 per_cpu(prof_counter, cpu); 1244 per_cpu(prof_counter, cpu);
1278 } 1245 }
1279 1246
1280 update_process_times(user_mode_vm(get_irq_regs())); 1247 update_process_times(user_mode_vm(get_irq_regs()));
1281 } 1248 }
1282 1249
1283 if( ((1<<cpu) & voyager_extended_vic_processors) == 0) 1250 if (((1 << cpu) & voyager_extended_vic_processors) == 0)
1284 /* only extended VIC processors participate in 1251 /* only extended VIC processors participate in
1285 * interrupt distribution */ 1252 * interrupt distribution */
1286 return; 1253 return;
@@ -1296,12 +1263,12 @@ smp_local_timer_interrupt(void)
1296 * we can take more than 100K local irqs per second on a 100 MHz P5. 1263 * we can take more than 100K local irqs per second on a 100 MHz P5.
1297 */ 1264 */
1298 1265
1299 if((++vic_tick[cpu] & 0x7) != 0) 1266 if ((++vic_tick[cpu] & 0x7) != 0)
1300 return; 1267 return;
1301 /* get here every 16 ticks (about every 1/6 of a second) */ 1268 /* get here every 16 ticks (about every 1/6 of a second) */
1302 1269
1303 /* Change our priority to give someone else a chance at getting 1270 /* Change our priority to give someone else a chance at getting
1304 * the IRQ. The algorithm goes like this: 1271 * the IRQ. The algorithm goes like this:
1305 * 1272 *
1306 * In the VIC, the dynamically routed interrupt is always 1273 * In the VIC, the dynamically routed interrupt is always
1307 * handled by the lowest priority eligible (i.e. receiving 1274 * handled by the lowest priority eligible (i.e. receiving
@@ -1325,18 +1292,18 @@ smp_local_timer_interrupt(void)
1325 * affinity code since we now try to even up the interrupt 1292 * affinity code since we now try to even up the interrupt
1326 * counts when an affinity binding is keeping them on a 1293 * counts when an affinity binding is keeping them on a
1327 * particular CPU*/ 1294 * particular CPU*/
1328 weight = (vic_intr_count[cpu]*voyager_extended_cpus 1295 weight = (vic_intr_count[cpu] * voyager_extended_cpus
1329 - vic_intr_total) >> 4; 1296 - vic_intr_total) >> 4;
1330 weight += 4; 1297 weight += 4;
1331 if(weight > 7) 1298 if (weight > 7)
1332 weight = 7; 1299 weight = 7;
1333 if(weight < 0) 1300 if (weight < 0)
1334 weight = 0; 1301 weight = 0;
1335 1302
1336 outb((__u8)weight, VIC_PRIORITY_REGISTER); 1303 outb((__u8) weight, VIC_PRIORITY_REGISTER);
1337 1304
1338#ifdef VOYAGER_DEBUG 1305#ifdef VOYAGER_DEBUG
1339 if((vic_tick[cpu] & 0xFFF) == 0) { 1306 if ((vic_tick[cpu] & 0xFFF) == 0) {
1340 /* print this message roughly every 25 secs */ 1307 /* print this message roughly every 25 secs */
1341 printk("VOYAGER SMP: vic_tick[%d] = %lu, weight = %ld\n", 1308 printk("VOYAGER SMP: vic_tick[%d] = %lu, weight = %ld\n",
1342 cpu, vic_tick[cpu], weight); 1309 cpu, vic_tick[cpu], weight);
@@ -1345,15 +1312,14 @@ smp_local_timer_interrupt(void)
1345} 1312}
1346 1313
1347/* setup the profiling timer */ 1314/* setup the profiling timer */
1348int 1315int setup_profiling_timer(unsigned int multiplier)
1349setup_profiling_timer(unsigned int multiplier)
1350{ 1316{
1351 int i; 1317 int i;
1352 1318
1353 if ( (!multiplier)) 1319 if ((!multiplier))
1354 return -EINVAL; 1320 return -EINVAL;
1355 1321
1356 /* 1322 /*
1357 * Set the new multiplier for each CPU. CPUs don't start using the 1323 * Set the new multiplier for each CPU. CPUs don't start using the
1358 * new values until the next timer interrupt in which they do process 1324 * new values until the next timer interrupt in which they do process
1359 * accounting. 1325 * accounting.
@@ -1367,15 +1333,13 @@ setup_profiling_timer(unsigned int multiplier)
1367/* This is a bit of a mess, but forced on us by the genirq changes 1333/* This is a bit of a mess, but forced on us by the genirq changes
1368 * there's no genirq handler that really does what voyager wants 1334 * there's no genirq handler that really does what voyager wants
1369 * so hack it up with the simple IRQ handler */ 1335 * so hack it up with the simple IRQ handler */
1370static void fastcall 1336static void handle_vic_irq(unsigned int irq, struct irq_desc *desc)
1371handle_vic_irq(unsigned int irq, struct irq_desc *desc)
1372{ 1337{
1373 before_handle_vic_irq(irq); 1338 before_handle_vic_irq(irq);
1374 handle_simple_irq(irq, desc); 1339 handle_simple_irq(irq, desc);
1375 after_handle_vic_irq(irq); 1340 after_handle_vic_irq(irq);
1376} 1341}
1377 1342
1378
1379/* The CPIs are handled in the per cpu 8259s, so they must be 1343/* The CPIs are handled in the per cpu 8259s, so they must be
1380 * enabled to be received: FIX: enabling the CPIs in the early 1344 * enabled to be received: FIX: enabling the CPIs in the early
1381 * boot sequence interferes with bug checking; enable them later 1345 * boot sequence interferes with bug checking; enable them later
@@ -1385,13 +1349,12 @@ handle_vic_irq(unsigned int irq, struct irq_desc *desc)
1385#define QIC_SET_GATE(cpi, vector) \ 1349#define QIC_SET_GATE(cpi, vector) \
1386 set_intr_gate((cpi) + QIC_DEFAULT_CPI_BASE, (vector)) 1350 set_intr_gate((cpi) + QIC_DEFAULT_CPI_BASE, (vector))
1387 1351
1388void __init 1352void __init smp_intr_init(void)
1389smp_intr_init(void)
1390{ 1353{
1391 int i; 1354 int i;
1392 1355
1393 /* initialize the per cpu irq mask to all disabled */ 1356 /* initialize the per cpu irq mask to all disabled */
1394 for(i = 0; i < NR_CPUS; i++) 1357 for (i = 0; i < NR_CPUS; i++)
1395 vic_irq_mask[i] = 0xFFFF; 1358 vic_irq_mask[i] = 0xFFFF;
1396 1359
1397 VIC_SET_GATE(VIC_CPI_LEVEL0, vic_cpi_interrupt); 1360 VIC_SET_GATE(VIC_CPI_LEVEL0, vic_cpi_interrupt);
@@ -1404,42 +1367,40 @@ smp_intr_init(void)
1404 QIC_SET_GATE(QIC_RESCHEDULE_CPI, qic_reschedule_interrupt); 1367 QIC_SET_GATE(QIC_RESCHEDULE_CPI, qic_reschedule_interrupt);
1405 QIC_SET_GATE(QIC_ENABLE_IRQ_CPI, qic_enable_irq_interrupt); 1368 QIC_SET_GATE(QIC_ENABLE_IRQ_CPI, qic_enable_irq_interrupt);
1406 QIC_SET_GATE(QIC_CALL_FUNCTION_CPI, qic_call_function_interrupt); 1369 QIC_SET_GATE(QIC_CALL_FUNCTION_CPI, qic_call_function_interrupt);
1407
1408 1370
1409 /* now put the VIC descriptor into the first 48 IRQs 1371 /* now put the VIC descriptor into the first 48 IRQs
1410 * 1372 *
1411 * This is for later: first 16 correspond to PC IRQs; next 16 1373 * This is for later: first 16 correspond to PC IRQs; next 16
1412 * are Primary MC IRQs and final 16 are Secondary MC IRQs */ 1374 * are Primary MC IRQs and final 16 are Secondary MC IRQs */
1413 for(i = 0; i < 48; i++) 1375 for (i = 0; i < 48; i++)
1414 set_irq_chip_and_handler(i, &vic_chip, handle_vic_irq); 1376 set_irq_chip_and_handler(i, &vic_chip, handle_vic_irq);
1415} 1377}
1416 1378
1417/* send a CPI at level cpi to a set of cpus in cpuset (set 1 bit per 1379/* send a CPI at level cpi to a set of cpus in cpuset (set 1 bit per
1418 * processor to receive CPI */ 1380 * processor to receive CPI */
1419static void 1381static void send_CPI(__u32 cpuset, __u8 cpi)
1420send_CPI(__u32 cpuset, __u8 cpi)
1421{ 1382{
1422 int cpu; 1383 int cpu;
1423 __u32 quad_cpuset = (cpuset & voyager_quad_processors); 1384 __u32 quad_cpuset = (cpuset & voyager_quad_processors);
1424 1385
1425 if(cpi < VIC_START_FAKE_CPI) { 1386 if (cpi < VIC_START_FAKE_CPI) {
1426		/* fake CPIs are only used for booting, so send to the	1387		/* fake CPIs are only used for booting, so send to the
1427 * extended quads as well---Quads must be VIC booted */ 1388 * extended quads as well---Quads must be VIC booted */
1428 outb((__u8)(cpuset), VIC_CPI_Registers[cpi]); 1389 outb((__u8) (cpuset), VIC_CPI_Registers[cpi]);
1429 return; 1390 return;
1430 } 1391 }
1431 if(quad_cpuset) 1392 if (quad_cpuset)
1432 send_QIC_CPI(quad_cpuset, cpi); 1393 send_QIC_CPI(quad_cpuset, cpi);
1433 cpuset &= ~quad_cpuset; 1394 cpuset &= ~quad_cpuset;
1434	cpuset &= 0xff;		/* only first 8 CPUs valid for VIC CPI */	1395	cpuset &= 0xff;		/* only first 8 CPUs valid for VIC CPI */
1435 if(cpuset == 0) 1396 if (cpuset == 0)
1436 return; 1397 return;
1437 for_each_online_cpu(cpu) { 1398 for_each_online_cpu(cpu) {
1438 if(cpuset & (1<<cpu)) 1399 if (cpuset & (1 << cpu))
1439 set_bit(cpi, &vic_cpi_mailbox[cpu]); 1400 set_bit(cpi, &vic_cpi_mailbox[cpu]);
1440 } 1401 }
1441 if(cpuset) 1402 if (cpuset)
1442 outb((__u8)cpuset, VIC_CPI_Registers[VIC_CPI_LEVEL0]); 1403 outb((__u8) cpuset, VIC_CPI_Registers[VIC_CPI_LEVEL0]);
1443} 1404}
1444 1405
1445/* Acknowledge receipt of CPI in the QIC, clear in QIC hardware and 1406/* Acknowledge receipt of CPI in the QIC, clear in QIC hardware and
@@ -1448,20 +1409,19 @@ send_CPI(__u32 cpuset, __u8 cpi)
1448 * DON'T make this inline otherwise the cache line read will be 1409 * DON'T make this inline otherwise the cache line read will be
1449 * optimised away 1410 * optimised away
1450 * */ 1411 * */
1451static int 1412static int ack_QIC_CPI(__u8 cpi)
1452ack_QIC_CPI(__u8 cpi) { 1413{
1453 __u8 cpu = hard_smp_processor_id(); 1414 __u8 cpu = hard_smp_processor_id();
1454 1415
1455 cpi &= 7; 1416 cpi &= 7;
1456 1417
1457 outb(1<<cpi, QIC_INTERRUPT_CLEAR1); 1418 outb(1 << cpi, QIC_INTERRUPT_CLEAR1);
1458 return voyager_quad_cpi_addr[cpu]->qic_cpi[cpi].cpi; 1419 return voyager_quad_cpi_addr[cpu]->qic_cpi[cpi].cpi;
1459} 1420}
1460 1421
1461static void 1422static void ack_special_QIC_CPI(__u8 cpi)
1462ack_special_QIC_CPI(__u8 cpi)
1463{ 1423{
1464 switch(cpi) { 1424 switch (cpi) {
1465 case VIC_CMN_INT: 1425 case VIC_CMN_INT:
1466 outb(QIC_CMN_INT, QIC_INTERRUPT_CLEAR0); 1426 outb(QIC_CMN_INT, QIC_INTERRUPT_CLEAR0);
1467 break; 1427 break;
@@ -1474,8 +1434,7 @@ ack_special_QIC_CPI(__u8 cpi)
1474} 1434}
1475 1435
1476/* Acknowledge receipt of CPI in the VIC (essentially an EOI) */ 1436/* Acknowledge receipt of CPI in the VIC (essentially an EOI) */
1477static void 1437static void ack_VIC_CPI(__u8 cpi)
1478ack_VIC_CPI(__u8 cpi)
1479{ 1438{
1480#ifdef VOYAGER_DEBUG 1439#ifdef VOYAGER_DEBUG
1481 unsigned long flags; 1440 unsigned long flags;
@@ -1484,17 +1443,17 @@ ack_VIC_CPI(__u8 cpi)
1484 1443
1485 local_irq_save(flags); 1444 local_irq_save(flags);
1486 isr = vic_read_isr(); 1445 isr = vic_read_isr();
1487 if((isr & (1<<(cpi &7))) == 0) { 1446 if ((isr & (1 << (cpi & 7))) == 0) {
1488 printk("VOYAGER SMP: CPU%d lost CPI%d\n", cpu, cpi); 1447 printk("VOYAGER SMP: CPU%d lost CPI%d\n", cpu, cpi);
1489 } 1448 }
1490#endif 1449#endif
1491 /* send specific EOI; the two system interrupts have 1450 /* send specific EOI; the two system interrupts have
1492 * bit 4 set for a separate vector but behave as the 1451 * bit 4 set for a separate vector but behave as the
1493 * corresponding 3 bit intr */ 1452 * corresponding 3 bit intr */
1494 outb_p(0x60|(cpi & 7),0x20); 1453 outb_p(0x60 | (cpi & 7), 0x20);
1495 1454
1496#ifdef VOYAGER_DEBUG 1455#ifdef VOYAGER_DEBUG
1497 if((vic_read_isr() & (1<<(cpi &7))) != 0) { 1456 if ((vic_read_isr() & (1 << (cpi & 7))) != 0) {
1498 printk("VOYAGER SMP: CPU%d still asserting CPI%d\n", cpu, cpi); 1457 printk("VOYAGER SMP: CPU%d still asserting CPI%d\n", cpu, cpi);
1499 } 1458 }
1500 local_irq_restore(flags); 1459 local_irq_restore(flags);
@@ -1502,12 +1461,11 @@ ack_VIC_CPI(__u8 cpi)
1502} 1461}
1503 1462
1504/* cribbed with thanks from irq.c */ 1463/* cribbed with thanks from irq.c */
1505#define __byte(x,y) (((unsigned char *)&(y))[x]) 1464#define __byte(x,y) (((unsigned char *)&(y))[x])
1506#define cached_21(cpu) (__byte(0,vic_irq_mask[cpu])) 1465#define cached_21(cpu) (__byte(0,vic_irq_mask[cpu]))
1507#define cached_A1(cpu) (__byte(1,vic_irq_mask[cpu])) 1466#define cached_A1(cpu) (__byte(1,vic_irq_mask[cpu]))
1508 1467
1509static unsigned int 1468static unsigned int startup_vic_irq(unsigned int irq)
1510startup_vic_irq(unsigned int irq)
1511{ 1469{
1512 unmask_vic_irq(irq); 1470 unmask_vic_irq(irq);
1513 1471
@@ -1535,13 +1493,12 @@ startup_vic_irq(unsigned int irq)
1535 * broadcast an Interrupt enable CPI which causes all other CPUs to 1493 * broadcast an Interrupt enable CPI which causes all other CPUs to
1536 * adjust their masks accordingly. */ 1494 * adjust their masks accordingly. */
1537 1495
1538static void 1496static void unmask_vic_irq(unsigned int irq)
1539unmask_vic_irq(unsigned int irq)
1540{ 1497{
1541	/* linux doesn't do processor-irq affinity, so enable on	1498	/* linux doesn't do processor-irq affinity, so enable on
1542 * all CPUs we know about */ 1499 * all CPUs we know about */
1543 int cpu = smp_processor_id(), real_cpu; 1500 int cpu = smp_processor_id(), real_cpu;
1544 __u16 mask = (1<<irq); 1501 __u16 mask = (1 << irq);
1545 __u32 processorList = 0; 1502 __u32 processorList = 0;
1546 unsigned long flags; 1503 unsigned long flags;
1547 1504
@@ -1549,78 +1506,72 @@ unmask_vic_irq(unsigned int irq)
1549 irq, cpu, cpu_irq_affinity[cpu])); 1506 irq, cpu, cpu_irq_affinity[cpu]));
1550 spin_lock_irqsave(&vic_irq_lock, flags); 1507 spin_lock_irqsave(&vic_irq_lock, flags);
1551 for_each_online_cpu(real_cpu) { 1508 for_each_online_cpu(real_cpu) {
1552 if(!(voyager_extended_vic_processors & (1<<real_cpu))) 1509 if (!(voyager_extended_vic_processors & (1 << real_cpu)))
1553 continue; 1510 continue;
1554 if(!(cpu_irq_affinity[real_cpu] & mask)) { 1511 if (!(cpu_irq_affinity[real_cpu] & mask)) {
1555 /* irq has no affinity for this CPU, ignore */ 1512 /* irq has no affinity for this CPU, ignore */
1556 continue; 1513 continue;
1557 } 1514 }
1558 if(real_cpu == cpu) { 1515 if (real_cpu == cpu) {
1559 enable_local_vic_irq(irq); 1516 enable_local_vic_irq(irq);
1560 } 1517 } else if (vic_irq_mask[real_cpu] & mask) {
1561 else if(vic_irq_mask[real_cpu] & mask) {
1562 vic_irq_enable_mask[real_cpu] |= mask; 1518 vic_irq_enable_mask[real_cpu] |= mask;
1563 processorList |= (1<<real_cpu); 1519 processorList |= (1 << real_cpu);
1564 } 1520 }
1565 } 1521 }
1566 spin_unlock_irqrestore(&vic_irq_lock, flags); 1522 spin_unlock_irqrestore(&vic_irq_lock, flags);
1567 if(processorList) 1523 if (processorList)
1568 send_CPI(processorList, VIC_ENABLE_IRQ_CPI); 1524 send_CPI(processorList, VIC_ENABLE_IRQ_CPI);
1569} 1525}
1570 1526
1571static void 1527static void mask_vic_irq(unsigned int irq)
1572mask_vic_irq(unsigned int irq)
1573{ 1528{
1574 /* lazy disable, do nothing */ 1529 /* lazy disable, do nothing */
1575} 1530}
1576 1531
1577static void 1532static void enable_local_vic_irq(unsigned int irq)
1578enable_local_vic_irq(unsigned int irq)
1579{ 1533{
1580 __u8 cpu = smp_processor_id(); 1534 __u8 cpu = smp_processor_id();
1581 __u16 mask = ~(1 << irq); 1535 __u16 mask = ~(1 << irq);
1582 __u16 old_mask = vic_irq_mask[cpu]; 1536 __u16 old_mask = vic_irq_mask[cpu];
1583 1537
1584 vic_irq_mask[cpu] &= mask; 1538 vic_irq_mask[cpu] &= mask;
1585 if(vic_irq_mask[cpu] == old_mask) 1539 if (vic_irq_mask[cpu] == old_mask)
1586 return; 1540 return;
1587 1541
1588 VDEBUG(("VOYAGER DEBUG: Enabling irq %d in hardware on CPU %d\n", 1542 VDEBUG(("VOYAGER DEBUG: Enabling irq %d in hardware on CPU %d\n",
1589 irq, cpu)); 1543 irq, cpu));
1590 1544
1591 if (irq & 8) { 1545 if (irq & 8) {
1592 outb_p(cached_A1(cpu),0xA1); 1546 outb_p(cached_A1(cpu), 0xA1);
1593 (void)inb_p(0xA1); 1547 (void)inb_p(0xA1);
1594 } 1548 } else {
1595 else { 1549 outb_p(cached_21(cpu), 0x21);
1596 outb_p(cached_21(cpu),0x21);
1597 (void)inb_p(0x21); 1550 (void)inb_p(0x21);
1598 } 1551 }
1599} 1552}
1600 1553
1601static void 1554static void disable_local_vic_irq(unsigned int irq)
1602disable_local_vic_irq(unsigned int irq)
1603{ 1555{
1604 __u8 cpu = smp_processor_id(); 1556 __u8 cpu = smp_processor_id();
1605 __u16 mask = (1 << irq); 1557 __u16 mask = (1 << irq);
1606 __u16 old_mask = vic_irq_mask[cpu]; 1558 __u16 old_mask = vic_irq_mask[cpu];
1607 1559
1608 if(irq == 7) 1560 if (irq == 7)
1609 return; 1561 return;
1610 1562
1611 vic_irq_mask[cpu] |= mask; 1563 vic_irq_mask[cpu] |= mask;
1612 if(old_mask == vic_irq_mask[cpu]) 1564 if (old_mask == vic_irq_mask[cpu])
1613 return; 1565 return;
1614 1566
1615 VDEBUG(("VOYAGER DEBUG: Disabling irq %d in hardware on CPU %d\n", 1567 VDEBUG(("VOYAGER DEBUG: Disabling irq %d in hardware on CPU %d\n",
1616 irq, cpu)); 1568 irq, cpu));
1617 1569
1618 if (irq & 8) { 1570 if (irq & 8) {
1619 outb_p(cached_A1(cpu),0xA1); 1571 outb_p(cached_A1(cpu), 0xA1);
1620 (void)inb_p(0xA1); 1572 (void)inb_p(0xA1);
1621 } 1573 } else {
1622 else { 1574 outb_p(cached_21(cpu), 0x21);
1623 outb_p(cached_21(cpu),0x21);
1624 (void)inb_p(0x21); 1575 (void)inb_p(0x21);
1625 } 1576 }
1626} 1577}
@@ -1631,8 +1582,7 @@ disable_local_vic_irq(unsigned int irq)
1631 * interrupt in the vic, so we merely set a flag (IRQ_DISABLED). If 1582 * interrupt in the vic, so we merely set a flag (IRQ_DISABLED). If
1632 * this interrupt actually comes in, then we mask and ack here to push 1583 * this interrupt actually comes in, then we mask and ack here to push
1633 * the interrupt off to another CPU */ 1584 * the interrupt off to another CPU */
1634static void 1585static void before_handle_vic_irq(unsigned int irq)
1635before_handle_vic_irq(unsigned int irq)
1636{ 1586{
1637 irq_desc_t *desc = irq_desc + irq; 1587 irq_desc_t *desc = irq_desc + irq;
1638 __u8 cpu = smp_processor_id(); 1588 __u8 cpu = smp_processor_id();
@@ -1641,16 +1591,16 @@ before_handle_vic_irq(unsigned int irq)
1641 vic_intr_total++; 1591 vic_intr_total++;
1642 vic_intr_count[cpu]++; 1592 vic_intr_count[cpu]++;
1643 1593
1644 if(!(cpu_irq_affinity[cpu] & (1<<irq))) { 1594 if (!(cpu_irq_affinity[cpu] & (1 << irq))) {
1645 /* The irq is not in our affinity mask, push it off 1595 /* The irq is not in our affinity mask, push it off
1646 * onto another CPU */ 1596 * onto another CPU */
1647 VDEBUG(("VOYAGER DEBUG: affinity triggered disable of irq %d on cpu %d\n", 1597 VDEBUG(("VOYAGER DEBUG: affinity triggered disable of irq %d "
1648 irq, cpu)); 1598 "on cpu %d\n", irq, cpu));
1649 disable_local_vic_irq(irq); 1599 disable_local_vic_irq(irq);
1650 /* set IRQ_INPROGRESS to prevent the handler in irq.c from 1600 /* set IRQ_INPROGRESS to prevent the handler in irq.c from
1651 * actually calling the interrupt routine */ 1601 * actually calling the interrupt routine */
1652 desc->status |= IRQ_REPLAY | IRQ_INPROGRESS; 1602 desc->status |= IRQ_REPLAY | IRQ_INPROGRESS;
1653 } else if(desc->status & IRQ_DISABLED) { 1603 } else if (desc->status & IRQ_DISABLED) {
1654 /* Damn, the interrupt actually arrived, do the lazy 1604 /* Damn, the interrupt actually arrived, do the lazy
1655 * disable thing. The interrupt routine in irq.c will 1605 * disable thing. The interrupt routine in irq.c will
1656		 * not handle an IRQ_DISABLED interrupt, so nothing more	1606		 * not handle an IRQ_DISABLED interrupt, so nothing more
@@ -1667,8 +1617,7 @@ before_handle_vic_irq(unsigned int irq)
1667} 1617}
1668 1618
1669/* Finish the VIC interrupt: basically mask */ 1619/* Finish the VIC interrupt: basically mask */
1670static void 1620static void after_handle_vic_irq(unsigned int irq)
1671after_handle_vic_irq(unsigned int irq)
1672{ 1621{
1673 irq_desc_t *desc = irq_desc + irq; 1622 irq_desc_t *desc = irq_desc + irq;
1674 1623
@@ -1685,11 +1634,11 @@ after_handle_vic_irq(unsigned int irq)
1685#ifdef VOYAGER_DEBUG 1634#ifdef VOYAGER_DEBUG
1686 /* DEBUG: before we ack, check what's in progress */ 1635 /* DEBUG: before we ack, check what's in progress */
1687 isr = vic_read_isr(); 1636 isr = vic_read_isr();
1688 if((isr & (1<<irq) && !(status & IRQ_REPLAY)) == 0) { 1637 if ((isr & (1 << irq) && !(status & IRQ_REPLAY)) == 0) {
1689 int i; 1638 int i;
1690 __u8 cpu = smp_processor_id(); 1639 __u8 cpu = smp_processor_id();
1691 __u8 real_cpu; 1640 __u8 real_cpu;
1692 int mask; /* Um... initialize me??? --RR */ 1641 int mask; /* Um... initialize me??? --RR */
1693 1642
1694 printk("VOYAGER SMP: CPU%d lost interrupt %d\n", 1643 printk("VOYAGER SMP: CPU%d lost interrupt %d\n",
1695 cpu, irq); 1644 cpu, irq);
@@ -1698,9 +1647,10 @@ after_handle_vic_irq(unsigned int irq)
1698 outb(VIC_CPU_MASQUERADE_ENABLE | real_cpu, 1647 outb(VIC_CPU_MASQUERADE_ENABLE | real_cpu,
1699 VIC_PROCESSOR_ID); 1648 VIC_PROCESSOR_ID);
1700 isr = vic_read_isr(); 1649 isr = vic_read_isr();
1701 if(isr & (1<<irq)) { 1650 if (isr & (1 << irq)) {
1702 printk("VOYAGER SMP: CPU%d ack irq %d\n", 1651 printk
1703 real_cpu, irq); 1652 ("VOYAGER SMP: CPU%d ack irq %d\n",
1653 real_cpu, irq);
1704 ack_vic_irq(irq); 1654 ack_vic_irq(irq);
1705 } 1655 }
1706 outb(cpu, VIC_PROCESSOR_ID); 1656 outb(cpu, VIC_PROCESSOR_ID);
@@ -1711,7 +1661,7 @@ after_handle_vic_irq(unsigned int irq)
1711 * receipt by another CPU so everything must be in 1661 * receipt by another CPU so everything must be in
1712 * order here */ 1662 * order here */
1713 ack_vic_irq(irq); 1663 ack_vic_irq(irq);
1714 if(status & IRQ_REPLAY) { 1664 if (status & IRQ_REPLAY) {
1715 /* replay is set if we disable the interrupt 1665 /* replay is set if we disable the interrupt
1716 * in the before_handle_vic_irq() routine, so 1666 * in the before_handle_vic_irq() routine, so
1717 * clear the in progress bit here to allow the 1667 * clear the in progress bit here to allow the
@@ -1720,9 +1670,9 @@ after_handle_vic_irq(unsigned int irq)
1720 } 1670 }
1721#ifdef VOYAGER_DEBUG 1671#ifdef VOYAGER_DEBUG
1722 isr = vic_read_isr(); 1672 isr = vic_read_isr();
1723 if((isr & (1<<irq)) != 0) 1673 if ((isr & (1 << irq)) != 0)
1724 printk("VOYAGER SMP: after_handle_vic_irq() after ack irq=%d, isr=0x%x\n", 1674 printk("VOYAGER SMP: after_handle_vic_irq() after "
1725 irq, isr); 1675 "ack irq=%d, isr=0x%x\n", irq, isr);
1726#endif /* VOYAGER_DEBUG */ 1676#endif /* VOYAGER_DEBUG */
1727 } 1677 }
1728 _raw_spin_unlock(&vic_irq_lock); 1678 _raw_spin_unlock(&vic_irq_lock);
@@ -1731,7 +1681,6 @@ after_handle_vic_irq(unsigned int irq)
1731 * may be intercepted by another CPU if reasserted */ 1681 * may be intercepted by another CPU if reasserted */
1732} 1682}
1733 1683
1734
1735/* Linux processor - interrupt affinity manipulations. 1684/* Linux processor - interrupt affinity manipulations.
1736 * 1685 *
1737 * For each processor, we maintain a 32 bit irq affinity mask. 1686 * For each processor, we maintain a 32 bit irq affinity mask.
@@ -1748,8 +1697,7 @@ after_handle_vic_irq(unsigned int irq)
1748 * change the mask and then do an interrupt enable CPI to re-enable on 1697 * change the mask and then do an interrupt enable CPI to re-enable on
1749 * the selected processors */ 1698 * the selected processors */
1750 1699
1751void 1700void set_vic_irq_affinity(unsigned int irq, cpumask_t mask)
1752set_vic_irq_affinity(unsigned int irq, cpumask_t mask)
1753{ 1701{
1754 /* Only extended processors handle interrupts */ 1702 /* Only extended processors handle interrupts */
1755 unsigned long real_mask; 1703 unsigned long real_mask;
@@ -1757,13 +1705,13 @@ set_vic_irq_affinity(unsigned int irq, cpumask_t mask)
1757 int cpu; 1705 int cpu;
1758 1706
1759 real_mask = cpus_addr(mask)[0] & voyager_extended_vic_processors; 1707 real_mask = cpus_addr(mask)[0] & voyager_extended_vic_processors;
1760 1708
1761 if(cpus_addr(mask)[0] == 0) 1709 if (cpus_addr(mask)[0] == 0)
1762 /* can't have no CPUs to accept the interrupt -- extremely 1710 /* can't have no CPUs to accept the interrupt -- extremely
1763 * bad things will happen */ 1711 * bad things will happen */
1764 return; 1712 return;
1765 1713
1766 if(irq == 0) 1714 if (irq == 0)
1767 /* can't change the affinity of the timer IRQ. This 1715 /* can't change the affinity of the timer IRQ. This
1768 * is due to the constraint in the voyager 1716 * is due to the constraint in the voyager
1769	 * architecture that the CPI also comes in on an IRQ	1717	 * architecture that the CPI also comes in on an IRQ
@@ -1772,7 +1720,7 @@ set_vic_irq_affinity(unsigned int irq, cpumask_t mask)
1772	 * will no longer be able to accept VIC CPIs */	1720	 * will no longer be able to accept VIC CPIs */
1773 return; 1721 return;
1774 1722
1775 if(irq >= 32) 1723 if (irq >= 32)
1776 /* You can only have 32 interrupts in a voyager system 1724 /* You can only have 32 interrupts in a voyager system
1777 * (and 32 only if you have a secondary microchannel 1725 * (and 32 only if you have a secondary microchannel
1778 * bus) */ 1726 * bus) */
@@ -1780,8 +1728,8 @@ set_vic_irq_affinity(unsigned int irq, cpumask_t mask)
1780 1728
1781 for_each_online_cpu(cpu) { 1729 for_each_online_cpu(cpu) {
1782 unsigned long cpu_mask = 1 << cpu; 1730 unsigned long cpu_mask = 1 << cpu;
1783 1731
1784 if(cpu_mask & real_mask) { 1732 if (cpu_mask & real_mask) {
1785 /* enable the interrupt for this cpu */ 1733 /* enable the interrupt for this cpu */
1786 cpu_irq_affinity[cpu] |= irq_mask; 1734 cpu_irq_affinity[cpu] |= irq_mask;
1787 } else { 1735 } else {
@@ -1800,25 +1748,23 @@ set_vic_irq_affinity(unsigned int irq, cpumask_t mask)
1800 unmask_vic_irq(irq); 1748 unmask_vic_irq(irq);
1801} 1749}
1802 1750
1803static void 1751static void ack_vic_irq(unsigned int irq)
1804ack_vic_irq(unsigned int irq)
1805{ 1752{
1806 if (irq & 8) { 1753 if (irq & 8) {
1807 outb(0x62,0x20); /* Specific EOI to cascade */ 1754 outb(0x62, 0x20); /* Specific EOI to cascade */
1808 outb(0x60|(irq & 7),0xA0); 1755 outb(0x60 | (irq & 7), 0xA0);
1809 } else { 1756 } else {
1810 outb(0x60 | (irq & 7),0x20); 1757 outb(0x60 | (irq & 7), 0x20);
1811 } 1758 }
1812} 1759}
1813 1760
1814/* enable the CPIs. In the VIC, the CPIs are delivered by the 8259 1761/* enable the CPIs. In the VIC, the CPIs are delivered by the 8259
1815 * but are not vectored by it. This means that the 8259 mask must be 1762 * but are not vectored by it. This means that the 8259 mask must be
1816 * lowered to receive them */ 1763 * lowered to receive them */
1817static __init void 1764static __init void vic_enable_cpi(void)
1818vic_enable_cpi(void)
1819{ 1765{
1820 __u8 cpu = smp_processor_id(); 1766 __u8 cpu = smp_processor_id();
1821 1767
1822 /* just take a copy of the current mask (nop for boot cpu) */ 1768 /* just take a copy of the current mask (nop for boot cpu) */
1823 vic_irq_mask[cpu] = vic_irq_mask[boot_cpu_id]; 1769 vic_irq_mask[cpu] = vic_irq_mask[boot_cpu_id];
1824 1770
@@ -1827,7 +1773,7 @@ vic_enable_cpi(void)
1827 /* for sys int and cmn int */ 1773 /* for sys int and cmn int */
1828 enable_local_vic_irq(7); 1774 enable_local_vic_irq(7);
1829 1775
1830 if(is_cpu_quad()) { 1776 if (is_cpu_quad()) {
1831 outb(QIC_DEFAULT_MASK0, QIC_MASK_REGISTER0); 1777 outb(QIC_DEFAULT_MASK0, QIC_MASK_REGISTER0);
1832 outb(QIC_CPI_ENABLE, QIC_MASK_REGISTER1); 1778 outb(QIC_CPI_ENABLE, QIC_MASK_REGISTER1);
1833 VDEBUG(("VOYAGER SMP: QIC ENABLE CPI: CPU%d: MASK 0x%x\n", 1779 VDEBUG(("VOYAGER SMP: QIC ENABLE CPI: CPU%d: MASK 0x%x\n",
@@ -1838,8 +1784,7 @@ vic_enable_cpi(void)
1838 cpu, vic_irq_mask[cpu])); 1784 cpu, vic_irq_mask[cpu]));
1839} 1785}
1840 1786
1841void 1787void voyager_smp_dump()
1842voyager_smp_dump()
1843{ 1788{
1844 int old_cpu = smp_processor_id(), cpu; 1789 int old_cpu = smp_processor_id(), cpu;
1845 1790
@@ -1865,10 +1810,10 @@ voyager_smp_dump()
1865 cpu, vic_irq_mask[cpu], imr, irr, isr); 1810 cpu, vic_irq_mask[cpu], imr, irr, isr);
1866#if 0 1811#if 0
1867 /* These lines are put in to try to unstick an un ack'd irq */ 1812 /* These lines are put in to try to unstick an un ack'd irq */
1868 if(isr != 0) { 1813 if (isr != 0) {
1869 int irq; 1814 int irq;
1870 for(irq=0; irq<16; irq++) { 1815 for (irq = 0; irq < 16; irq++) {
1871 if(isr & (1<<irq)) { 1816 if (isr & (1 << irq)) {
1872 printk("\tCPU%d: ack irq %d\n", 1817 printk("\tCPU%d: ack irq %d\n",
1873 cpu, irq); 1818 cpu, irq);
1874 local_irq_save(flags); 1819 local_irq_save(flags);
@@ -1884,17 +1829,15 @@ voyager_smp_dump()
1884 } 1829 }
1885} 1830}
1886 1831
1887void 1832void smp_voyager_power_off(void *dummy)
1888smp_voyager_power_off(void *dummy)
1889{ 1833{
1890 if(smp_processor_id() == boot_cpu_id) 1834 if (smp_processor_id() == boot_cpu_id)
1891 voyager_power_off(); 1835 voyager_power_off();
1892 else 1836 else
1893 smp_stop_cpu_function(NULL); 1837 smp_stop_cpu_function(NULL);
1894} 1838}
1895 1839
1896static void __init 1840static void __init voyager_smp_prepare_cpus(unsigned int max_cpus)
1897voyager_smp_prepare_cpus(unsigned int max_cpus)
1898{ 1841{
1899 /* FIXME: ignore max_cpus for now */ 1842 /* FIXME: ignore max_cpus for now */
1900 smp_boot_cpus(); 1843 smp_boot_cpus();
@@ -1911,8 +1854,7 @@ static void __cpuinit voyager_smp_prepare_boot_cpu(void)
1911 cpu_set(smp_processor_id(), cpu_present_map); 1854 cpu_set(smp_processor_id(), cpu_present_map);
1912} 1855}
1913 1856
1914static int __cpuinit 1857static int __cpuinit voyager_cpu_up(unsigned int cpu)
1915voyager_cpu_up(unsigned int cpu)
1916{ 1858{
1917 /* This only works at boot for x86. See "rewrite" above. */ 1859 /* This only works at boot for x86. See "rewrite" above. */
1918 if (cpu_isset(cpu, smp_commenced_mask)) 1860 if (cpu_isset(cpu, smp_commenced_mask))
@@ -1928,14 +1870,12 @@ voyager_cpu_up(unsigned int cpu)
1928 return 0; 1870 return 0;
1929} 1871}
1930 1872
1931static void __init 1873static void __init voyager_smp_cpus_done(unsigned int max_cpus)
1932voyager_smp_cpus_done(unsigned int max_cpus)
1933{ 1874{
1934 zap_low_mappings(); 1875 zap_low_mappings();
1935} 1876}
1936 1877
1937void __init 1878void __init smp_setup_processor_id(void)
1938smp_setup_processor_id(void)
1939{ 1879{
1940 current_thread_info()->cpu = hard_smp_processor_id(); 1880 current_thread_info()->cpu = hard_smp_processor_id();
1941 x86_write_percpu(cpu_number, hard_smp_processor_id()); 1881 x86_write_percpu(cpu_number, hard_smp_processor_id());
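
The voyager_smp.c hunks above are coding-style cleanups with no functional change: each two-line function definition collapses onto one line, "if(" / "for(" / "switch(" gain a space before the parenthesis, binary and shift operators gain surrounding spaces, "else" joins the closing brace, and stray blank lines are dropped. A minimal before/after sketch of that transformation, using a hypothetical helper rather than any function from the file:

/* Old style (left-hand column):
 *
 * static void
 * example_mask_irq(unsigned int irq)
 * {
 *	if(local_mask & (1<<irq))
 *		return;
 *	local_mask |= (1<<irq);
 * }
 */

/* New style (right-hand column): */
static unsigned short local_mask;	/* hypothetical mask, for illustration only */

static void example_mask_irq(unsigned int irq)
{
	if (local_mask & (1 << irq))
		return;
	local_mask |= (1 << irq);
}
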
diff --git a/arch/x86/mach-voyager/voyager_thread.c b/arch/x86/mach-voyager/voyager_thread.c
index 50f9366c411e..c69c931818ed 100644
--- a/arch/x86/mach-voyager/voyager_thread.c
+++ b/arch/x86/mach-voyager/voyager_thread.c
@@ -30,12 +30,10 @@
30#include <asm/mtrr.h> 30#include <asm/mtrr.h>
31#include <asm/msr.h> 31#include <asm/msr.h>
32 32
33
34struct task_struct *voyager_thread; 33struct task_struct *voyager_thread;
35static __u8 set_timeout; 34static __u8 set_timeout;
36 35
37static int 36static int execute(const char *string)
38execute(const char *string)
39{ 37{
40 int ret; 38 int ret;
41 39
@@ -52,48 +50,48 @@ execute(const char *string)
52 NULL, 50 NULL,
53 }; 51 };
54 52
55 if ((ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC)) != 0) { 53 if ((ret =
56 printk(KERN_ERR "Voyager failed to run \"%s\": %i\n", 54 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC)) != 0) {
57 string, ret); 55 printk(KERN_ERR "Voyager failed to run \"%s\": %i\n", string,
56 ret);
58 } 57 }
59 return ret; 58 return ret;
60} 59}
61 60
62static void 61static void check_from_kernel(void)
63check_from_kernel(void)
64{ 62{
65 if(voyager_status.switch_off) { 63 if (voyager_status.switch_off) {
66 64
67 /* FIXME: This should be configurable via proc */ 65 /* FIXME: This should be configurable via proc */
68 execute("umask 600; echo 0 > /etc/initrunlvl; kill -HUP 1"); 66 execute("umask 600; echo 0 > /etc/initrunlvl; kill -HUP 1");
69 } else if(voyager_status.power_fail) { 67 } else if (voyager_status.power_fail) {
70 VDEBUG(("Voyager daemon detected AC power failure\n")); 68 VDEBUG(("Voyager daemon detected AC power failure\n"));
71 69
72		/* FIXME: This should be configurable via proc */	70		/* FIXME: This should be configurable via proc */
73 execute("umask 600; echo F > /etc/powerstatus; kill -PWR 1"); 71 execute("umask 600; echo F > /etc/powerstatus; kill -PWR 1");
74 set_timeout = 1; 72 set_timeout = 1;
75 } 73 }
76} 74}
77 75
78static void 76static void check_continuing_condition(void)
79check_continuing_condition(void)
80{ 77{
81 if(voyager_status.power_fail) { 78 if (voyager_status.power_fail) {
82 __u8 data; 79 __u8 data;
83 voyager_cat_psi(VOYAGER_PSI_SUBREAD, 80 voyager_cat_psi(VOYAGER_PSI_SUBREAD,
84 VOYAGER_PSI_AC_FAIL_REG, &data); 81 VOYAGER_PSI_AC_FAIL_REG, &data);
85 if((data & 0x1f) == 0) { 82 if ((data & 0x1f) == 0) {
86 /* all power restored */ 83 /* all power restored */
87 printk(KERN_NOTICE "VOYAGER AC power restored, cancelling shutdown\n"); 84 printk(KERN_NOTICE
85 "VOYAGER AC power restored, cancelling shutdown\n");
88			/* FIXME: should be user configurable */	86			/* FIXME: should be user configurable */
89 execute("umask 600; echo O > /etc/powerstatus; kill -PWR 1"); 87 execute
88 ("umask 600; echo O > /etc/powerstatus; kill -PWR 1");
90 set_timeout = 0; 89 set_timeout = 0;
91 } 90 }
92 } 91 }
93} 92}
94 93
95static int 94static int thread(void *unused)
96thread(void *unused)
97{ 95{
98 printk(KERN_NOTICE "Voyager starting monitor thread\n"); 96 printk(KERN_NOTICE "Voyager starting monitor thread\n");
99 97
@@ -102,7 +100,7 @@ thread(void *unused)
102 schedule_timeout(set_timeout ? HZ : MAX_SCHEDULE_TIMEOUT); 100 schedule_timeout(set_timeout ? HZ : MAX_SCHEDULE_TIMEOUT);
103 101
104 VDEBUG(("Voyager Daemon awoken\n")); 102 VDEBUG(("Voyager Daemon awoken\n"));
105 if(voyager_status.request_from_kernel == 0) { 103 if (voyager_status.request_from_kernel == 0) {
106 /* probably awoken from timeout */ 104 /* probably awoken from timeout */
107 check_continuing_condition(); 105 check_continuing_condition();
108 } else { 106 } else {
@@ -112,20 +110,18 @@ thread(void *unused)
112 } 110 }
113} 111}
114 112
115static int __init 113static int __init voyager_thread_start(void)
116voyager_thread_start(void)
117{ 114{
118 voyager_thread = kthread_run(thread, NULL, "kvoyagerd"); 115 voyager_thread = kthread_run(thread, NULL, "kvoyagerd");
119 if (IS_ERR(voyager_thread)) { 116 if (IS_ERR(voyager_thread)) {
120 printk(KERN_ERR "Voyager: Failed to create system monitor thread.\n"); 117 printk(KERN_ERR
118 "Voyager: Failed to create system monitor thread.\n");
121 return PTR_ERR(voyager_thread); 119 return PTR_ERR(voyager_thread);
122 } 120 }
123 return 0; 121 return 0;
124} 122}
125 123
126 124static void __exit voyager_thread_stop(void)
127static void __exit
128voyager_thread_stop(void)
129{ 125{
130 kthread_stop(voyager_thread); 126 kthread_stop(voyager_thread);
131} 127}
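
For context, the voyager_thread.c changes above only reformat the standard kernel-thread monitor pattern the file already uses: a loop that sleeps via schedule_timeout() plus a start/stop pair built on kthread_run()/kthread_stop(). The stripped-down, self-contained sketch below shows that pattern; the identifier names (monitor_fn, kmonitord, and so on) are hypothetical, and the sketch polls kthread_should_stop(), which is the usual way to let kthread_stop() terminate the loop:

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>

static struct task_struct *monitor_task;

static int monitor_fn(void *unused)
{
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule_timeout(HZ);	/* wake up roughly once per second */
		/* ... poll hardware status and react here ... */
	}
	return 0;
}

static int __init monitor_start(void)
{
	monitor_task = kthread_run(monitor_fn, NULL, "kmonitord");
	if (IS_ERR(monitor_task))
		return PTR_ERR(monitor_task);
	return 0;
}

static void __exit monitor_stop(void)
{
	kthread_stop(monitor_task);
}

module_init(monitor_start);
module_exit(monitor_stop);
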
diff --git a/arch/x86/math-emu/errors.c b/arch/x86/math-emu/errors.c
index a1b0d22f6978..59d353d2c599 100644
--- a/arch/x86/math-emu/errors.c
+++ b/arch/x86/math-emu/errors.c
@@ -33,45 +33,41 @@
33#undef PRINT_MESSAGES 33#undef PRINT_MESSAGES
34/* */ 34/* */
35 35
36
37#if 0 36#if 0
38void Un_impl(void) 37void Un_impl(void)
39{ 38{
40 u_char byte1, FPU_modrm; 39 u_char byte1, FPU_modrm;
41 unsigned long address = FPU_ORIG_EIP; 40 unsigned long address = FPU_ORIG_EIP;
42 41
43 RE_ENTRANT_CHECK_OFF; 42 RE_ENTRANT_CHECK_OFF;
44 /* No need to check access_ok(), we have previously fetched these bytes. */ 43 /* No need to check access_ok(), we have previously fetched these bytes. */
45 printk("Unimplemented FPU Opcode at eip=%p : ", (void __user *) address); 44 printk("Unimplemented FPU Opcode at eip=%p : ", (void __user *)address);
46 if ( FPU_CS == __USER_CS ) 45 if (FPU_CS == __USER_CS) {
47 { 46 while (1) {
48 while ( 1 ) 47 FPU_get_user(byte1, (u_char __user *) address);
49 { 48 if ((byte1 & 0xf8) == 0xd8)
50 FPU_get_user(byte1, (u_char __user *) address); 49 break;
51 if ( (byte1 & 0xf8) == 0xd8 ) break; 50 printk("[%02x]", byte1);
52 printk("[%02x]", byte1); 51 address++;
53 address++; 52 }
53 printk("%02x ", byte1);
54 FPU_get_user(FPU_modrm, 1 + (u_char __user *) address);
55
56 if (FPU_modrm >= 0300)
57 printk("%02x (%02x+%d)\n", FPU_modrm, FPU_modrm & 0xf8,
58 FPU_modrm & 7);
59 else
60 printk("/%d\n", (FPU_modrm >> 3) & 7);
61 } else {
62 printk("cs selector = %04x\n", FPU_CS);
54 } 63 }
55 printk("%02x ", byte1);
56 FPU_get_user(FPU_modrm, 1 + (u_char __user *) address);
57
58 if (FPU_modrm >= 0300)
59 printk("%02x (%02x+%d)\n", FPU_modrm, FPU_modrm & 0xf8, FPU_modrm & 7);
60 else
61 printk("/%d\n", (FPU_modrm >> 3) & 7);
62 }
63 else
64 {
65 printk("cs selector = %04x\n", FPU_CS);
66 }
67
68 RE_ENTRANT_CHECK_ON;
69
70 EXCEPTION(EX_Invalid);
71 64
72} 65 RE_ENTRANT_CHECK_ON;
73#endif /* 0 */
74 66
67 EXCEPTION(EX_Invalid);
68
69}
70#endif /* 0 */
75 71
76/* 72/*
77 Called for opcodes which are illegal and which are known to result in a 73 Called for opcodes which are illegal and which are known to result in a
@@ -79,139 +75,152 @@ void Un_impl(void)
79 */ 75 */
80void FPU_illegal(void) 76void FPU_illegal(void)
81{ 77{
82 math_abort(FPU_info,SIGILL); 78 math_abort(FPU_info, SIGILL);
83} 79}
84 80
85
86
87void FPU_printall(void) 81void FPU_printall(void)
88{ 82{
89 int i; 83 int i;
90 static const char *tag_desc[] = { "Valid", "Zero", "ERROR", "Empty", 84 static const char *tag_desc[] = { "Valid", "Zero", "ERROR", "Empty",
91 "DeNorm", "Inf", "NaN" }; 85 "DeNorm", "Inf", "NaN"
92 u_char byte1, FPU_modrm; 86 };
93 unsigned long address = FPU_ORIG_EIP; 87 u_char byte1, FPU_modrm;
94 88 unsigned long address = FPU_ORIG_EIP;
95 RE_ENTRANT_CHECK_OFF; 89
96 /* No need to check access_ok(), we have previously fetched these bytes. */ 90 RE_ENTRANT_CHECK_OFF;
97 printk("At %p:", (void *) address); 91 /* No need to check access_ok(), we have previously fetched these bytes. */
98 if ( FPU_CS == __USER_CS ) 92 printk("At %p:", (void *)address);
99 { 93 if (FPU_CS == __USER_CS) {
100#define MAX_PRINTED_BYTES 20 94#define MAX_PRINTED_BYTES 20
101 for ( i = 0; i < MAX_PRINTED_BYTES; i++ ) 95 for (i = 0; i < MAX_PRINTED_BYTES; i++) {
102 { 96 FPU_get_user(byte1, (u_char __user *) address);
103 FPU_get_user(byte1, (u_char __user *) address); 97 if ((byte1 & 0xf8) == 0xd8) {
104 if ( (byte1 & 0xf8) == 0xd8 ) 98 printk(" %02x", byte1);
105 { 99 break;
106 printk(" %02x", byte1); 100 }
107 break; 101 printk(" [%02x]", byte1);
108 } 102 address++;
109 printk(" [%02x]", byte1); 103 }
110 address++; 104 if (i == MAX_PRINTED_BYTES)
111 } 105 printk(" [more..]\n");
112 if ( i == MAX_PRINTED_BYTES ) 106 else {
113 printk(" [more..]\n"); 107 FPU_get_user(FPU_modrm, 1 + (u_char __user *) address);
114 else 108
115 { 109 if (FPU_modrm >= 0300)
116 FPU_get_user(FPU_modrm, 1 + (u_char __user *) address); 110 printk(" %02x (%02x+%d)\n", FPU_modrm,
117 111 FPU_modrm & 0xf8, FPU_modrm & 7);
118 if (FPU_modrm >= 0300) 112 else
119 printk(" %02x (%02x+%d)\n", FPU_modrm, FPU_modrm & 0xf8, FPU_modrm & 7); 113 printk(" /%d, mod=%d rm=%d\n",
120 else 114 (FPU_modrm >> 3) & 7,
121 printk(" /%d, mod=%d rm=%d\n", 115 (FPU_modrm >> 6) & 3, FPU_modrm & 7);
122 (FPU_modrm >> 3) & 7, (FPU_modrm >> 6) & 3, FPU_modrm & 7); 116 }
117 } else {
118 printk("%04x\n", FPU_CS);
123 } 119 }
124 }
125 else
126 {
127 printk("%04x\n", FPU_CS);
128 }
129 120
130 partial_status = status_word(); 121 partial_status = status_word();
131 122
132#ifdef DEBUGGING 123#ifdef DEBUGGING
133if ( partial_status & SW_Backward ) printk("SW: backward compatibility\n"); 124 if (partial_status & SW_Backward)
134if ( partial_status & SW_C3 ) printk("SW: condition bit 3\n"); 125 printk("SW: backward compatibility\n");
135if ( partial_status & SW_C2 ) printk("SW: condition bit 2\n"); 126 if (partial_status & SW_C3)
136if ( partial_status & SW_C1 ) printk("SW: condition bit 1\n"); 127 printk("SW: condition bit 3\n");
137if ( partial_status & SW_C0 ) printk("SW: condition bit 0\n"); 128 if (partial_status & SW_C2)
138if ( partial_status & SW_Summary ) printk("SW: exception summary\n"); 129 printk("SW: condition bit 2\n");
139if ( partial_status & SW_Stack_Fault ) printk("SW: stack fault\n"); 130 if (partial_status & SW_C1)
140if ( partial_status & SW_Precision ) printk("SW: loss of precision\n"); 131 printk("SW: condition bit 1\n");
141if ( partial_status & SW_Underflow ) printk("SW: underflow\n"); 132 if (partial_status & SW_C0)
142if ( partial_status & SW_Overflow ) printk("SW: overflow\n"); 133 printk("SW: condition bit 0\n");
143if ( partial_status & SW_Zero_Div ) printk("SW: divide by zero\n"); 134 if (partial_status & SW_Summary)
144if ( partial_status & SW_Denorm_Op ) printk("SW: denormalized operand\n"); 135 printk("SW: exception summary\n");
145if ( partial_status & SW_Invalid ) printk("SW: invalid operation\n"); 136 if (partial_status & SW_Stack_Fault)
137 printk("SW: stack fault\n");
138 if (partial_status & SW_Precision)
139 printk("SW: loss of precision\n");
140 if (partial_status & SW_Underflow)
141 printk("SW: underflow\n");
142 if (partial_status & SW_Overflow)
143 printk("SW: overflow\n");
144 if (partial_status & SW_Zero_Div)
145 printk("SW: divide by zero\n");
146 if (partial_status & SW_Denorm_Op)
147 printk("SW: denormalized operand\n");
148 if (partial_status & SW_Invalid)
149 printk("SW: invalid operation\n");
146#endif /* DEBUGGING */ 150#endif /* DEBUGGING */
147 151
148 printk(" SW: b=%d st=%ld es=%d sf=%d cc=%d%d%d%d ef=%d%d%d%d%d%d\n", 152 printk(" SW: b=%d st=%d es=%d sf=%d cc=%d%d%d%d ef=%d%d%d%d%d%d\n", partial_status & 0x8000 ? 1 : 0, /* busy */
149 partial_status & 0x8000 ? 1 : 0, /* busy */ 153 (partial_status & 0x3800) >> 11, /* stack top pointer */
150 (partial_status & 0x3800) >> 11, /* stack top pointer */ 154 partial_status & 0x80 ? 1 : 0, /* Error summary status */
151 partial_status & 0x80 ? 1 : 0, /* Error summary status */ 155 partial_status & 0x40 ? 1 : 0, /* Stack flag */
152 partial_status & 0x40 ? 1 : 0, /* Stack flag */ 156 partial_status & SW_C3 ? 1 : 0, partial_status & SW_C2 ? 1 : 0, /* cc */
153 partial_status & SW_C3?1:0, partial_status & SW_C2?1:0, /* cc */ 157 partial_status & SW_C1 ? 1 : 0, partial_status & SW_C0 ? 1 : 0, /* cc */
154 partial_status & SW_C1?1:0, partial_status & SW_C0?1:0, /* cc */ 158 partial_status & SW_Precision ? 1 : 0,
155 partial_status & SW_Precision?1:0, partial_status & SW_Underflow?1:0, 159 partial_status & SW_Underflow ? 1 : 0,
156 partial_status & SW_Overflow?1:0, partial_status & SW_Zero_Div?1:0, 160 partial_status & SW_Overflow ? 1 : 0,
157 partial_status & SW_Denorm_Op?1:0, partial_status & SW_Invalid?1:0); 161 partial_status & SW_Zero_Div ? 1 : 0,
158 162 partial_status & SW_Denorm_Op ? 1 : 0,
159printk(" CW: ic=%d rc=%ld%ld pc=%ld%ld iem=%d ef=%d%d%d%d%d%d\n", 163 partial_status & SW_Invalid ? 1 : 0);
160 control_word & 0x1000 ? 1 : 0, 164
161 (control_word & 0x800) >> 11, (control_word & 0x400) >> 10, 165 printk(" CW: ic=%d rc=%d%d pc=%d%d iem=%d ef=%d%d%d%d%d%d\n",
162 (control_word & 0x200) >> 9, (control_word & 0x100) >> 8, 166 control_word & 0x1000 ? 1 : 0,
163 control_word & 0x80 ? 1 : 0, 167 (control_word & 0x800) >> 11, (control_word & 0x400) >> 10,
164 control_word & SW_Precision?1:0, control_word & SW_Underflow?1:0, 168 (control_word & 0x200) >> 9, (control_word & 0x100) >> 8,
165 control_word & SW_Overflow?1:0, control_word & SW_Zero_Div?1:0, 169 control_word & 0x80 ? 1 : 0,
166 control_word & SW_Denorm_Op?1:0, control_word & SW_Invalid?1:0); 170 control_word & SW_Precision ? 1 : 0,
167 171 control_word & SW_Underflow ? 1 : 0,
168 for ( i = 0; i < 8; i++ ) 172 control_word & SW_Overflow ? 1 : 0,
169 { 173 control_word & SW_Zero_Div ? 1 : 0,
170 FPU_REG *r = &st(i); 174 control_word & SW_Denorm_Op ? 1 : 0,
171 u_char tagi = FPU_gettagi(i); 175 control_word & SW_Invalid ? 1 : 0);
172 switch (tagi) 176
173 { 177 for (i = 0; i < 8; i++) {
174 case TAG_Empty: 178 FPU_REG *r = &st(i);
175 continue; 179 u_char tagi = FPU_gettagi(i);
176 break; 180 switch (tagi) {
177 case TAG_Zero: 181 case TAG_Empty:
178 case TAG_Special: 182 continue;
179 tagi = FPU_Special(r); 183 break;
180 case TAG_Valid: 184 case TAG_Zero:
181 printk("st(%d) %c .%04lx %04lx %04lx %04lx e%+-6d ", i, 185 case TAG_Special:
182 getsign(r) ? '-' : '+', 186 tagi = FPU_Special(r);
183 (long)(r->sigh >> 16), 187 case TAG_Valid:
184 (long)(r->sigh & 0xFFFF), 188 printk("st(%d) %c .%04lx %04lx %04lx %04lx e%+-6d ", i,
185 (long)(r->sigl >> 16), 189 getsign(r) ? '-' : '+',
186 (long)(r->sigl & 0xFFFF), 190 (long)(r->sigh >> 16),
187 exponent(r) - EXP_BIAS + 1); 191 (long)(r->sigh & 0xFFFF),
188 break; 192 (long)(r->sigl >> 16),
189 default: 193 (long)(r->sigl & 0xFFFF),
190 printk("Whoops! Error in errors.c: tag%d is %d ", i, tagi); 194 exponent(r) - EXP_BIAS + 1);
191 continue; 195 break;
192 break; 196 default:
197 printk("Whoops! Error in errors.c: tag%d is %d ", i,
198 tagi);
199 continue;
200 break;
201 }
202 printk("%s\n", tag_desc[(int)(unsigned)tagi]);
193 } 203 }
194 printk("%s\n", tag_desc[(int) (unsigned) tagi]);
195 }
196 204
197 RE_ENTRANT_CHECK_ON; 205 RE_ENTRANT_CHECK_ON;
198 206
199} 207}
200 208
201static struct { 209static struct {
202 int type; 210 int type;
203 const char *name; 211 const char *name;
204} exception_names[] = { 212} exception_names[] = {
205 { EX_StackOver, "stack overflow" }, 213 {
206 { EX_StackUnder, "stack underflow" }, 214 EX_StackOver, "stack overflow"}, {
207 { EX_Precision, "loss of precision" }, 215 EX_StackUnder, "stack underflow"}, {
208 { EX_Underflow, "underflow" }, 216 EX_Precision, "loss of precision"}, {
209 { EX_Overflow, "overflow" }, 217 EX_Underflow, "underflow"}, {
210 { EX_ZeroDiv, "divide by zero" }, 218 EX_Overflow, "overflow"}, {
211 { EX_Denormal, "denormalized operand" }, 219 EX_ZeroDiv, "divide by zero"}, {
212 { EX_Invalid, "invalid operation" }, 220 EX_Denormal, "denormalized operand"}, {
213 { EX_INTERNAL, "INTERNAL BUG in "FPU_VERSION }, 221 EX_Invalid, "invalid operation"}, {
214 { 0, NULL } 222 EX_INTERNAL, "INTERNAL BUG in " FPU_VERSION}, {
223 0, NULL}
215}; 224};
216 225
217/* 226/*
@@ -295,445 +304,386 @@ static struct {
295 304
296asmlinkage void FPU_exception(int n) 305asmlinkage void FPU_exception(int n)
297{ 306{
298 int i, int_type; 307 int i, int_type;
299 308
300 int_type = 0; /* Needed only to stop compiler warnings */ 309 int_type = 0; /* Needed only to stop compiler warnings */
301 if ( n & EX_INTERNAL ) 310 if (n & EX_INTERNAL) {
302 { 311 int_type = n - EX_INTERNAL;
303 int_type = n - EX_INTERNAL; 312 n = EX_INTERNAL;
304 n = EX_INTERNAL; 313 /* Set lots of exception bits! */
305 /* Set lots of exception bits! */ 314 partial_status |= (SW_Exc_Mask | SW_Summary | SW_Backward);
306 partial_status |= (SW_Exc_Mask | SW_Summary | SW_Backward); 315 } else {
307 } 316 /* Extract only the bits which we use to set the status word */
308 else 317 n &= (SW_Exc_Mask);
309 { 318 /* Set the corresponding exception bit */
310 /* Extract only the bits which we use to set the status word */ 319 partial_status |= n;
311 n &= (SW_Exc_Mask); 320 /* Set summary bits iff exception isn't masked */
312 /* Set the corresponding exception bit */ 321 if (partial_status & ~control_word & CW_Exceptions)
313 partial_status |= n; 322 partial_status |= (SW_Summary | SW_Backward);
314 /* Set summary bits iff exception isn't masked */ 323 if (n & (SW_Stack_Fault | EX_Precision)) {
315 if ( partial_status & ~control_word & CW_Exceptions ) 324 if (!(n & SW_C1))
316 partial_status |= (SW_Summary | SW_Backward); 325 /* This bit distinguishes over- from underflow for a stack fault,
317 if ( n & (SW_Stack_Fault | EX_Precision) ) 326 and roundup from round-down for precision loss. */
318 { 327 partial_status &= ~SW_C1;
319 if ( !(n & SW_C1) ) 328 }
320 /* This bit distinguishes over- from underflow for a stack fault,
321 and roundup from round-down for precision loss. */
322 partial_status &= ~SW_C1;
323 } 329 }
324 }
325 330
326 RE_ENTRANT_CHECK_OFF; 331 RE_ENTRANT_CHECK_OFF;
327 if ( (~control_word & n & CW_Exceptions) || (n == EX_INTERNAL) ) 332 if ((~control_word & n & CW_Exceptions) || (n == EX_INTERNAL)) {
328 {
329#ifdef PRINT_MESSAGES 333#ifdef PRINT_MESSAGES
330 /* My message from the sponsor */ 334 /* My message from the sponsor */
331 printk(FPU_VERSION" "__DATE__" (C) W. Metzenthen.\n"); 335 printk(FPU_VERSION " " __DATE__ " (C) W. Metzenthen.\n");
332#endif /* PRINT_MESSAGES */ 336#endif /* PRINT_MESSAGES */
333 337
334 /* Get a name string for error reporting */ 338 /* Get a name string for error reporting */
335 for (i=0; exception_names[i].type; i++) 339 for (i = 0; exception_names[i].type; i++)
336 if ( (exception_names[i].type & n) == exception_names[i].type ) 340 if ((exception_names[i].type & n) ==
337 break; 341 exception_names[i].type)
338 342 break;
339 if (exception_names[i].type) 343
340 { 344 if (exception_names[i].type) {
341#ifdef PRINT_MESSAGES 345#ifdef PRINT_MESSAGES
342 printk("FP Exception: %s!\n", exception_names[i].name); 346 printk("FP Exception: %s!\n", exception_names[i].name);
343#endif /* PRINT_MESSAGES */ 347#endif /* PRINT_MESSAGES */
344 } 348 } else
345 else 349 printk("FPU emulator: Unknown Exception: 0x%04x!\n", n);
346 printk("FPU emulator: Unknown Exception: 0x%04x!\n", n); 350
347 351 if (n == EX_INTERNAL) {
348 if ( n == EX_INTERNAL ) 352 printk("FPU emulator: Internal error type 0x%04x\n",
349 { 353 int_type);
350 printk("FPU emulator: Internal error type 0x%04x\n", int_type); 354 FPU_printall();
351 FPU_printall(); 355 }
352 }
353#ifdef PRINT_MESSAGES 356#ifdef PRINT_MESSAGES
354 else 357 else
355 FPU_printall(); 358 FPU_printall();
356#endif /* PRINT_MESSAGES */ 359#endif /* PRINT_MESSAGES */
357 360
358 /* 361 /*
359 * The 80486 generates an interrupt on the next non-control FPU 362 * The 80486 generates an interrupt on the next non-control FPU
360 * instruction. So we need some means of flagging it. 363 * instruction. So we need some means of flagging it.
361 * We use the ES (Error Summary) bit for this. 364 * We use the ES (Error Summary) bit for this.
362 */ 365 */
363 } 366 }
364 RE_ENTRANT_CHECK_ON; 367 RE_ENTRANT_CHECK_ON;
365 368
366#ifdef __DEBUG__ 369#ifdef __DEBUG__
367 math_abort(FPU_info,SIGFPE); 370 math_abort(FPU_info, SIGFPE);
368#endif /* __DEBUG__ */ 371#endif /* __DEBUG__ */
369 372
370} 373}
371 374
372
373/* Real operation attempted on a NaN. */ 375/* Real operation attempted on a NaN. */
374/* Returns < 0 if the exception is unmasked */ 376/* Returns < 0 if the exception is unmasked */
375int real_1op_NaN(FPU_REG *a) 377int real_1op_NaN(FPU_REG *a)
376{ 378{
377 int signalling, isNaN; 379 int signalling, isNaN;
378 380
379 isNaN = (exponent(a) == EXP_OVER) && (a->sigh & 0x80000000); 381 isNaN = (exponent(a) == EXP_OVER) && (a->sigh & 0x80000000);
380 382
381 /* The default result for the case of two "equal" NaNs (signs may 383 /* The default result for the case of two "equal" NaNs (signs may
382 differ) is chosen to reproduce 80486 behaviour */ 384 differ) is chosen to reproduce 80486 behaviour */
383 signalling = isNaN && !(a->sigh & 0x40000000); 385 signalling = isNaN && !(a->sigh & 0x40000000);
384 386
385 if ( !signalling ) 387 if (!signalling) {
386 { 388 if (!isNaN) { /* pseudo-NaN, or other unsupported? */
387 if ( !isNaN ) /* pseudo-NaN, or other unsupported? */ 389 if (control_word & CW_Invalid) {
388 { 390 /* Masked response */
389 if ( control_word & CW_Invalid ) 391 reg_copy(&CONST_QNaN, a);
390 { 392 }
391 /* Masked response */ 393 EXCEPTION(EX_Invalid);
392 reg_copy(&CONST_QNaN, a); 394 return (!(control_word & CW_Invalid) ? FPU_Exception :
393 } 395 0) | TAG_Special;
394 EXCEPTION(EX_Invalid); 396 }
395 return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special; 397 return TAG_Special;
396 } 398 }
397 return TAG_Special;
398 }
399 399
400 if ( control_word & CW_Invalid ) 400 if (control_word & CW_Invalid) {
401 { 401 /* The masked response */
402 /* The masked response */ 402 if (!(a->sigh & 0x80000000)) { /* pseudo-NaN ? */
403 if ( !(a->sigh & 0x80000000) ) /* pseudo-NaN ? */ 403 reg_copy(&CONST_QNaN, a);
404 { 404 }
405 reg_copy(&CONST_QNaN, a); 405 /* ensure a Quiet NaN */
406 a->sigh |= 0x40000000;
406 } 407 }
407 /* ensure a Quiet NaN */
408 a->sigh |= 0x40000000;
409 }
410 408
411 EXCEPTION(EX_Invalid); 409 EXCEPTION(EX_Invalid);
412 410
413 return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special; 411 return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special;
414} 412}
415 413
416
417/* Real operation attempted on two operands, one a NaN. */ 414/* Real operation attempted on two operands, one a NaN. */
418/* Returns < 0 if the exception is unmasked */ 415/* Returns < 0 if the exception is unmasked */
419int real_2op_NaN(FPU_REG const *b, u_char tagb, 416int real_2op_NaN(FPU_REG const *b, u_char tagb,
420 int deststnr, 417 int deststnr, FPU_REG const *defaultNaN)
421 FPU_REG const *defaultNaN)
422{ 418{
423 FPU_REG *dest = &st(deststnr); 419 FPU_REG *dest = &st(deststnr);
424 FPU_REG const *a = dest; 420 FPU_REG const *a = dest;
425 u_char taga = FPU_gettagi(deststnr); 421 u_char taga = FPU_gettagi(deststnr);
426 FPU_REG const *x; 422 FPU_REG const *x;
427 int signalling, unsupported; 423 int signalling, unsupported;
428 424
429 if ( taga == TAG_Special ) 425 if (taga == TAG_Special)
430 taga = FPU_Special(a); 426 taga = FPU_Special(a);
431 if ( tagb == TAG_Special ) 427 if (tagb == TAG_Special)
432 tagb = FPU_Special(b); 428 tagb = FPU_Special(b);
433 429
434 /* TW_NaN is also used for unsupported data types. */ 430 /* TW_NaN is also used for unsupported data types. */
435 unsupported = ((taga == TW_NaN) 431 unsupported = ((taga == TW_NaN)
436 && !((exponent(a) == EXP_OVER) && (a->sigh & 0x80000000))) 432 && !((exponent(a) == EXP_OVER)
437 || ((tagb == TW_NaN) 433 && (a->sigh & 0x80000000)))
438 && !((exponent(b) == EXP_OVER) && (b->sigh & 0x80000000))); 434 || ((tagb == TW_NaN)
439 if ( unsupported ) 435 && !((exponent(b) == EXP_OVER) && (b->sigh & 0x80000000)));
440 { 436 if (unsupported) {
441 if ( control_word & CW_Invalid ) 437 if (control_word & CW_Invalid) {
442 { 438 /* Masked response */
443 /* Masked response */ 439 FPU_copy_to_regi(&CONST_QNaN, TAG_Special, deststnr);
444 FPU_copy_to_regi(&CONST_QNaN, TAG_Special, deststnr); 440 }
445 } 441 EXCEPTION(EX_Invalid);
446 EXCEPTION(EX_Invalid); 442 return (!(control_word & CW_Invalid) ? FPU_Exception : 0) |
447 return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special; 443 TAG_Special;
448 }
449
450 if (taga == TW_NaN)
451 {
452 x = a;
453 if (tagb == TW_NaN)
454 {
455 signalling = !(a->sigh & b->sigh & 0x40000000);
456 if ( significand(b) > significand(a) )
457 x = b;
458 else if ( significand(b) == significand(a) )
459 {
460 /* The default result for the case of two "equal" NaNs (signs may
461 differ) is chosen to reproduce 80486 behaviour */
462 x = defaultNaN;
463 }
464 }
465 else
466 {
467 /* return the quiet version of the NaN in a */
468 signalling = !(a->sigh & 0x40000000);
469 } 444 }
470 } 445
471 else 446 if (taga == TW_NaN) {
447 x = a;
448 if (tagb == TW_NaN) {
449 signalling = !(a->sigh & b->sigh & 0x40000000);
450 if (significand(b) > significand(a))
451 x = b;
452 else if (significand(b) == significand(a)) {
453 /* The default result for the case of two "equal" NaNs (signs may
454 differ) is chosen to reproduce 80486 behaviour */
455 x = defaultNaN;
456 }
457 } else {
458 /* return the quiet version of the NaN in a */
459 signalling = !(a->sigh & 0x40000000);
460 }
461 } else
472#ifdef PARANOID 462#ifdef PARANOID
473 if (tagb == TW_NaN) 463 if (tagb == TW_NaN)
474#endif /* PARANOID */ 464#endif /* PARANOID */
475 { 465 {
476 signalling = !(b->sigh & 0x40000000); 466 signalling = !(b->sigh & 0x40000000);
477 x = b; 467 x = b;
478 } 468 }
479#ifdef PARANOID 469#ifdef PARANOID
480 else 470 else {
481 { 471 signalling = 0;
482 signalling = 0; 472 EXCEPTION(EX_INTERNAL | 0x113);
483 EXCEPTION(EX_INTERNAL|0x113); 473 x = &CONST_QNaN;
484 x = &CONST_QNaN; 474 }
485 }
486#endif /* PARANOID */ 475#endif /* PARANOID */
487 476
488 if ( (!signalling) || (control_word & CW_Invalid) ) 477 if ((!signalling) || (control_word & CW_Invalid)) {
489 { 478 if (!x)
490 if ( ! x ) 479 x = b;
491 x = b;
492 480
493 if ( !(x->sigh & 0x80000000) ) /* pseudo-NaN ? */ 481 if (!(x->sigh & 0x80000000)) /* pseudo-NaN ? */
494 x = &CONST_QNaN; 482 x = &CONST_QNaN;
495 483
496 FPU_copy_to_regi(x, TAG_Special, deststnr); 484 FPU_copy_to_regi(x, TAG_Special, deststnr);
497 485
498 if ( !signalling ) 486 if (!signalling)
499 return TAG_Special; 487 return TAG_Special;
500 488
501 /* ensure a Quiet NaN */ 489 /* ensure a Quiet NaN */
502 dest->sigh |= 0x40000000; 490 dest->sigh |= 0x40000000;
503 } 491 }
504 492
505 EXCEPTION(EX_Invalid); 493 EXCEPTION(EX_Invalid);
506 494
507 return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special; 495 return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special;
508} 496}
509 497
510
511/* Invalid arith operation on Valid registers */ 498/* Invalid arith operation on Valid registers */
512/* Returns < 0 if the exception is unmasked */ 499/* Returns < 0 if the exception is unmasked */
513asmlinkage int arith_invalid(int deststnr) 500asmlinkage int arith_invalid(int deststnr)
514{ 501{
515 502
516 EXCEPTION(EX_Invalid); 503 EXCEPTION(EX_Invalid);
517
518 if ( control_word & CW_Invalid )
519 {
520 /* The masked response */
521 FPU_copy_to_regi(&CONST_QNaN, TAG_Special, deststnr);
522 }
523
524 return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Valid;
525 504
526} 505 if (control_word & CW_Invalid) {
506 /* The masked response */
507 FPU_copy_to_regi(&CONST_QNaN, TAG_Special, deststnr);
508 }
527 509
510 return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Valid;
511
512}
528 513
529/* Divide a finite number by zero */ 514/* Divide a finite number by zero */
530asmlinkage int FPU_divide_by_zero(int deststnr, u_char sign) 515asmlinkage int FPU_divide_by_zero(int deststnr, u_char sign)
531{ 516{
532 FPU_REG *dest = &st(deststnr); 517 FPU_REG *dest = &st(deststnr);
533 int tag = TAG_Valid; 518 int tag = TAG_Valid;
519
520 if (control_word & CW_ZeroDiv) {
521 /* The masked response */
522 FPU_copy_to_regi(&CONST_INF, TAG_Special, deststnr);
523 setsign(dest, sign);
524 tag = TAG_Special;
525 }
534 526
535 if ( control_word & CW_ZeroDiv ) 527 EXCEPTION(EX_ZeroDiv);
536 {
537 /* The masked response */
538 FPU_copy_to_regi(&CONST_INF, TAG_Special, deststnr);
539 setsign(dest, sign);
540 tag = TAG_Special;
541 }
542
543 EXCEPTION(EX_ZeroDiv);
544 528
545 return (!(control_word & CW_ZeroDiv) ? FPU_Exception : 0) | tag; 529 return (!(control_word & CW_ZeroDiv) ? FPU_Exception : 0) | tag;
546 530
547} 531}
548 532
549
550/* This may be called often, so keep it lean */ 533/* This may be called often, so keep it lean */
551int set_precision_flag(int flags) 534int set_precision_flag(int flags)
552{ 535{
553 if ( control_word & CW_Precision ) 536 if (control_word & CW_Precision) {
554 { 537 partial_status &= ~(SW_C1 & flags);
555 partial_status &= ~(SW_C1 & flags); 538 partial_status |= flags; /* The masked response */
556 partial_status |= flags; /* The masked response */ 539 return 0;
557 return 0; 540 } else {
558 } 541 EXCEPTION(flags);
559 else 542 return 1;
560 { 543 }
561 EXCEPTION(flags);
562 return 1;
563 }
564} 544}
565 545
566
567/* This may be called often, so keep it lean */ 546/* This may be called often, so keep it lean */
568asmlinkage void set_precision_flag_up(void) 547asmlinkage void set_precision_flag_up(void)
569{ 548{
570 if ( control_word & CW_Precision ) 549 if (control_word & CW_Precision)
571 partial_status |= (SW_Precision | SW_C1); /* The masked response */ 550 partial_status |= (SW_Precision | SW_C1); /* The masked response */
572 else 551 else
573 EXCEPTION(EX_Precision | SW_C1); 552 EXCEPTION(EX_Precision | SW_C1);
574} 553}
575 554
576
577/* This may be called often, so keep it lean */ 555/* This may be called often, so keep it lean */
578asmlinkage void set_precision_flag_down(void) 556asmlinkage void set_precision_flag_down(void)
579{ 557{
580 if ( control_word & CW_Precision ) 558 if (control_word & CW_Precision) { /* The masked response */
581 { /* The masked response */ 559 partial_status &= ~SW_C1;
582 partial_status &= ~SW_C1; 560 partial_status |= SW_Precision;
583 partial_status |= SW_Precision; 561 } else
584 } 562 EXCEPTION(EX_Precision);
585 else
586 EXCEPTION(EX_Precision);
587} 563}
588 564
589
590asmlinkage int denormal_operand(void) 565asmlinkage int denormal_operand(void)
591{ 566{
592 if ( control_word & CW_Denormal ) 567 if (control_word & CW_Denormal) { /* The masked response */
593 { /* The masked response */ 568 partial_status |= SW_Denorm_Op;
594 partial_status |= SW_Denorm_Op; 569 return TAG_Special;
595 return TAG_Special; 570 } else {
596 } 571 EXCEPTION(EX_Denormal);
597 else 572 return TAG_Special | FPU_Exception;
598 { 573 }
599 EXCEPTION(EX_Denormal);
600 return TAG_Special | FPU_Exception;
601 }
602} 574}
603 575
604
605asmlinkage int arith_overflow(FPU_REG *dest) 576asmlinkage int arith_overflow(FPU_REG *dest)
606{ 577{
607 int tag = TAG_Valid; 578 int tag = TAG_Valid;
608 579
609 if ( control_word & CW_Overflow ) 580 if (control_word & CW_Overflow) {
610 { 581 /* The masked response */
611 /* The masked response */
612/* ###### The response here depends upon the rounding mode */ 582/* ###### The response here depends upon the rounding mode */
613 reg_copy(&CONST_INF, dest); 583 reg_copy(&CONST_INF, dest);
614 tag = TAG_Special; 584 tag = TAG_Special;
615 } 585 } else {
616 else 586 /* Subtract the magic number from the exponent */
617 { 587 addexponent(dest, (-3 * (1 << 13)));
618 /* Subtract the magic number from the exponent */ 588 }
619 addexponent(dest, (-3 * (1 << 13)));
620 }
621
622 EXCEPTION(EX_Overflow);
623 if ( control_word & CW_Overflow )
624 {
625 /* The overflow exception is masked. */
626 /* By definition, precision is lost.
627 The roundup bit (C1) is also set because we have
628 "rounded" upwards to Infinity. */
629 EXCEPTION(EX_Precision | SW_C1);
630 return tag;
631 }
632
633 return tag;
634 589
635} 590 EXCEPTION(EX_Overflow);
591 if (control_word & CW_Overflow) {
592 /* The overflow exception is masked. */
593 /* By definition, precision is lost.
594 The roundup bit (C1) is also set because we have
595 "rounded" upwards to Infinity. */
596 EXCEPTION(EX_Precision | SW_C1);
597 return tag;
598 }
599
600 return tag;
636 601
602}
637 603
638asmlinkage int arith_underflow(FPU_REG *dest) 604asmlinkage int arith_underflow(FPU_REG *dest)
639{ 605{
640 int tag = TAG_Valid; 606 int tag = TAG_Valid;
641 607
642 if ( control_word & CW_Underflow ) 608 if (control_word & CW_Underflow) {
643 { 609 /* The masked response */
644 /* The masked response */ 610 if (exponent16(dest) <= EXP_UNDER - 63) {
645 if ( exponent16(dest) <= EXP_UNDER - 63 ) 611 reg_copy(&CONST_Z, dest);
646 { 612 partial_status &= ~SW_C1; /* Round down. */
647 reg_copy(&CONST_Z, dest); 613 tag = TAG_Zero;
648 partial_status &= ~SW_C1; /* Round down. */ 614 } else {
649 tag = TAG_Zero; 615 stdexp(dest);
616 }
617 } else {
618 /* Add the magic number to the exponent. */
619 addexponent(dest, (3 * (1 << 13)) + EXTENDED_Ebias);
650 } 620 }
651 else 621
652 { 622 EXCEPTION(EX_Underflow);
653 stdexp(dest); 623 if (control_word & CW_Underflow) {
624 /* The underflow exception is masked. */
625 EXCEPTION(EX_Precision);
626 return tag;
654 } 627 }
655 }
656 else
657 {
658 /* Add the magic number to the exponent. */
659 addexponent(dest, (3 * (1 << 13)) + EXTENDED_Ebias);
660 }
661
662 EXCEPTION(EX_Underflow);
663 if ( control_word & CW_Underflow )
664 {
665 /* The underflow exception is masked. */
666 EXCEPTION(EX_Precision);
667 return tag;
668 }
669
670 return tag;
671 628
672} 629 return tag;
673 630
631}
674 632
675void FPU_stack_overflow(void) 633void FPU_stack_overflow(void)
676{ 634{
677 635
678 if ( control_word & CW_Invalid ) 636 if (control_word & CW_Invalid) {
679 { 637 /* The masked response */
680 /* The masked response */ 638 top--;
681 top--; 639 FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
682 FPU_copy_to_reg0(&CONST_QNaN, TAG_Special); 640 }
683 }
684 641
685 EXCEPTION(EX_StackOver); 642 EXCEPTION(EX_StackOver);
686 643
687 return; 644 return;
688 645
689} 646}
690 647
691
692void FPU_stack_underflow(void) 648void FPU_stack_underflow(void)
693{ 649{
694 650
695 if ( control_word & CW_Invalid ) 651 if (control_word & CW_Invalid) {
696 { 652 /* The masked response */
697 /* The masked response */ 653 FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
698 FPU_copy_to_reg0(&CONST_QNaN, TAG_Special); 654 }
699 }
700 655
701 EXCEPTION(EX_StackUnder); 656 EXCEPTION(EX_StackUnder);
702 657
703 return; 658 return;
704 659
705} 660}
706 661
707
708void FPU_stack_underflow_i(int i) 662void FPU_stack_underflow_i(int i)
709{ 663{
710 664
711 if ( control_word & CW_Invalid ) 665 if (control_word & CW_Invalid) {
712 { 666 /* The masked response */
713 /* The masked response */ 667 FPU_copy_to_regi(&CONST_QNaN, TAG_Special, i);
714 FPU_copy_to_regi(&CONST_QNaN, TAG_Special, i); 668 }
715 }
716 669
717 EXCEPTION(EX_StackUnder); 670 EXCEPTION(EX_StackUnder);
718 671
719 return; 672 return;
720 673
721} 674}
722 675
723
724void FPU_stack_underflow_pop(int i) 676void FPU_stack_underflow_pop(int i)
725{ 677{
726 678
727 if ( control_word & CW_Invalid ) 679 if (control_word & CW_Invalid) {
728 { 680 /* The masked response */
729 /* The masked response */ 681 FPU_copy_to_regi(&CONST_QNaN, TAG_Special, i);
730 FPU_copy_to_regi(&CONST_QNaN, TAG_Special, i); 682 FPU_pop();
731 FPU_pop(); 683 }
732 }
733 684
734 EXCEPTION(EX_StackUnder); 685 EXCEPTION(EX_StackUnder);
735 686
736 return; 687 return;
737 688
738} 689}
739
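The exception helpers above all follow the x87 "masked response" convention: each exception class has a mask bit in the control word, and when that bit is set the emulator quietly records the condition in the status word instead of delivering the exception to the program. Below is a minimal standalone sketch of that decision for the precision case; the bit values match the ones the emulator uses (CW_Precision == SW_Precision == 0x0020, SW_C1 == 0x0200, and the finit() default control word 0x037f that appears later in this patch), while the simplified globals, the printf fallback for the unmasked path, and main() are assumptions made only for this example.

/* Sketch only: masked vs. unmasked response, modelled after
 * set_precision_flag_up() above. */
#include <stdio.h>

#define CW_Precision 0x0020	/* precision exception mask bit */
#define SW_Precision 0x0020	/* precision flag in the status word */
#define SW_C1        0x0200	/* C1: result was rounded up */

static unsigned short control_word = 0x037f;	/* finit(): all exceptions masked */
static unsigned short partial_status;

static void set_precision_flag_up_model(void)
{
	if (control_word & CW_Precision)
		/* masked: just record "precision lost, rounded up" */
		partial_status |= SW_Precision | SW_C1;
	else
		/* unmasked: the real emulator raises the exception here */
		printf("precision exception delivered to the program\n");
}

int main(void)
{
	set_precision_flag_up_model();
	printf("status word bits: %#x\n", partial_status);
	return 0;
}

With the default control word every bit is masked, so the only visible effect is the status-word update, which is exactly why the masked branches above never call EXCEPTION().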
diff --git a/arch/x86/math-emu/exception.h b/arch/x86/math-emu/exception.h
index b463f21a811e..67f43a4683d5 100644
--- a/arch/x86/math-emu/exception.h
+++ b/arch/x86/math-emu/exception.h
@@ -9,7 +9,6 @@
9#ifndef _EXCEPTION_H_ 9#ifndef _EXCEPTION_H_
10#define _EXCEPTION_H_ 10#define _EXCEPTION_H_
11 11
12
13#ifdef __ASSEMBLY__ 12#ifdef __ASSEMBLY__
14#define Const_(x) $##x 13#define Const_(x) $##x
15#else 14#else
@@ -20,8 +19,8 @@
20#include "fpu_emu.h" 19#include "fpu_emu.h"
21#endif /* SW_C1 */ 20#endif /* SW_C1 */
22 21
23#define FPU_BUSY Const_(0x8000) /* FPU busy bit (8087 compatibility) */ 22#define FPU_BUSY Const_(0x8000) /* FPU busy bit (8087 compatibility) */
24#define EX_ErrorSummary Const_(0x0080) /* Error summary status */ 23#define EX_ErrorSummary Const_(0x0080) /* Error summary status */
25/* Special exceptions: */ 24/* Special exceptions: */
26#define EX_INTERNAL Const_(0x8000) /* Internal error in wm-FPU-emu */ 25#define EX_INTERNAL Const_(0x8000) /* Internal error in wm-FPU-emu */
27#define EX_StackOver Const_(0x0041|SW_C1) /* stack overflow */ 26#define EX_StackOver Const_(0x0041|SW_C1) /* stack overflow */
@@ -34,11 +33,9 @@
34#define EX_Denormal Const_(0x0002) /* denormalized operand */ 33#define EX_Denormal Const_(0x0002) /* denormalized operand */
35#define EX_Invalid Const_(0x0001) /* invalid operation */ 34#define EX_Invalid Const_(0x0001) /* invalid operation */
36 35
37
38#define PRECISION_LOST_UP Const_((EX_Precision | SW_C1)) 36#define PRECISION_LOST_UP Const_((EX_Precision | SW_C1))
39#define PRECISION_LOST_DOWN Const_(EX_Precision) 37#define PRECISION_LOST_DOWN Const_(EX_Precision)
40 38
41
42#ifndef __ASSEMBLY__ 39#ifndef __ASSEMBLY__
43 40
44#ifdef DEBUG 41#ifdef DEBUG
@@ -48,6 +45,6 @@
48#define EXCEPTION(x) FPU_exception(x) 45#define EXCEPTION(x) FPU_exception(x)
49#endif 46#endif
50 47
51#endif /* __ASSEMBLY__ */ 48#endif /* __ASSEMBLY__ */
52 49
53#endif /* _EXCEPTION_H_ */ 50#endif /* _EXCEPTION_H_ */
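exception.h is included from both C and assembler sources, which is why every constant in the hunk above goes through Const_(): under __ASSEMBLY__ the value is pasted onto a leading "$" so it can be used directly as an immediate operand, while C code sees the bare number. The fragment below is a hedged sketch of that same dual-use pattern; only the FPU_BUSY value is taken from the header above, the printf demonstration is illustrative.

/* Sketch of the Const_() dual-use pattern from exception.h. */
#ifdef __ASSEMBLY__
#define Const_(x) $##x		/* expands to "$0x8000" for .S files */
#else
#define Const_(x) x		/* plain integer constant for C files */
#endif

#define FPU_BUSY	Const_(0x8000)	/* same value as in the header above */

#ifndef __ASSEMBLY__
#include <stdio.h>
int main(void)
{
	/* Compiled as C, the macro expands to the bare constant. */
	printf("FPU_BUSY = %#x\n", FPU_BUSY);
	return 0;
}
#endif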
diff --git a/arch/x86/math-emu/fpu_arith.c b/arch/x86/math-emu/fpu_arith.c
index 6972dec01af6..aeab24e083c4 100644
--- a/arch/x86/math-emu/fpu_arith.c
+++ b/arch/x86/math-emu/fpu_arith.c
@@ -15,160 +15,138 @@
15#include "control_w.h" 15#include "control_w.h"
16#include "status_w.h" 16#include "status_w.h"
17 17
18
19void fadd__(void) 18void fadd__(void)
20{ 19{
21 /* fadd st,st(i) */ 20 /* fadd st,st(i) */
22 int i = FPU_rm; 21 int i = FPU_rm;
23 clear_C1(); 22 clear_C1();
24 FPU_add(&st(i), FPU_gettagi(i), 0, control_word); 23 FPU_add(&st(i), FPU_gettagi(i), 0, control_word);
25} 24}
26 25
27
28void fmul__(void) 26void fmul__(void)
29{ 27{
30 /* fmul st,st(i) */ 28 /* fmul st,st(i) */
31 int i = FPU_rm; 29 int i = FPU_rm;
32 clear_C1(); 30 clear_C1();
33 FPU_mul(&st(i), FPU_gettagi(i), 0, control_word); 31 FPU_mul(&st(i), FPU_gettagi(i), 0, control_word);
34} 32}
35 33
36
37
38void fsub__(void) 34void fsub__(void)
39{ 35{
40 /* fsub st,st(i) */ 36 /* fsub st,st(i) */
41 clear_C1(); 37 clear_C1();
42 FPU_sub(0, FPU_rm, control_word); 38 FPU_sub(0, FPU_rm, control_word);
43} 39}
44 40
45
46void fsubr_(void) 41void fsubr_(void)
47{ 42{
48 /* fsubr st,st(i) */ 43 /* fsubr st,st(i) */
49 clear_C1(); 44 clear_C1();
50 FPU_sub(REV, FPU_rm, control_word); 45 FPU_sub(REV, FPU_rm, control_word);
51} 46}
52 47
53
54void fdiv__(void) 48void fdiv__(void)
55{ 49{
56 /* fdiv st,st(i) */ 50 /* fdiv st,st(i) */
57 clear_C1(); 51 clear_C1();
58 FPU_div(0, FPU_rm, control_word); 52 FPU_div(0, FPU_rm, control_word);
59} 53}
60 54
61
62void fdivr_(void) 55void fdivr_(void)
63{ 56{
64 /* fdivr st,st(i) */ 57 /* fdivr st,st(i) */
65 clear_C1(); 58 clear_C1();
66 FPU_div(REV, FPU_rm, control_word); 59 FPU_div(REV, FPU_rm, control_word);
67} 60}
68 61
69
70
71void fadd_i(void) 62void fadd_i(void)
72{ 63{
73 /* fadd st(i),st */ 64 /* fadd st(i),st */
74 int i = FPU_rm; 65 int i = FPU_rm;
75 clear_C1(); 66 clear_C1();
76 FPU_add(&st(i), FPU_gettagi(i), i, control_word); 67 FPU_add(&st(i), FPU_gettagi(i), i, control_word);
77} 68}
78 69
79
80void fmul_i(void) 70void fmul_i(void)
81{ 71{
82 /* fmul st(i),st */ 72 /* fmul st(i),st */
83 clear_C1(); 73 clear_C1();
84 FPU_mul(&st(0), FPU_gettag0(), FPU_rm, control_word); 74 FPU_mul(&st(0), FPU_gettag0(), FPU_rm, control_word);
85} 75}
86 76
87
88void fsubri(void) 77void fsubri(void)
89{ 78{
90 /* fsubr st(i),st */ 79 /* fsubr st(i),st */
91 clear_C1(); 80 clear_C1();
92 FPU_sub(DEST_RM, FPU_rm, control_word); 81 FPU_sub(DEST_RM, FPU_rm, control_word);
93} 82}
94 83
95
96void fsub_i(void) 84void fsub_i(void)
97{ 85{
98 /* fsub st(i),st */ 86 /* fsub st(i),st */
99 clear_C1(); 87 clear_C1();
100 FPU_sub(REV|DEST_RM, FPU_rm, control_word); 88 FPU_sub(REV | DEST_RM, FPU_rm, control_word);
101} 89}
102 90
103
104void fdivri(void) 91void fdivri(void)
105{ 92{
106 /* fdivr st(i),st */ 93 /* fdivr st(i),st */
107 clear_C1(); 94 clear_C1();
108 FPU_div(DEST_RM, FPU_rm, control_word); 95 FPU_div(DEST_RM, FPU_rm, control_word);
109} 96}
110 97
111
112void fdiv_i(void) 98void fdiv_i(void)
113{ 99{
114 /* fdiv st(i),st */ 100 /* fdiv st(i),st */
115 clear_C1(); 101 clear_C1();
116 FPU_div(REV|DEST_RM, FPU_rm, control_word); 102 FPU_div(REV | DEST_RM, FPU_rm, control_word);
117} 103}
118 104
119
120
121void faddp_(void) 105void faddp_(void)
122{ 106{
123 /* faddp st(i),st */ 107 /* faddp st(i),st */
124 int i = FPU_rm; 108 int i = FPU_rm;
125 clear_C1(); 109 clear_C1();
126 if ( FPU_add(&st(i), FPU_gettagi(i), i, control_word) >= 0 ) 110 if (FPU_add(&st(i), FPU_gettagi(i), i, control_word) >= 0)
127 FPU_pop(); 111 FPU_pop();
128} 112}
129 113
130
131void fmulp_(void) 114void fmulp_(void)
132{ 115{
133 /* fmulp st(i),st */ 116 /* fmulp st(i),st */
134 clear_C1(); 117 clear_C1();
135 if ( FPU_mul(&st(0), FPU_gettag0(), FPU_rm, control_word) >= 0 ) 118 if (FPU_mul(&st(0), FPU_gettag0(), FPU_rm, control_word) >= 0)
136 FPU_pop(); 119 FPU_pop();
137} 120}
138 121
139
140
141void fsubrp(void) 122void fsubrp(void)
142{ 123{
143 /* fsubrp st(i),st */ 124 /* fsubrp st(i),st */
144 clear_C1(); 125 clear_C1();
145 if ( FPU_sub(DEST_RM, FPU_rm, control_word) >= 0 ) 126 if (FPU_sub(DEST_RM, FPU_rm, control_word) >= 0)
146 FPU_pop(); 127 FPU_pop();
147} 128}
148 129
149
150void fsubp_(void) 130void fsubp_(void)
151{ 131{
152 /* fsubp st(i),st */ 132 /* fsubp st(i),st */
153 clear_C1(); 133 clear_C1();
154 if ( FPU_sub(REV|DEST_RM, FPU_rm, control_word) >= 0 ) 134 if (FPU_sub(REV | DEST_RM, FPU_rm, control_word) >= 0)
155 FPU_pop(); 135 FPU_pop();
156} 136}
157 137
158
159void fdivrp(void) 138void fdivrp(void)
160{ 139{
161 /* fdivrp st(i),st */ 140 /* fdivrp st(i),st */
162 clear_C1(); 141 clear_C1();
163 if ( FPU_div(DEST_RM, FPU_rm, control_word) >= 0 ) 142 if (FPU_div(DEST_RM, FPU_rm, control_word) >= 0)
164 FPU_pop(); 143 FPU_pop();
165} 144}
166 145
167
168void fdivp_(void) 146void fdivp_(void)
169{ 147{
170 /* fdivp st(i),st */ 148 /* fdivp st(i),st */
171 clear_C1(); 149 clear_C1();
172 if ( FPU_div(REV|DEST_RM, FPU_rm, control_word) >= 0 ) 150 if (FPU_div(REV | DEST_RM, FPU_rm, control_word) >= 0)
173 FPU_pop(); 151 FPU_pop();
174} 152}
diff --git a/arch/x86/math-emu/fpu_asm.h b/arch/x86/math-emu/fpu_asm.h
index 9ba12416df12..955b932735a4 100644
--- a/arch/x86/math-emu/fpu_asm.h
+++ b/arch/x86/math-emu/fpu_asm.h
@@ -14,7 +14,6 @@
14 14
15#define EXCEPTION FPU_exception 15#define EXCEPTION FPU_exception
16 16
17
18#define PARAM1 8(%ebp) 17#define PARAM1 8(%ebp)
19#define PARAM2 12(%ebp) 18#define PARAM2 12(%ebp)
20#define PARAM3 16(%ebp) 19#define PARAM3 16(%ebp)
diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c
index 20886cfb9f76..491e737ce547 100644
--- a/arch/x86/math-emu/fpu_aux.c
+++ b/arch/x86/math-emu/fpu_aux.c
@@ -16,34 +16,34 @@
16#include "status_w.h" 16#include "status_w.h"
17#include "control_w.h" 17#include "control_w.h"
18 18
19
20static void fnop(void) 19static void fnop(void)
21{ 20{
22} 21}
23 22
24static void fclex(void) 23static void fclex(void)
25{ 24{
26 partial_status &= ~(SW_Backward|SW_Summary|SW_Stack_Fault|SW_Precision| 25 partial_status &=
27 SW_Underflow|SW_Overflow|SW_Zero_Div|SW_Denorm_Op| 26 ~(SW_Backward | SW_Summary | SW_Stack_Fault | SW_Precision |
28 SW_Invalid); 27 SW_Underflow | SW_Overflow | SW_Zero_Div | SW_Denorm_Op |
29 no_ip_update = 1; 28 SW_Invalid);
29 no_ip_update = 1;
30} 30}
31 31
32/* Needs to be externally visible */ 32/* Needs to be externally visible */
33void finit(void) 33void finit(void)
34{ 34{
35 control_word = 0x037f; 35 control_word = 0x037f;
36 partial_status = 0; 36 partial_status = 0;
37 top = 0; /* We don't keep top in the status word internally. */ 37 top = 0; /* We don't keep top in the status word internally. */
38 fpu_tag_word = 0xffff; 38 fpu_tag_word = 0xffff;
39 /* The behaviour is different from that detailed in 39 /* The behaviour is different from that detailed in
40 Section 15.1.6 of the Intel manual */ 40 Section 15.1.6 of the Intel manual */
41 operand_address.offset = 0; 41 operand_address.offset = 0;
42 operand_address.selector = 0; 42 operand_address.selector = 0;
43 instruction_address.offset = 0; 43 instruction_address.offset = 0;
44 instruction_address.selector = 0; 44 instruction_address.selector = 0;
45 instruction_address.opcode = 0; 45 instruction_address.opcode = 0;
46 no_ip_update = 1; 46 no_ip_update = 1;
47} 47}
48 48
49/* 49/*
@@ -54,151 +54,134 @@ void finit(void)
54#define fsetpm fnop 54#define fsetpm fnop
55 55
56static FUNC const finit_table[] = { 56static FUNC const finit_table[] = {
57 feni, fdisi, fclex, finit, 57 feni, fdisi, fclex, finit,
58 fsetpm, FPU_illegal, FPU_illegal, FPU_illegal 58 fsetpm, FPU_illegal, FPU_illegal, FPU_illegal
59}; 59};
60 60
61void finit_(void) 61void finit_(void)
62{ 62{
63 (finit_table[FPU_rm])(); 63 (finit_table[FPU_rm]) ();
64} 64}
65 65
66
67static void fstsw_ax(void) 66static void fstsw_ax(void)
68{ 67{
69 *(short *) &FPU_EAX = status_word(); 68 *(short *)&FPU_EAX = status_word();
70 no_ip_update = 1; 69 no_ip_update = 1;
71} 70}
72 71
73static FUNC const fstsw_table[] = { 72static FUNC const fstsw_table[] = {
74 fstsw_ax, FPU_illegal, FPU_illegal, FPU_illegal, 73 fstsw_ax, FPU_illegal, FPU_illegal, FPU_illegal,
75 FPU_illegal, FPU_illegal, FPU_illegal, FPU_illegal 74 FPU_illegal, FPU_illegal, FPU_illegal, FPU_illegal
76}; 75};
77 76
78void fstsw_(void) 77void fstsw_(void)
79{ 78{
80 (fstsw_table[FPU_rm])(); 79 (fstsw_table[FPU_rm]) ();
81} 80}
82 81
83
84static FUNC const fp_nop_table[] = { 82static FUNC const fp_nop_table[] = {
85 fnop, FPU_illegal, FPU_illegal, FPU_illegal, 83 fnop, FPU_illegal, FPU_illegal, FPU_illegal,
86 FPU_illegal, FPU_illegal, FPU_illegal, FPU_illegal 84 FPU_illegal, FPU_illegal, FPU_illegal, FPU_illegal
87}; 85};
88 86
89void fp_nop(void) 87void fp_nop(void)
90{ 88{
91 (fp_nop_table[FPU_rm])(); 89 (fp_nop_table[FPU_rm]) ();
92} 90}
93 91
94
95void fld_i_(void) 92void fld_i_(void)
96{ 93{
97 FPU_REG *st_new_ptr; 94 FPU_REG *st_new_ptr;
98 int i; 95 int i;
99 u_char tag; 96 u_char tag;
100 97
101 if ( STACK_OVERFLOW ) 98 if (STACK_OVERFLOW) {
102 { FPU_stack_overflow(); return; } 99 FPU_stack_overflow();
103 100 return;
104 /* fld st(i) */
105 i = FPU_rm;
106 if ( NOT_EMPTY(i) )
107 {
108 reg_copy(&st(i), st_new_ptr);
109 tag = FPU_gettagi(i);
110 push();
111 FPU_settag0(tag);
112 }
113 else
114 {
115 if ( control_word & CW_Invalid )
116 {
117 /* The masked response */
118 FPU_stack_underflow();
119 } 101 }
120 else
121 EXCEPTION(EX_StackUnder);
122 }
123 102
124} 103 /* fld st(i) */
104 i = FPU_rm;
105 if (NOT_EMPTY(i)) {
106 reg_copy(&st(i), st_new_ptr);
107 tag = FPU_gettagi(i);
108 push();
109 FPU_settag0(tag);
110 } else {
111 if (control_word & CW_Invalid) {
112 /* The masked response */
113 FPU_stack_underflow();
114 } else
115 EXCEPTION(EX_StackUnder);
116 }
125 117
118}
126 119
127void fxch_i(void) 120void fxch_i(void)
128{ 121{
129 /* fxch st(i) */ 122 /* fxch st(i) */
130 FPU_REG t; 123 FPU_REG t;
131 int i = FPU_rm; 124 int i = FPU_rm;
132 FPU_REG *st0_ptr = &st(0), *sti_ptr = &st(i); 125 FPU_REG *st0_ptr = &st(0), *sti_ptr = &st(i);
133 long tag_word = fpu_tag_word; 126 long tag_word = fpu_tag_word;
134 int regnr = top & 7, regnri = ((regnr + i) & 7); 127 int regnr = top & 7, regnri = ((regnr + i) & 7);
135 u_char st0_tag = (tag_word >> (regnr*2)) & 3; 128 u_char st0_tag = (tag_word >> (regnr * 2)) & 3;
136 u_char sti_tag = (tag_word >> (regnri*2)) & 3; 129 u_char sti_tag = (tag_word >> (regnri * 2)) & 3;
137 130
138 if ( st0_tag == TAG_Empty ) 131 if (st0_tag == TAG_Empty) {
139 { 132 if (sti_tag == TAG_Empty) {
140 if ( sti_tag == TAG_Empty ) 133 FPU_stack_underflow();
141 { 134 FPU_stack_underflow_i(i);
142 FPU_stack_underflow(); 135 return;
143 FPU_stack_underflow_i(i); 136 }
144 return; 137 if (control_word & CW_Invalid) {
138 /* Masked response */
139 FPU_copy_to_reg0(sti_ptr, sti_tag);
140 }
141 FPU_stack_underflow_i(i);
142 return;
145 } 143 }
146 if ( control_word & CW_Invalid ) 144 if (sti_tag == TAG_Empty) {
147 { 145 if (control_word & CW_Invalid) {
148 /* Masked response */ 146 /* Masked response */
149 FPU_copy_to_reg0(sti_ptr, sti_tag); 147 FPU_copy_to_regi(st0_ptr, st0_tag, i);
148 }
149 FPU_stack_underflow();
150 return;
150 } 151 }
151 FPU_stack_underflow_i(i); 152 clear_C1();
152 return;
153 }
154 if ( sti_tag == TAG_Empty )
155 {
156 if ( control_word & CW_Invalid )
157 {
158 /* Masked response */
159 FPU_copy_to_regi(st0_ptr, st0_tag, i);
160 }
161 FPU_stack_underflow();
162 return;
163 }
164 clear_C1();
165
166 reg_copy(st0_ptr, &t);
167 reg_copy(sti_ptr, st0_ptr);
168 reg_copy(&t, sti_ptr);
169
170 tag_word &= ~(3 << (regnr*2)) & ~(3 << (regnri*2));
171 tag_word |= (sti_tag << (regnr*2)) | (st0_tag << (regnri*2));
172 fpu_tag_word = tag_word;
173}
174 153
154 reg_copy(st0_ptr, &t);
155 reg_copy(sti_ptr, st0_ptr);
156 reg_copy(&t, sti_ptr);
157
158 tag_word &= ~(3 << (regnr * 2)) & ~(3 << (regnri * 2));
159 tag_word |= (sti_tag << (regnr * 2)) | (st0_tag << (regnri * 2));
160 fpu_tag_word = tag_word;
161}
175 162
176void ffree_(void) 163void ffree_(void)
177{ 164{
178 /* ffree st(i) */ 165 /* ffree st(i) */
179 FPU_settagi(FPU_rm, TAG_Empty); 166 FPU_settagi(FPU_rm, TAG_Empty);
180} 167}
181 168
182
183void ffreep(void) 169void ffreep(void)
184{ 170{
185 /* ffree st(i) + pop - unofficial code */ 171 /* ffree st(i) + pop - unofficial code */
186 FPU_settagi(FPU_rm, TAG_Empty); 172 FPU_settagi(FPU_rm, TAG_Empty);
187 FPU_pop(); 173 FPU_pop();
188} 174}
189 175
190
191void fst_i_(void) 176void fst_i_(void)
192{ 177{
193 /* fst st(i) */ 178 /* fst st(i) */
194 FPU_copy_to_regi(&st(0), FPU_gettag0(), FPU_rm); 179 FPU_copy_to_regi(&st(0), FPU_gettag0(), FPU_rm);
195} 180}
196 181
197
198void fstp_i(void) 182void fstp_i(void)
199{ 183{
200 /* fstp st(i) */ 184 /* fstp st(i) */
201 FPU_copy_to_regi(&st(0), FPU_gettag0(), FPU_rm); 185 FPU_copy_to_regi(&st(0), FPU_gettag0(), FPU_rm);
202 FPU_pop(); 186 FPU_pop();
203} 187}
204
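fxch_i() above works almost entirely on the packed tag word: each of the eight physical registers owns two tag bits, st(i) maps to physical register (top + i) & 7, and exchanging two stack slots means swapping both the register contents and their 2-bit tags. The following is a small model of just the tag-word half; the TAG_Valid/TAG_Empty values (0 and 3) and the finit() starting value 0xffff come from this patch, while the helper names and main() are illustrative.

/* Tag-word model of the swap done at the end of fxch_i() above. */
#include <stdio.h>

#define TAG_Valid 0
#define TAG_Empty 3

static unsigned long fpu_tag_word = 0xffff;	/* finit(): every register empty */
static int top;					/* st(0) lives in physical register `top` */

static unsigned get_tag(int i)
{
	int regnr = (top + i) & 7;
	return (fpu_tag_word >> (regnr * 2)) & 3;
}

static void set_tag(int i, unsigned tag)
{
	int regnr = (top + i) & 7;
	fpu_tag_word &= ~(3UL << (regnr * 2));
	fpu_tag_word |= (unsigned long)tag << (regnr * 2);
}

static void fxch_tags(int i)
{
	unsigned t0 = get_tag(0), ti = get_tag(i);
	set_tag(0, ti);
	set_tag(i, t0);
}

int main(void)
{
	set_tag(0, TAG_Valid);	/* pretend a value was pushed to st(0) */
	fxch_tags(2);		/* fxch st(2) */
	printf("st(0) tag=%u st(2) tag=%u\n", get_tag(0), get_tag(2));
	return 0;
}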
diff --git a/arch/x86/math-emu/fpu_emu.h b/arch/x86/math-emu/fpu_emu.h
index 65120f523853..4dae511c85ad 100644
--- a/arch/x86/math-emu/fpu_emu.h
+++ b/arch/x86/math-emu/fpu_emu.h
@@ -7,7 +7,6 @@
7 | | 7 | |
8 +---------------------------------------------------------------------------*/ 8 +---------------------------------------------------------------------------*/
9 9
10
11#ifndef _FPU_EMU_H_ 10#ifndef _FPU_EMU_H_
12#define _FPU_EMU_H_ 11#define _FPU_EMU_H_
13 12
@@ -28,15 +27,15 @@
28#endif 27#endif
29 28
30#define EXP_BIAS Const(0) 29#define EXP_BIAS Const(0)
31#define EXP_OVER Const(0x4000) /* smallest invalid large exponent */ 30#define EXP_OVER Const(0x4000) /* smallest invalid large exponent */
32#define EXP_UNDER Const(-0x3fff) /* largest invalid small exponent */ 31#define EXP_UNDER Const(-0x3fff) /* largest invalid small exponent */
33#define EXP_WAY_UNDER Const(-0x6000) /* Below the smallest denormal, but 32#define EXP_WAY_UNDER Const(-0x6000) /* Below the smallest denormal, but
34 still a 16 bit nr. */ 33 still a 16 bit nr. */
35#define EXP_Infinity EXP_OVER 34#define EXP_Infinity EXP_OVER
36#define EXP_NaN EXP_OVER 35#define EXP_NaN EXP_OVER
37 36
38#define EXTENDED_Ebias Const(0x3fff) 37#define EXTENDED_Ebias Const(0x3fff)
39#define EXTENDED_Emin (-0x3ffe) /* smallest valid exponent */ 38#define EXTENDED_Emin (-0x3ffe) /* smallest valid exponent */
40 39
41#define SIGN_POS Const(0) 40#define SIGN_POS Const(0)
42#define SIGN_NEG Const(0x80) 41#define SIGN_NEG Const(0x80)
@@ -44,10 +43,9 @@
44#define SIGN_Positive Const(0) 43#define SIGN_Positive Const(0)
45#define SIGN_Negative Const(0x8000) 44#define SIGN_Negative Const(0x8000)
46 45
47
48/* Keep the order TAG_Valid, TAG_Zero, TW_Denormal */ 46/* Keep the order TAG_Valid, TAG_Zero, TW_Denormal */
49/* The following fold to 2 (Special) in the Tag Word */ 47/* The following fold to 2 (Special) in the Tag Word */
50#define TW_Denormal Const(4) /* De-normal */ 48#define TW_Denormal Const(4) /* De-normal */
51#define TW_Infinity Const(5) /* + or - infinity */ 49#define TW_Infinity Const(5) /* + or - infinity */
52#define TW_NaN Const(6) /* Not a Number */ 50#define TW_NaN Const(6) /* Not a Number */
53#define TW_Unsupported Const(7) /* Not supported by an 80486 */ 51#define TW_Unsupported Const(7) /* Not supported by an 80486 */
@@ -67,14 +65,13 @@
67#define DEST_RM 0x20 65#define DEST_RM 0x20
68#define LOADED 0x40 66#define LOADED 0x40
69 67
70#define FPU_Exception Const(0x80000000) /* Added to tag returns. */ 68#define FPU_Exception Const(0x80000000) /* Added to tag returns. */
71
72 69
73#ifndef __ASSEMBLY__ 70#ifndef __ASSEMBLY__
74 71
75#include "fpu_system.h" 72#include "fpu_system.h"
76 73
77#include <asm/sigcontext.h> /* for struct _fpstate */ 74#include <asm/sigcontext.h> /* for struct _fpstate */
78#include <asm/math_emu.h> 75#include <asm/math_emu.h>
79#include <linux/linkage.h> 76#include <linux/linkage.h>
80 77
@@ -112,30 +109,33 @@ extern u_char emulating;
112#define PREFIX_DEFAULT 7 109#define PREFIX_DEFAULT 7
113 110
114struct address { 111struct address {
115 unsigned int offset; 112 unsigned int offset;
116 unsigned int selector:16; 113 unsigned int selector:16;
117 unsigned int opcode:11; 114 unsigned int opcode:11;
118 unsigned int empty:5; 115 unsigned int empty:5;
119}; 116};
120struct fpu__reg { 117struct fpu__reg {
121 unsigned sigl; 118 unsigned sigl;
122 unsigned sigh; 119 unsigned sigh;
123 short exp; 120 short exp;
124}; 121};
125 122
126typedef void (*FUNC)(void); 123typedef void (*FUNC) (void);
127typedef struct fpu__reg FPU_REG; 124typedef struct fpu__reg FPU_REG;
128typedef void (*FUNC_ST0)(FPU_REG *st0_ptr, u_char st0_tag); 125typedef void (*FUNC_ST0) (FPU_REG *st0_ptr, u_char st0_tag);
129typedef struct { u_char address_size, operand_size, segment; } 126typedef struct {
130 overrides; 127 u_char address_size, operand_size, segment;
128} overrides;
131/* This structure is 32 bits: */ 129/* This structure is 32 bits: */
132typedef struct { overrides override; 130typedef struct {
133 u_char default_mode; } fpu_addr_modes; 131 overrides override;
132 u_char default_mode;
133} fpu_addr_modes;
134/* PROTECTED has a restricted meaning in the emulator; it is used 134/* PROTECTED has a restricted meaning in the emulator; it is used
135 to signal that the emulator needs to do special things to ensure 135 to signal that the emulator needs to do special things to ensure
136 that protection is respected in a segmented model. */ 136 that protection is respected in a segmented model. */
137#define PROTECTED 4 137#define PROTECTED 4
138#define SIXTEEN 1 /* We rely upon this being 1 (true) */ 138#define SIXTEEN 1 /* We rely upon this being 1 (true) */
139#define VM86 SIXTEEN 139#define VM86 SIXTEEN
140#define PM16 (SIXTEEN | PROTECTED) 140#define PM16 (SIXTEEN | PROTECTED)
141#define SEG32 PROTECTED 141#define SEG32 PROTECTED
@@ -168,8 +168,8 @@ extern u_char const data_sizes_16[32];
168 168
169static inline void reg_copy(FPU_REG const *x, FPU_REG *y) 169static inline void reg_copy(FPU_REG const *x, FPU_REG *y)
170{ 170{
171 *(short *)&(y->exp) = *(const short *)&(x->exp); 171 *(short *)&(y->exp) = *(const short *)&(x->exp);
172 *(long long *)&(y->sigl) = *(const long long *)&(x->sigl); 172 *(long long *)&(y->sigl) = *(const long long *)&(x->sigl);
173} 173}
174 174
175#define exponent(x) (((*(short *)&((x)->exp)) & 0x7fff) - EXTENDED_Ebias) 175#define exponent(x) (((*(short *)&((x)->exp)) & 0x7fff) - EXTENDED_Ebias)
@@ -184,27 +184,26 @@ static inline void reg_copy(FPU_REG const *x, FPU_REG *y)
184 184
185#define significand(x) ( ((unsigned long long *)&((x)->sigl))[0] ) 185#define significand(x) ( ((unsigned long long *)&((x)->sigl))[0] )
186 186
187
188/*----- Prototypes for functions written in assembler -----*/ 187/*----- Prototypes for functions written in assembler -----*/
189/* extern void reg_move(FPU_REG *a, FPU_REG *b); */ 188/* extern void reg_move(FPU_REG *a, FPU_REG *b); */
190 189
191asmlinkage int FPU_normalize(FPU_REG *x); 190asmlinkage int FPU_normalize(FPU_REG *x);
192asmlinkage int FPU_normalize_nuo(FPU_REG *x); 191asmlinkage int FPU_normalize_nuo(FPU_REG *x);
193asmlinkage int FPU_u_sub(FPU_REG const *arg1, FPU_REG const *arg2, 192asmlinkage int FPU_u_sub(FPU_REG const *arg1, FPU_REG const *arg2,
194 FPU_REG *answ, unsigned int control_w, u_char sign, 193 FPU_REG * answ, unsigned int control_w, u_char sign,
195 int expa, int expb); 194 int expa, int expb);
196asmlinkage int FPU_u_mul(FPU_REG const *arg1, FPU_REG const *arg2, 195asmlinkage int FPU_u_mul(FPU_REG const *arg1, FPU_REG const *arg2,
197 FPU_REG *answ, unsigned int control_w, u_char sign, 196 FPU_REG * answ, unsigned int control_w, u_char sign,
198 int expon); 197 int expon);
199asmlinkage int FPU_u_div(FPU_REG const *arg1, FPU_REG const *arg2, 198asmlinkage int FPU_u_div(FPU_REG const *arg1, FPU_REG const *arg2,
200 FPU_REG *answ, unsigned int control_w, u_char sign); 199 FPU_REG * answ, unsigned int control_w, u_char sign);
201asmlinkage int FPU_u_add(FPU_REG const *arg1, FPU_REG const *arg2, 200asmlinkage int FPU_u_add(FPU_REG const *arg1, FPU_REG const *arg2,
202 FPU_REG *answ, unsigned int control_w, u_char sign, 201 FPU_REG * answ, unsigned int control_w, u_char sign,
203 int expa, int expb); 202 int expa, int expb);
204asmlinkage int wm_sqrt(FPU_REG *n, int dummy1, int dummy2, 203asmlinkage int wm_sqrt(FPU_REG *n, int dummy1, int dummy2,
205 unsigned int control_w, u_char sign); 204 unsigned int control_w, u_char sign);
206asmlinkage unsigned FPU_shrx(void *l, unsigned x); 205asmlinkage unsigned FPU_shrx(void *l, unsigned x);
207asmlinkage unsigned FPU_shrxs(void *v, unsigned x); 206asmlinkage unsigned FPU_shrxs(void *v, unsigned x);
208asmlinkage unsigned long FPU_div_small(unsigned long long *x, unsigned long y); 207asmlinkage unsigned long FPU_div_small(unsigned long long *x, unsigned long y);
209asmlinkage int FPU_round(FPU_REG *arg, unsigned int extent, int dummy, 208asmlinkage int FPU_round(FPU_REG *arg, unsigned int extent, int dummy,
210 unsigned int control_w, u_char sign); 209 unsigned int control_w, u_char sign);
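fpu_emu.h pins down the internal register format: FPU_REG keeps a 64-bit explicit significand in sigl/sigh and a 16-bit exp field whose top bit is the sign (SIGN_Negative == 0x8000 above), with the exponent biased by EXTENDED_Ebias (0x3fff) as in the 80-bit extended format; the exponent() macro and the "magic number" 3*(1<<13) adjustments earlier in this patch all build on that layout. Below is a hedged decoding sketch; the struct mirrors fpu__reg, but the sample value, the decode() helper and the use of double/ldexp() are assumptions made purely to illustrate the layout.

/* Decoding sketch for the fpu__reg layout defined above. */
#include <stdio.h>
#include <math.h>

#define EXTENDED_Ebias 0x3fff

struct fpu_reg_model {
	unsigned int sigl;	/* low 32 bits of the significand */
	unsigned int sigh;	/* high 32 bits, leading 1 is explicit */
	short exp;		/* sign in bit 15, biased exponent below */
};

static double decode(const struct fpu_reg_model *r)
{
	int sign = (r->exp & 0x8000) ? -1 : 1;
	int exp = (r->exp & 0x7fff) - EXTENDED_Ebias;
	double sig = (double)r->sigh * 4294967296.0 + (double)r->sigl;

	/* the 64-bit significand represents a value in [1, 2) */
	return sign * ldexp(sig, exp - 63);
}

int main(void)
{
	/* 1.5: significand 0xC000000000000000, true exponent 0 */
	struct fpu_reg_model r = { 0x00000000, 0xc0000000, EXTENDED_Ebias };

	printf("decoded value: %g\n", decode(&r));	/* prints 1.5 */
	return 0;
}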
diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c
index 1853524c8b57..760baeea5f07 100644
--- a/arch/x86/math-emu/fpu_entry.c
+++ b/arch/x86/math-emu/fpu_entry.c
@@ -25,10 +25,11 @@
25 +---------------------------------------------------------------------------*/ 25 +---------------------------------------------------------------------------*/
26 26
27#include <linux/signal.h> 27#include <linux/signal.h>
28#include <linux/ptrace.h> 28#include <linux/regset.h>
29 29
30#include <asm/uaccess.h> 30#include <asm/uaccess.h>
31#include <asm/desc.h> 31#include <asm/desc.h>
32#include <asm/user.h>
32 33
33#include "fpu_system.h" 34#include "fpu_system.h"
34#include "fpu_emu.h" 35#include "fpu_emu.h"
@@ -36,726 +37,727 @@
36#include "control_w.h" 37#include "control_w.h"
37#include "status_w.h" 38#include "status_w.h"
38 39
39#define __BAD__ FPU_illegal /* Illegal on an 80486, causes SIGILL */ 40#define __BAD__ FPU_illegal /* Illegal on an 80486, causes SIGILL */
40 41
41#ifndef NO_UNDOC_CODE /* Un-documented FPU op-codes supported by default. */ 42#ifndef NO_UNDOC_CODE /* Un-documented FPU op-codes supported by default. */
42 43
43/* WARNING: These codes are not documented by Intel in their 80486 manual 44/* WARNING: These codes are not documented by Intel in their 80486 manual
44 and may not work on FPU clones or later Intel FPUs. */ 45 and may not work on FPU clones or later Intel FPUs. */
45 46
46/* Changes to support the un-doc codes provided by Linus Torvalds. */ 47/* Changes to support the un-doc codes provided by Linus Torvalds. */
47 48
48#define _d9_d8_ fstp_i /* unofficial code (19) */ 49#define _d9_d8_ fstp_i /* unofficial code (19) */
49#define _dc_d0_ fcom_st /* unofficial code (14) */ 50#define _dc_d0_ fcom_st /* unofficial code (14) */
50#define _dc_d8_ fcompst /* unofficial code (1c) */ 51#define _dc_d8_ fcompst /* unofficial code (1c) */
51#define _dd_c8_ fxch_i /* unofficial code (0d) */ 52#define _dd_c8_ fxch_i /* unofficial code (0d) */
52#define _de_d0_ fcompst /* unofficial code (16) */ 53#define _de_d0_ fcompst /* unofficial code (16) */
53#define _df_c0_ ffreep /* unofficial code (07) ffree + pop */ 54#define _df_c0_ ffreep /* unofficial code (07) ffree + pop */
54#define _df_c8_ fxch_i /* unofficial code (0f) */ 55#define _df_c8_ fxch_i /* unofficial code (0f) */
55#define _df_d0_ fstp_i /* unofficial code (17) */ 56#define _df_d0_ fstp_i /* unofficial code (17) */
56#define _df_d8_ fstp_i /* unofficial code (1f) */ 57#define _df_d8_ fstp_i /* unofficial code (1f) */
57 58
58static FUNC const st_instr_table[64] = { 59static FUNC const st_instr_table[64] = {
59 fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, _df_c0_, 60 fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, _df_c0_,
60 fmul__, fxch_i, __BAD__, __BAD__, fmul_i, _dd_c8_, fmulp_, _df_c8_, 61 fmul__, fxch_i, __BAD__, __BAD__, fmul_i, _dd_c8_, fmulp_, _df_c8_,
61 fcom_st, fp_nop, __BAD__, __BAD__, _dc_d0_, fst_i_, _de_d0_, _df_d0_, 62 fcom_st, fp_nop, __BAD__, __BAD__, _dc_d0_, fst_i_, _de_d0_, _df_d0_,
62 fcompst, _d9_d8_, __BAD__, __BAD__, _dc_d8_, fstp_i, fcompp, _df_d8_, 63 fcompst, _d9_d8_, __BAD__, __BAD__, _dc_d8_, fstp_i, fcompp, _df_d8_,
63 fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_, 64 fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_,
64 fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__, 65 fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__,
65 fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__, 66 fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__,
66 fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__, 67 fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__,
67}; 68};
68 69
69#else /* Support only documented FPU op-codes */ 70#else /* Support only documented FPU op-codes */
70 71
71static FUNC const st_instr_table[64] = { 72static FUNC const st_instr_table[64] = {
72 fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, __BAD__, 73 fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, __BAD__,
73 fmul__, fxch_i, __BAD__, __BAD__, fmul_i, __BAD__, fmulp_, __BAD__, 74 fmul__, fxch_i, __BAD__, __BAD__, fmul_i, __BAD__, fmulp_, __BAD__,
74 fcom_st, fp_nop, __BAD__, __BAD__, __BAD__, fst_i_, __BAD__, __BAD__, 75 fcom_st, fp_nop, __BAD__, __BAD__, __BAD__, fst_i_, __BAD__, __BAD__,
75 fcompst, __BAD__, __BAD__, __BAD__, __BAD__, fstp_i, fcompp, __BAD__, 76 fcompst, __BAD__, __BAD__, __BAD__, __BAD__, fstp_i, fcompp, __BAD__,
76 fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_, 77 fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_,
77 fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__, 78 fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__,
78 fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__, 79 fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__,
79 fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__, 80 fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__,
80}; 81};
81 82
82#endif /* NO_UNDOC_CODE */ 83#endif /* NO_UNDOC_CODE */
83 84
84 85#define _NONE_ 0 /* Take no special action */
85#define _NONE_ 0 /* Take no special action */ 86#define _REG0_ 1 /* Need to check for not empty st(0) */
86#define _REG0_ 1 /* Need to check for not empty st(0) */ 87#define _REGI_ 2 /* Need to check for not empty st(0) and st(rm) */
87#define _REGI_ 2 /* Need to check for not empty st(0) and st(rm) */ 88#define _REGi_ 0 /* Uses st(rm) */
88#define _REGi_ 0 /* Uses st(rm) */ 89#define _PUSH_ 3 /* Need to check for space to push onto stack */
89#define _PUSH_ 3 /* Need to check for space to push onto stack */ 90#define _null_ 4 /* Function illegal or not implemented */
90#define _null_ 4 /* Function illegal or not implemented */ 91#define _REGIi 5 /* Uses st(0) and st(rm), result to st(rm) */
91#define _REGIi 5 /* Uses st(0) and st(rm), result to st(rm) */ 92#define _REGIp 6 /* Uses st(0) and st(rm), result to st(rm) then pop */
92#define _REGIp 6 /* Uses st(0) and st(rm), result to st(rm) then pop */ 93#define _REGIc 0 /* Compare st(0) and st(rm) */
93#define _REGIc 0 /* Compare st(0) and st(rm) */ 94#define _REGIn 0 /* Uses st(0) and st(rm), but handle checks later */
94#define _REGIn 0 /* Uses st(0) and st(rm), but handle checks later */
95 95
96#ifndef NO_UNDOC_CODE 96#ifndef NO_UNDOC_CODE
97 97
98/* Un-documented FPU op-codes supported by default. (see above) */ 98/* Un-documented FPU op-codes supported by default. (see above) */
99 99
100static u_char const type_table[64] = { 100static u_char const type_table[64] = {
101 _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _REGi_, 101 _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _REGi_,
102 _REGI_, _REGIn, _null_, _null_, _REGIi, _REGI_, _REGIp, _REGI_, 102 _REGI_, _REGIn, _null_, _null_, _REGIi, _REGI_, _REGIp, _REGI_,
103 _REGIc, _NONE_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_, 103 _REGIc, _NONE_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_,
104 _REGIc, _REG0_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_, 104 _REGIc, _REG0_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_,
105 _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_, 105 _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_,
106 _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_, 106 _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_,
107 _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_, 107 _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_,
108 _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_ 108 _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_
109}; 109};
110 110
111#else /* Support only documented FPU op-codes */ 111#else /* Support only documented FPU op-codes */
112 112
113static u_char const type_table[64] = { 113static u_char const type_table[64] = {
114 _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _null_, 114 _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _null_,
115 _REGI_, _REGIn, _null_, _null_, _REGIi, _null_, _REGIp, _null_, 115 _REGI_, _REGIn, _null_, _null_, _REGIi, _null_, _REGIp, _null_,
116 _REGIc, _NONE_, _null_, _null_, _null_, _REG0_, _null_, _null_, 116 _REGIc, _NONE_, _null_, _null_, _null_, _REG0_, _null_, _null_,
117 _REGIc, _null_, _null_, _null_, _null_, _REG0_, _REGIc, _null_, 117 _REGIc, _null_, _null_, _null_, _null_, _REG0_, _REGIc, _null_,
118 _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_, 118 _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_,
119 _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_, 119 _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_,
120 _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_, 120 _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_,
121 _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_ 121 _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_
122}; 122};
123 123
124#endif /* NO_UNDOC_CODE */ 124#endif /* NO_UNDOC_CODE */
125 125
126
127#ifdef RE_ENTRANT_CHECKING 126#ifdef RE_ENTRANT_CHECKING
128u_char emulating=0; 127u_char emulating = 0;
129#endif /* RE_ENTRANT_CHECKING */ 128#endif /* RE_ENTRANT_CHECKING */
130 129
131static int valid_prefix(u_char *Byte, u_char __user **fpu_eip, 130static int valid_prefix(u_char *Byte, u_char __user ** fpu_eip,
132 overrides *override); 131 overrides * override);
133 132
134asmlinkage void math_emulate(long arg) 133asmlinkage void math_emulate(long arg)
135{ 134{
136 u_char FPU_modrm, byte1; 135 u_char FPU_modrm, byte1;
137 unsigned short code; 136 unsigned short code;
138 fpu_addr_modes addr_modes; 137 fpu_addr_modes addr_modes;
139 int unmasked; 138 int unmasked;
140 FPU_REG loaded_data; 139 FPU_REG loaded_data;
141 FPU_REG *st0_ptr; 140 FPU_REG *st0_ptr;
142 u_char loaded_tag, st0_tag; 141 u_char loaded_tag, st0_tag;
143 void __user *data_address; 142 void __user *data_address;
144 struct address data_sel_off; 143 struct address data_sel_off;
145 struct address entry_sel_off; 144 struct address entry_sel_off;
146 unsigned long code_base = 0; 145 unsigned long code_base = 0;
147 unsigned long code_limit = 0; /* Initialized to stop compiler warnings */ 146 unsigned long code_limit = 0; /* Initialized to stop compiler warnings */
148 struct desc_struct code_descriptor; 147 struct desc_struct code_descriptor;
149 148
150#ifdef RE_ENTRANT_CHECKING 149#ifdef RE_ENTRANT_CHECKING
151 if ( emulating ) 150 if (emulating) {
152 { 151 printk("ERROR: wm-FPU-emu is not RE-ENTRANT!\n");
153 printk("ERROR: wm-FPU-emu is not RE-ENTRANT!\n"); 152 }
154 } 153 RE_ENTRANT_CHECK_ON;
155 RE_ENTRANT_CHECK_ON;
156#endif /* RE_ENTRANT_CHECKING */ 154#endif /* RE_ENTRANT_CHECKING */
157 155
158 if (!used_math()) 156 if (!used_math()) {
159 { 157 finit();
160 finit(); 158 set_used_math();
161 set_used_math();
162 }
163
164 SETUP_DATA_AREA(arg);
165
166 FPU_ORIG_EIP = FPU_EIP;
167
168 if ( (FPU_EFLAGS & 0x00020000) != 0 )
169 {
170 /* Virtual 8086 mode */
171 addr_modes.default_mode = VM86;
172 FPU_EIP += code_base = FPU_CS << 4;
173 code_limit = code_base + 0xffff; /* Assumes code_base <= 0xffff0000 */
174 }
175 else if ( FPU_CS == __USER_CS && FPU_DS == __USER_DS )
176 {
177 addr_modes.default_mode = 0;
178 }
179 else if ( FPU_CS == __KERNEL_CS )
180 {
181 printk("math_emulate: %04x:%08lx\n",FPU_CS,FPU_EIP);
182 panic("Math emulation needed in kernel");
183 }
184 else
185 {
186
187 if ( (FPU_CS & 4) != 4 ) /* Must be in the LDT */
188 {
189 /* Can only handle segmented addressing via the LDT
190 for now, and it must be 16 bit */
191 printk("FPU emulator: Unsupported addressing mode\n");
192 math_abort(FPU_info, SIGILL);
193 } 159 }
194 160
195 code_descriptor = LDT_DESCRIPTOR(FPU_CS); 161 SETUP_DATA_AREA(arg);
196 if ( SEG_D_SIZE(code_descriptor) ) 162
197 { 163 FPU_ORIG_EIP = FPU_EIP;
198 /* The above test may be wrong, the book is not clear */ 164
199 /* Segmented 32 bit protected mode */ 165 if ((FPU_EFLAGS & 0x00020000) != 0) {
200 addr_modes.default_mode = SEG32; 166 /* Virtual 8086 mode */
167 addr_modes.default_mode = VM86;
168 FPU_EIP += code_base = FPU_CS << 4;
169 code_limit = code_base + 0xffff; /* Assumes code_base <= 0xffff0000 */
170 } else if (FPU_CS == __USER_CS && FPU_DS == __USER_DS) {
171 addr_modes.default_mode = 0;
172 } else if (FPU_CS == __KERNEL_CS) {
173 printk("math_emulate: %04x:%08lx\n", FPU_CS, FPU_EIP);
174 panic("Math emulation needed in kernel");
175 } else {
176
177 if ((FPU_CS & 4) != 4) { /* Must be in the LDT */
178 /* Can only handle segmented addressing via the LDT
179 for now, and it must be 16 bit */
180 printk("FPU emulator: Unsupported addressing mode\n");
181 math_abort(FPU_info, SIGILL);
182 }
183
184 code_descriptor = LDT_DESCRIPTOR(FPU_CS);
185 if (SEG_D_SIZE(code_descriptor)) {
186 /* The above test may be wrong, the book is not clear */
187 /* Segmented 32 bit protected mode */
188 addr_modes.default_mode = SEG32;
189 } else {
190 /* 16 bit protected mode */
191 addr_modes.default_mode = PM16;
192 }
193 FPU_EIP += code_base = SEG_BASE_ADDR(code_descriptor);
194 code_limit = code_base
195 + (SEG_LIMIT(code_descriptor) +
196 1) * SEG_GRANULARITY(code_descriptor)
197 - 1;
198 if (code_limit < code_base)
199 code_limit = 0xffffffff;
201 } 200 }
202 else 201
203 { 202 FPU_lookahead = !(FPU_EFLAGS & X86_EFLAGS_TF);
204 /* 16 bit protected mode */ 203
205 addr_modes.default_mode = PM16; 204 if (!valid_prefix(&byte1, (u_char __user **) & FPU_EIP,
205 &addr_modes.override)) {
206 RE_ENTRANT_CHECK_OFF;
207 printk
208 ("FPU emulator: Unknown prefix byte 0x%02x, probably due to\n"
209 "FPU emulator: self-modifying code! (emulation impossible)\n",
210 byte1);
211 RE_ENTRANT_CHECK_ON;
212 EXCEPTION(EX_INTERNAL | 0x126);
213 math_abort(FPU_info, SIGILL);
206 } 214 }
207 FPU_EIP += code_base = SEG_BASE_ADDR(code_descriptor); 215
208 code_limit = code_base 216 do_another_FPU_instruction:
209 + (SEG_LIMIT(code_descriptor)+1) * SEG_GRANULARITY(code_descriptor) 217
210 - 1; 218 no_ip_update = 0;
211 if ( code_limit < code_base ) code_limit = 0xffffffff; 219
212 } 220 FPU_EIP++; /* We have fetched the prefix and first code bytes. */
213 221
214 FPU_lookahead = 1; 222 if (addr_modes.default_mode) {
215 if (current->ptrace & PT_PTRACED) 223 /* This checks for the minimum instruction bytes.
216 FPU_lookahead = 0; 224 We also need to check any extra (address mode) code access. */
217 225 if (FPU_EIP > code_limit)
218 if ( !valid_prefix(&byte1, (u_char __user **)&FPU_EIP, 226 math_abort(FPU_info, SIGSEGV);
219 &addr_modes.override) )
220 {
221 RE_ENTRANT_CHECK_OFF;
222 printk("FPU emulator: Unknown prefix byte 0x%02x, probably due to\n"
223 "FPU emulator: self-modifying code! (emulation impossible)\n",
224 byte1);
225 RE_ENTRANT_CHECK_ON;
226 EXCEPTION(EX_INTERNAL|0x126);
227 math_abort(FPU_info,SIGILL);
228 }
229
230do_another_FPU_instruction:
231
232 no_ip_update = 0;
233
234 FPU_EIP++; /* We have fetched the prefix and first code bytes. */
235
236 if ( addr_modes.default_mode )
237 {
238 /* This checks for the minimum instruction bytes.
239 We also need to check any extra (address mode) code access. */
240 if ( FPU_EIP > code_limit )
241 math_abort(FPU_info,SIGSEGV);
242 }
243
244 if ( (byte1 & 0xf8) != 0xd8 )
245 {
246 if ( byte1 == FWAIT_OPCODE )
247 {
248 if (partial_status & SW_Summary)
249 goto do_the_FPU_interrupt;
250 else
251 goto FPU_fwait_done;
252 } 227 }
228
229 if ((byte1 & 0xf8) != 0xd8) {
230 if (byte1 == FWAIT_OPCODE) {
231 if (partial_status & SW_Summary)
232 goto do_the_FPU_interrupt;
233 else
234 goto FPU_fwait_done;
235 }
253#ifdef PARANOID 236#ifdef PARANOID
254 EXCEPTION(EX_INTERNAL|0x128); 237 EXCEPTION(EX_INTERNAL | 0x128);
255 math_abort(FPU_info,SIGILL); 238 math_abort(FPU_info, SIGILL);
256#endif /* PARANOID */ 239#endif /* PARANOID */
257 }
258
259 RE_ENTRANT_CHECK_OFF;
260 FPU_code_access_ok(1);
261 FPU_get_user(FPU_modrm, (u_char __user *) FPU_EIP);
262 RE_ENTRANT_CHECK_ON;
263 FPU_EIP++;
264
265 if (partial_status & SW_Summary)
266 {
267 /* Ignore the error for now if the current instruction is a no-wait
268 control instruction */
269 /* The 80486 manual contradicts itself on this topic,
270 but a real 80486 uses the following instructions:
271 fninit, fnstenv, fnsave, fnstsw, fnstenv, fnclex.
272 */
273 code = (FPU_modrm << 8) | byte1;
274 if ( ! ( (((code & 0xf803) == 0xe003) || /* fnclex, fninit, fnstsw */
275 (((code & 0x3003) == 0x3001) && /* fnsave, fnstcw, fnstenv,
276 fnstsw */
277 ((code & 0xc000) != 0xc000))) ) )
278 {
279 /*
280 * We need to simulate the action of the kernel to FPU
281 * interrupts here.
282 */
283 do_the_FPU_interrupt:
284
285 FPU_EIP = FPU_ORIG_EIP; /* Point to current FPU instruction. */
286
287 RE_ENTRANT_CHECK_OFF;
288 current->thread.trap_no = 16;
289 current->thread.error_code = 0;
290 send_sig(SIGFPE, current, 1);
291 return;
292 }
293 }
294
295 entry_sel_off.offset = FPU_ORIG_EIP;
296 entry_sel_off.selector = FPU_CS;
297 entry_sel_off.opcode = (byte1 << 8) | FPU_modrm;
298
299 FPU_rm = FPU_modrm & 7;
300
301 if ( FPU_modrm < 0300 )
302 {
303 /* All of these instructions use the mod/rm byte to get a data address */
304
305 if ( (addr_modes.default_mode & SIXTEEN)
306 ^ (addr_modes.override.address_size == ADDR_SIZE_PREFIX) )
307 data_address = FPU_get_address_16(FPU_modrm, &FPU_EIP, &data_sel_off,
308 addr_modes);
309 else
310 data_address = FPU_get_address(FPU_modrm, &FPU_EIP, &data_sel_off,
311 addr_modes);
312
313 if ( addr_modes.default_mode )
314 {
315 if ( FPU_EIP-1 > code_limit )
316 math_abort(FPU_info,SIGSEGV);
317 } 240 }
318 241
319 if ( !(byte1 & 1) ) 242 RE_ENTRANT_CHECK_OFF;
320 { 243 FPU_code_access_ok(1);
321 unsigned short status1 = partial_status; 244 FPU_get_user(FPU_modrm, (u_char __user *) FPU_EIP);
322 245 RE_ENTRANT_CHECK_ON;
323 st0_ptr = &st(0); 246 FPU_EIP++;
324 st0_tag = FPU_gettag0(); 247
325 248 if (partial_status & SW_Summary) {
326 /* Stack underflow has priority */ 249 /* Ignore the error for now if the current instruction is a no-wait
327 if ( NOT_EMPTY_ST0 ) 250 control instruction */
328 { 251 /* The 80486 manual contradicts itself on this topic,
329 if ( addr_modes.default_mode & PROTECTED ) 252 but a real 80486 uses the following instructions:
330 { 253 fninit, fnstenv, fnsave, fnstsw, fnstenv, fnclex.
331 /* This table works for 16 and 32 bit protected mode */ 254 */
332 if ( access_limit < data_sizes_16[(byte1 >> 1) & 3] ) 255 code = (FPU_modrm << 8) | byte1;
333 math_abort(FPU_info,SIGSEGV); 256 if (!((((code & 0xf803) == 0xe003) || /* fnclex, fninit, fnstsw */
257 (((code & 0x3003) == 0x3001) && /* fnsave, fnstcw, fnstenv,
258 fnstsw */
259 ((code & 0xc000) != 0xc000))))) {
260 /*
261 * We need to simulate the action of the kernel to FPU
262 * interrupts here.
263 */
264 do_the_FPU_interrupt:
265
266 FPU_EIP = FPU_ORIG_EIP; /* Point to current FPU instruction. */
267
268 RE_ENTRANT_CHECK_OFF;
269 current->thread.trap_no = 16;
270 current->thread.error_code = 0;
271 send_sig(SIGFPE, current, 1);
272 return;
334 } 273 }
274 }
335 275
336 unmasked = 0; /* Do this here to stop compiler warnings. */ 276 entry_sel_off.offset = FPU_ORIG_EIP;
337 switch ( (byte1 >> 1) & 3 ) 277 entry_sel_off.selector = FPU_CS;
338 { 278 entry_sel_off.opcode = (byte1 << 8) | FPU_modrm;
339 case 0:
340 unmasked = FPU_load_single((float __user *)data_address,
341 &loaded_data);
342 loaded_tag = unmasked & 0xff;
343 unmasked &= ~0xff;
344 break;
345 case 1:
346 loaded_tag = FPU_load_int32((long __user *)data_address, &loaded_data);
347 break;
348 case 2:
349 unmasked = FPU_load_double((double __user *)data_address,
350 &loaded_data);
351 loaded_tag = unmasked & 0xff;
352 unmasked &= ~0xff;
353 break;
354 case 3:
355 default: /* Used here to suppress gcc warnings. */
356 loaded_tag = FPU_load_int16((short __user *)data_address, &loaded_data);
357 break;
358 }
359 279
360 /* No more access to user memory, it is safe 280 FPU_rm = FPU_modrm & 7;
361 to use static data now */
362
363 /* NaN operands have the next priority. */
364 /* We have to delay looking at st(0) until after
365 loading the data, because that data might contain an SNaN */
366 if ( ((st0_tag == TAG_Special) && isNaN(st0_ptr)) ||
367 ((loaded_tag == TAG_Special) && isNaN(&loaded_data)) )
368 {
369 /* Restore the status word; we might have loaded a
370 denormal. */
371 partial_status = status1;
372 if ( (FPU_modrm & 0x30) == 0x10 )
373 {
374 /* fcom or fcomp */
375 EXCEPTION(EX_Invalid);
376 setcc(SW_C3 | SW_C2 | SW_C0);
377 if ( (FPU_modrm & 0x08) && (control_word & CW_Invalid) )
378 FPU_pop(); /* fcomp, masked, so we pop. */
379 }
380 else
381 {
382 if ( loaded_tag == TAG_Special )
383 loaded_tag = FPU_Special(&loaded_data);
384#ifdef PECULIAR_486
385 /* This is not really needed, but gives behaviour
386 identical to an 80486 */
387 if ( (FPU_modrm & 0x28) == 0x20 )
388 /* fdiv or fsub */
389 real_2op_NaN(&loaded_data, loaded_tag, 0, &loaded_data);
390 else
391#endif /* PECULIAR_486 */
392 /* fadd, fdivr, fmul, or fsubr */
393 real_2op_NaN(&loaded_data, loaded_tag, 0, st0_ptr);
394 }
395 goto reg_mem_instr_done;
396 }
397 281
398 if ( unmasked && !((FPU_modrm & 0x30) == 0x10) ) 282 if (FPU_modrm < 0300) {
399 { 283 /* All of these instructions use the mod/rm byte to get a data address */
400 /* Is not a comparison instruction. */
401 if ( (FPU_modrm & 0x38) == 0x38 )
402 {
403 /* fdivr */
404 if ( (st0_tag == TAG_Zero) &&
405 ((loaded_tag == TAG_Valid)
406 || (loaded_tag == TAG_Special
407 && isdenormal(&loaded_data))) )
408 {
409 if ( FPU_divide_by_zero(0, getsign(&loaded_data))
410 < 0 )
411 {
412 /* We use the fact here that the unmasked
413 exception in the loaded data was for a
414 denormal operand */
415 /* Restore the state of the denormal op bit */
416 partial_status &= ~SW_Denorm_Op;
417 partial_status |= status1 & SW_Denorm_Op;
418 }
419 else
420 setsign(st0_ptr, getsign(&loaded_data));
421 }
422 }
423 goto reg_mem_instr_done;
424 }
425 284
426 switch ( (FPU_modrm >> 3) & 7 ) 285 if ((addr_modes.default_mode & SIXTEEN)
427 { 286 ^ (addr_modes.override.address_size == ADDR_SIZE_PREFIX))
428 case 0: /* fadd */ 287 data_address =
429 clear_C1(); 288 FPU_get_address_16(FPU_modrm, &FPU_EIP,
430 FPU_add(&loaded_data, loaded_tag, 0, control_word); 289 &data_sel_off, addr_modes);
431 break; 290 else
432 case 1: /* fmul */ 291 data_address =
433 clear_C1(); 292 FPU_get_address(FPU_modrm, &FPU_EIP, &data_sel_off,
434 FPU_mul(&loaded_data, loaded_tag, 0, control_word); 293 addr_modes);
435 break; 294
436 case 2: /* fcom */ 295 if (addr_modes.default_mode) {
437 FPU_compare_st_data(&loaded_data, loaded_tag); 296 if (FPU_EIP - 1 > code_limit)
438 break; 297 math_abort(FPU_info, SIGSEGV);
439 case 3: /* fcomp */
440 if ( !FPU_compare_st_data(&loaded_data, loaded_tag)
441 && !unmasked )
442 FPU_pop();
443 break;
444 case 4: /* fsub */
445 clear_C1();
446 FPU_sub(LOADED|loaded_tag, (int)&loaded_data, control_word);
447 break;
448 case 5: /* fsubr */
449 clear_C1();
450 FPU_sub(REV|LOADED|loaded_tag, (int)&loaded_data, control_word);
451 break;
452 case 6: /* fdiv */
453 clear_C1();
454 FPU_div(LOADED|loaded_tag, (int)&loaded_data, control_word);
455 break;
456 case 7: /* fdivr */
457 clear_C1();
458 if ( st0_tag == TAG_Zero )
459 partial_status = status1; /* Undo any denorm tag,
460 zero-divide has priority. */
461 FPU_div(REV|LOADED|loaded_tag, (int)&loaded_data, control_word);
462 break;
463 } 298 }
464 } 299
465 else 300 if (!(byte1 & 1)) {
466 { 301 unsigned short status1 = partial_status;
467 if ( (FPU_modrm & 0x30) == 0x10 ) 302
468 { 303 st0_ptr = &st(0);
469 /* The instruction is fcom or fcomp */ 304 st0_tag = FPU_gettag0();
470 EXCEPTION(EX_StackUnder); 305
471 setcc(SW_C3 | SW_C2 | SW_C0); 306 /* Stack underflow has priority */
472 if ( (FPU_modrm & 0x08) && (control_word & CW_Invalid) ) 307 if (NOT_EMPTY_ST0) {
473 FPU_pop(); /* fcomp */ 308 if (addr_modes.default_mode & PROTECTED) {
309 /* This table works for 16 and 32 bit protected mode */
310 if (access_limit <
311 data_sizes_16[(byte1 >> 1) & 3])
312 math_abort(FPU_info, SIGSEGV);
313 }
314
315 unmasked = 0; /* Do this here to stop compiler warnings. */
316 switch ((byte1 >> 1) & 3) {
317 case 0:
318 unmasked =
319 FPU_load_single((float __user *)
320 data_address,
321 &loaded_data);
322 loaded_tag = unmasked & 0xff;
323 unmasked &= ~0xff;
324 break;
325 case 1:
326 loaded_tag =
327 FPU_load_int32((long __user *)
328 data_address,
329 &loaded_data);
330 break;
331 case 2:
332 unmasked =
333 FPU_load_double((double __user *)
334 data_address,
335 &loaded_data);
336 loaded_tag = unmasked & 0xff;
337 unmasked &= ~0xff;
338 break;
339 case 3:
340 default: /* Used here to suppress gcc warnings. */
341 loaded_tag =
342 FPU_load_int16((short __user *)
343 data_address,
344 &loaded_data);
345 break;
346 }
347
348 /* No more access to user memory, it is safe
349 to use static data now */
350
351 /* NaN operands have the next priority. */
352 /* We have to delay looking at st(0) until after
353 loading the data, because that data might contain an SNaN */
354 if (((st0_tag == TAG_Special) && isNaN(st0_ptr))
355 || ((loaded_tag == TAG_Special)
356 && isNaN(&loaded_data))) {
357 /* Restore the status word; we might have loaded a
358 denormal. */
359 partial_status = status1;
360 if ((FPU_modrm & 0x30) == 0x10) {
361 /* fcom or fcomp */
362 EXCEPTION(EX_Invalid);
363 setcc(SW_C3 | SW_C2 | SW_C0);
364 if ((FPU_modrm & 0x08)
365 && (control_word &
366 CW_Invalid))
367 FPU_pop(); /* fcomp, masked, so we pop. */
368 } else {
369 if (loaded_tag == TAG_Special)
370 loaded_tag =
371 FPU_Special
372 (&loaded_data);
373#ifdef PECULIAR_486
374 /* This is not really needed, but gives behaviour
375 identical to an 80486 */
376 if ((FPU_modrm & 0x28) == 0x20)
377 /* fdiv or fsub */
378 real_2op_NaN
379 (&loaded_data,
380 loaded_tag, 0,
381 &loaded_data);
382 else
383#endif /* PECULIAR_486 */
384 /* fadd, fdivr, fmul, or fsubr */
385 real_2op_NaN
386 (&loaded_data,
387 loaded_tag, 0,
388 st0_ptr);
389 }
390 goto reg_mem_instr_done;
391 }
392
393 if (unmasked && !((FPU_modrm & 0x30) == 0x10)) {
394 /* Is not a comparison instruction. */
395 if ((FPU_modrm & 0x38) == 0x38) {
396 /* fdivr */
397 if ((st0_tag == TAG_Zero) &&
398 ((loaded_tag == TAG_Valid)
399 || (loaded_tag ==
400 TAG_Special
401 &&
402 isdenormal
403 (&loaded_data)))) {
404 if (FPU_divide_by_zero
405 (0,
406 getsign
407 (&loaded_data))
408 < 0) {
409 /* We use the fact here that the unmasked
410 exception in the loaded data was for a
411 denormal operand */
412 /* Restore the state of the denormal op bit */
413 partial_status
414 &=
415 ~SW_Denorm_Op;
416 partial_status
417 |=
418 status1 &
419 SW_Denorm_Op;
420 } else
421 setsign(st0_ptr,
422 getsign
423 (&loaded_data));
424 }
425 }
426 goto reg_mem_instr_done;
427 }
428
429 switch ((FPU_modrm >> 3) & 7) {
430 case 0: /* fadd */
431 clear_C1();
432 FPU_add(&loaded_data, loaded_tag, 0,
433 control_word);
434 break;
435 case 1: /* fmul */
436 clear_C1();
437 FPU_mul(&loaded_data, loaded_tag, 0,
438 control_word);
439 break;
440 case 2: /* fcom */
441 FPU_compare_st_data(&loaded_data,
442 loaded_tag);
443 break;
444 case 3: /* fcomp */
445 if (!FPU_compare_st_data
446 (&loaded_data, loaded_tag)
447 && !unmasked)
448 FPU_pop();
449 break;
450 case 4: /* fsub */
451 clear_C1();
452 FPU_sub(LOADED | loaded_tag,
453 (int)&loaded_data,
454 control_word);
455 break;
456 case 5: /* fsubr */
457 clear_C1();
458 FPU_sub(REV | LOADED | loaded_tag,
459 (int)&loaded_data,
460 control_word);
461 break;
462 case 6: /* fdiv */
463 clear_C1();
464 FPU_div(LOADED | loaded_tag,
465 (int)&loaded_data,
466 control_word);
467 break;
468 case 7: /* fdivr */
469 clear_C1();
470 if (st0_tag == TAG_Zero)
471 partial_status = status1; /* Undo any denorm tag,
472 zero-divide has priority. */
473 FPU_div(REV | LOADED | loaded_tag,
474 (int)&loaded_data,
475 control_word);
476 break;
477 }
478 } else {
479 if ((FPU_modrm & 0x30) == 0x10) {
480 /* The instruction is fcom or fcomp */
481 EXCEPTION(EX_StackUnder);
482 setcc(SW_C3 | SW_C2 | SW_C0);
483 if ((FPU_modrm & 0x08)
484 && (control_word & CW_Invalid))
485 FPU_pop(); /* fcomp */
486 } else
487 FPU_stack_underflow();
488 }
489 reg_mem_instr_done:
490 operand_address = data_sel_off;
491 } else {
492 if (!(no_ip_update =
493 FPU_load_store(((FPU_modrm & 0x38) | (byte1 & 6))
494 >> 1, addr_modes, data_address))) {
495 operand_address = data_sel_off;
496 }
474 } 497 }
475 else
476 FPU_stack_underflow();
477 }
478 reg_mem_instr_done:
479 operand_address = data_sel_off;
480 }
481 else
482 {
483 if ( !(no_ip_update =
484 FPU_load_store(((FPU_modrm & 0x38) | (byte1 & 6)) >> 1,
485 addr_modes, data_address)) )
486 {
487 operand_address = data_sel_off;
488 }
489 }
490 498
491 } 499 } else {
492 else 500 /* None of these instructions access user memory */
493 { 501 u_char instr_index = (FPU_modrm & 0x38) | (byte1 & 7);
494 /* None of these instructions access user memory */
495 u_char instr_index = (FPU_modrm & 0x38) | (byte1 & 7);
496 502
497#ifdef PECULIAR_486 503#ifdef PECULIAR_486
498 /* This is supposed to be undefined, but a real 80486 seems 504 /* This is supposed to be undefined, but a real 80486 seems
499 to do this: */ 505 to do this: */
500 operand_address.offset = 0; 506 operand_address.offset = 0;
501 operand_address.selector = FPU_DS; 507 operand_address.selector = FPU_DS;
502#endif /* PECULIAR_486 */ 508#endif /* PECULIAR_486 */
503 509
504 st0_ptr = &st(0); 510 st0_ptr = &st(0);
505 st0_tag = FPU_gettag0(); 511 st0_tag = FPU_gettag0();
506 switch ( type_table[(int) instr_index] ) 512 switch (type_table[(int)instr_index]) {
507 { 513 case _NONE_: /* also _REGIc: _REGIn */
508 case _NONE_: /* also _REGIc: _REGIn */ 514 break;
509 break; 515 case _REG0_:
510 case _REG0_: 516 if (!NOT_EMPTY_ST0) {
511 if ( !NOT_EMPTY_ST0 ) 517 FPU_stack_underflow();
512 { 518 goto FPU_instruction_done;
513 FPU_stack_underflow(); 519 }
514 goto FPU_instruction_done; 520 break;
515 } 521 case _REGIi:
516 break; 522 if (!NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm)) {
517 case _REGIi: 523 FPU_stack_underflow_i(FPU_rm);
518 if ( !NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm) ) 524 goto FPU_instruction_done;
519 { 525 }
520 FPU_stack_underflow_i(FPU_rm); 526 break;
521 goto FPU_instruction_done; 527 case _REGIp:
522 } 528 if (!NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm)) {
523 break; 529 FPU_stack_underflow_pop(FPU_rm);
524 case _REGIp: 530 goto FPU_instruction_done;
525 if ( !NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm) ) 531 }
526 { 532 break;
527 FPU_stack_underflow_pop(FPU_rm); 533 case _REGI_:
528 goto FPU_instruction_done; 534 if (!NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm)) {
529 } 535 FPU_stack_underflow();
530 break; 536 goto FPU_instruction_done;
531 case _REGI_: 537 }
532 if ( !NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm) ) 538 break;
533 { 539 case _PUSH_: /* Only used by the fld st(i) instruction */
534 FPU_stack_underflow(); 540 break;
535 goto FPU_instruction_done; 541 case _null_:
536 } 542 FPU_illegal();
537 break; 543 goto FPU_instruction_done;
538 case _PUSH_: /* Only used by the fld st(i) instruction */ 544 default:
539 break; 545 EXCEPTION(EX_INTERNAL | 0x111);
540 case _null_: 546 goto FPU_instruction_done;
541 FPU_illegal(); 547 }
542 goto FPU_instruction_done; 548 (*st_instr_table[(int)instr_index]) ();
543 default:
544 EXCEPTION(EX_INTERNAL|0x111);
545 goto FPU_instruction_done;
546 }
547 (*st_instr_table[(int) instr_index])();
548 549
549FPU_instruction_done: 550 FPU_instruction_done:
550 ; 551 ;
551 } 552 }
552 553
553 if ( ! no_ip_update ) 554 if (!no_ip_update)
554 instruction_address = entry_sel_off; 555 instruction_address = entry_sel_off;
555 556
556FPU_fwait_done: 557 FPU_fwait_done:
557 558
558#ifdef DEBUG 559#ifdef DEBUG
559 RE_ENTRANT_CHECK_OFF; 560 RE_ENTRANT_CHECK_OFF;
560 FPU_printall(); 561 FPU_printall();
561 RE_ENTRANT_CHECK_ON; 562 RE_ENTRANT_CHECK_ON;
562#endif /* DEBUG */ 563#endif /* DEBUG */
563 564
564 if (FPU_lookahead && !need_resched()) 565 if (FPU_lookahead && !need_resched()) {
565 { 566 FPU_ORIG_EIP = FPU_EIP - code_base;
566 FPU_ORIG_EIP = FPU_EIP - code_base; 567 if (valid_prefix(&byte1, (u_char __user **) & FPU_EIP,
567 if ( valid_prefix(&byte1, (u_char __user **)&FPU_EIP, 568 &addr_modes.override))
568 &addr_modes.override) ) 569 goto do_another_FPU_instruction;
569 goto do_another_FPU_instruction; 570 }
570 }
571 571
572 if ( addr_modes.default_mode ) 572 if (addr_modes.default_mode)
573 FPU_EIP -= code_base; 573 FPU_EIP -= code_base;
574 574
575 RE_ENTRANT_CHECK_OFF; 575 RE_ENTRANT_CHECK_OFF;
576} 576}
577 577
578
579/* Support for prefix bytes is not yet complete. To properly handle 578/* Support for prefix bytes is not yet complete. To properly handle
580 all prefix bytes, further changes are needed in the emulator code 579 all prefix bytes, further changes are needed in the emulator code
581 which accesses user address space. Access to separate segments is 580 which accesses user address space. Access to separate segments is
582 important for msdos emulation. */ 581 important for msdos emulation. */
583static int valid_prefix(u_char *Byte, u_char __user **fpu_eip, 582static int valid_prefix(u_char *Byte, u_char __user **fpu_eip,
584 overrides *override) 583 overrides * override)
585{ 584{
586 u_char byte; 585 u_char byte;
587 u_char __user *ip = *fpu_eip; 586 u_char __user *ip = *fpu_eip;
588 587
589 *override = (overrides) { 0, 0, PREFIX_DEFAULT }; /* defaults */ 588 *override = (overrides) {
590 589 0, 0, PREFIX_DEFAULT}; /* defaults */
591 RE_ENTRANT_CHECK_OFF; 590
592 FPU_code_access_ok(1); 591 RE_ENTRANT_CHECK_OFF;
593 FPU_get_user(byte, ip); 592 FPU_code_access_ok(1);
594 RE_ENTRANT_CHECK_ON; 593 FPU_get_user(byte, ip);
595 594 RE_ENTRANT_CHECK_ON;
596 while ( 1 ) 595
597 { 596 while (1) {
598 switch ( byte ) 597 switch (byte) {
599 { 598 case ADDR_SIZE_PREFIX:
600 case ADDR_SIZE_PREFIX: 599 override->address_size = ADDR_SIZE_PREFIX;
601 override->address_size = ADDR_SIZE_PREFIX; 600 goto do_next_byte;
602 goto do_next_byte; 601
603 602 case OP_SIZE_PREFIX:
604 case OP_SIZE_PREFIX: 603 override->operand_size = OP_SIZE_PREFIX;
605 override->operand_size = OP_SIZE_PREFIX; 604 goto do_next_byte;
606 goto do_next_byte; 605
607 606 case PREFIX_CS:
608 case PREFIX_CS: 607 override->segment = PREFIX_CS_;
609 override->segment = PREFIX_CS_; 608 goto do_next_byte;
610 goto do_next_byte; 609 case PREFIX_ES:
611 case PREFIX_ES: 610 override->segment = PREFIX_ES_;
612 override->segment = PREFIX_ES_; 611 goto do_next_byte;
613 goto do_next_byte; 612 case PREFIX_SS:
614 case PREFIX_SS: 613 override->segment = PREFIX_SS_;
615 override->segment = PREFIX_SS_; 614 goto do_next_byte;
616 goto do_next_byte; 615 case PREFIX_FS:
617 case PREFIX_FS: 616 override->segment = PREFIX_FS_;
618 override->segment = PREFIX_FS_; 617 goto do_next_byte;
619 goto do_next_byte; 618 case PREFIX_GS:
620 case PREFIX_GS: 619 override->segment = PREFIX_GS_;
621 override->segment = PREFIX_GS_; 620 goto do_next_byte;
622 goto do_next_byte; 621 case PREFIX_DS:
623 case PREFIX_DS: 622 override->segment = PREFIX_DS_;
624 override->segment = PREFIX_DS_; 623 goto do_next_byte;
625 goto do_next_byte;
626 624
627/* lock is not a valid prefix for FPU instructions, 625/* lock is not a valid prefix for FPU instructions,
628 let the cpu handle it to generate a SIGILL. */ 626 let the cpu handle it to generate a SIGILL. */
629/* case PREFIX_LOCK: */ 627/* case PREFIX_LOCK: */
630 628
631 /* rep.. prefixes have no meaning for FPU instructions */ 629 /* rep.. prefixes have no meaning for FPU instructions */
632 case PREFIX_REPE: 630 case PREFIX_REPE:
633 case PREFIX_REPNE: 631 case PREFIX_REPNE:
634 632
635 do_next_byte: 633 do_next_byte:
636 ip++; 634 ip++;
637 RE_ENTRANT_CHECK_OFF; 635 RE_ENTRANT_CHECK_OFF;
638 FPU_code_access_ok(1); 636 FPU_code_access_ok(1);
639 FPU_get_user(byte, ip); 637 FPU_get_user(byte, ip);
640 RE_ENTRANT_CHECK_ON; 638 RE_ENTRANT_CHECK_ON;
641 break; 639 break;
642 case FWAIT_OPCODE: 640 case FWAIT_OPCODE:
643 *Byte = byte; 641 *Byte = byte;
644 return 1; 642 return 1;
645 default: 643 default:
646 if ( (byte & 0xf8) == 0xd8 ) 644 if ((byte & 0xf8) == 0xd8) {
647 { 645 *Byte = byte;
648 *Byte = byte; 646 *fpu_eip = ip;
649 *fpu_eip = ip; 647 return 1;
650 return 1; 648 } else {
651 } 649 /* Not a valid sequence of prefix bytes followed by
652 else 650 an FPU instruction. */
653 { 651 *Byte = byte; /* Needed for error message. */
654 /* Not a valid sequence of prefix bytes followed by 652 return 0;
655 an FPU instruction. */ 653 }
656 *Byte = byte; /* Needed for error message. */ 654 }
657 return 0;
658 }
659 } 655 }
660 }
661} 656}
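
The loop in valid_prefix() skips any legal prefix bytes and accepts the sequence only when it ends in FWAIT (0x9b) or an ESC opcode, i.e. a byte in the 0xd8..0xdf range. A minimal user-space sketch of the same scan over an in-memory buffer, assuming only the standard x86 prefix byte values; the helper names are invented for illustration and are not part of the emulator:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

static int is_prefix(uint8_t b)
{
        switch (b) {
        case 0x26: case 0x2e: case 0x36: case 0x3e:     /* ES CS SS DS overrides */
        case 0x64: case 0x65:                           /* FS GS overrides */
        case 0x66: case 0x67:                           /* operand / address size */
        case 0xf2: case 0xf3:                           /* REPNE / REP */
                return 1;
        default:
                return 0;
        }
}

/* Return the offset of the first FPU opcode (FWAIT, or an ESC byte in
   0xd8..0xdf) reached after skipping prefixes, or -1 if the bytes do
   not form an FPU instruction. */
static ptrdiff_t find_fpu_opcode(const uint8_t *code, size_t len)
{
        size_t i = 0;

        while (i < len && is_prefix(code[i]))
                i++;
        if (i < len && (code[i] == 0x9b || (code[i] & 0xf8) == 0xd8))
                return (ptrdiff_t)i;
        return -1;
}

int main(void)
{
        const uint8_t insn[] = { 0x66, 0x2e, 0xd9, 0xc0 };      /* prefixes + fld st(0) */

        printf("FPU opcode at offset %td\n", find_fpu_opcode(insn, sizeof(insn)));
        return 0;
}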
662 657
663 658void math_abort(struct info *info, unsigned int signal)
664void math_abort(struct info * info, unsigned int signal)
665{ 659{
666 FPU_EIP = FPU_ORIG_EIP; 660 FPU_EIP = FPU_ORIG_EIP;
667 current->thread.trap_no = 16; 661 current->thread.trap_no = 16;
668 current->thread.error_code = 0; 662 current->thread.error_code = 0;
669 send_sig(signal,current,1); 663 send_sig(signal, current, 1);
670 RE_ENTRANT_CHECK_OFF; 664 RE_ENTRANT_CHECK_OFF;
671 __asm__("movl %0,%%esp ; ret": :"g" (((long) info)-4)); 665 __asm__("movl %0,%%esp ; ret": :"g"(((long)info) - 4));
672#ifdef PARANOID 666#ifdef PARANOID
673 printk("ERROR: wm-FPU-emu math_abort failed!\n"); 667 printk("ERROR: wm-FPU-emu math_abort failed!\n");
674#endif /* PARANOID */ 668#endif /* PARANOID */
675} 669}
676 670
677
678
679#define S387 ((struct i387_soft_struct *)s387) 671#define S387 ((struct i387_soft_struct *)s387)
680#define sstatus_word() \ 672#define sstatus_word() \
681 ((S387->swd & ~SW_Top & 0xffff) | ((S387->ftop << SW_Top_Shift) & SW_Top)) 673 ((S387->swd & ~SW_Top & 0xffff) | ((S387->ftop << SW_Top_Shift) & SW_Top))
682 674
683int restore_i387_soft(void *s387, struct _fpstate __user *buf) 675int fpregs_soft_set(struct task_struct *target,
676 const struct user_regset *regset,
677 unsigned int pos, unsigned int count,
678 const void *kbuf, const void __user *ubuf)
684{ 679{
685 u_char __user *d = (u_char __user *)buf; 680 struct i387_soft_struct *s387 = &target->thread.i387.soft;
686 int offset, other, i, tags, regnr, tag, newtop; 681 void *space = s387->st_space;
687 682 int ret;
688 RE_ENTRANT_CHECK_OFF; 683 int offset, other, i, tags, regnr, tag, newtop;
689 FPU_access_ok(VERIFY_READ, d, 7*4 + 8*10); 684
690 if (__copy_from_user(&S387->cwd, d, 7*4)) 685 RE_ENTRANT_CHECK_OFF;
691 return -1; 686 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, s387, 0,
692 RE_ENTRANT_CHECK_ON; 687 offsetof(struct i387_soft_struct, st_space));
693 688 RE_ENTRANT_CHECK_ON;
694 d += 7*4; 689
695 690 if (ret)
696 S387->ftop = (S387->swd >> SW_Top_Shift) & 7; 691 return ret;
697 offset = (S387->ftop & 7) * 10; 692
698 other = 80 - offset; 693 S387->ftop = (S387->swd >> SW_Top_Shift) & 7;
699 694 offset = (S387->ftop & 7) * 10;
700 RE_ENTRANT_CHECK_OFF; 695 other = 80 - offset;
701 /* Copy all registers in stack order. */ 696
702 if (__copy_from_user(((u_char *)&S387->st_space)+offset, d, other)) 697 RE_ENTRANT_CHECK_OFF;
703 return -1; 698
704 if ( offset ) 699 /* Copy all registers in stack order. */
705 if (__copy_from_user((u_char *)&S387->st_space, d+other, offset)) 700 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
706 return -1; 701 space + offset, 0, other);
707 RE_ENTRANT_CHECK_ON; 702 if (!ret && offset)
708 703 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
709 /* The tags may need to be corrected now. */ 704 space, 0, offset);
710 tags = S387->twd; 705
711 newtop = S387->ftop; 706 RE_ENTRANT_CHECK_ON;
712 for ( i = 0; i < 8; i++ ) 707
713 { 708 /* The tags may need to be corrected now. */
714 regnr = (i+newtop) & 7; 709 tags = S387->twd;
715 if ( ((tags >> ((regnr & 7)*2)) & 3) != TAG_Empty ) 710 newtop = S387->ftop;
716 { 711 for (i = 0; i < 8; i++) {
717 /* The loaded data over-rides all other cases. */ 712 regnr = (i + newtop) & 7;
718 tag = FPU_tagof((FPU_REG *)((u_char *)S387->st_space + 10*regnr)); 713 if (((tags >> ((regnr & 7) * 2)) & 3) != TAG_Empty) {
719 tags &= ~(3 << (regnr*2)); 714 /* The loaded data over-rides all other cases. */
720 tags |= (tag & 3) << (regnr*2); 715 tag =
716 FPU_tagof((FPU_REG *) ((u_char *) S387->st_space +
717 10 * regnr));
718 tags &= ~(3 << (regnr * 2));
719 tags |= (tag & 3) << (regnr * 2);
720 }
721 } 721 }
722 } 722 S387->twd = tags;
723 S387->twd = tags;
724 723
725 return 0; 724 return ret;
726} 725}
727 726
728 727int fpregs_soft_get(struct task_struct *target,
729int save_i387_soft(void *s387, struct _fpstate __user * buf) 728 const struct user_regset *regset,
729 unsigned int pos, unsigned int count,
730 void *kbuf, void __user *ubuf)
730{ 731{
731 u_char __user *d = (u_char __user *)buf; 732 struct i387_soft_struct *s387 = &target->thread.i387.soft;
732 int offset = (S387->ftop & 7) * 10, other = 80 - offset; 733 const void *space = s387->st_space;
734 int ret;
735 int offset = (S387->ftop & 7) * 10, other = 80 - offset;
736
737 RE_ENTRANT_CHECK_OFF;
733 738
734 RE_ENTRANT_CHECK_OFF;
735 FPU_access_ok(VERIFY_WRITE, d, 7*4 + 8*10);
736#ifdef PECULIAR_486 739#ifdef PECULIAR_486
737 S387->cwd &= ~0xe080; 740 S387->cwd &= ~0xe080;
738 /* An 80486 sets nearly all of the reserved bits to 1. */ 741 /* An 80486 sets nearly all of the reserved bits to 1. */
739 S387->cwd |= 0xffff0040; 742 S387->cwd |= 0xffff0040;
740 S387->swd = sstatus_word() | 0xffff0000; 743 S387->swd = sstatus_word() | 0xffff0000;
741 S387->twd |= 0xffff0000; 744 S387->twd |= 0xffff0000;
742 S387->fcs &= ~0xf8000000; 745 S387->fcs &= ~0xf8000000;
743 S387->fos |= 0xffff0000; 746 S387->fos |= 0xffff0000;
744#endif /* PECULIAR_486 */ 747#endif /* PECULIAR_486 */
745 if (__copy_to_user(d, &S387->cwd, 7*4)) 748
746 return -1; 749 ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, s387, 0,
747 RE_ENTRANT_CHECK_ON; 750 offsetof(struct i387_soft_struct, st_space));
748 751
749 d += 7*4; 752 /* Copy all registers in stack order. */
750 753 if (!ret)
751 RE_ENTRANT_CHECK_OFF; 754 ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
752 /* Copy all registers in stack order. */ 755 space + offset, 0, other);
753 if (__copy_to_user(d, ((u_char *)&S387->st_space)+offset, other)) 756 if (!ret)
754 return -1; 757 ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
755 if ( offset ) 758 space, 0, offset);
756 if (__copy_to_user(d+other, (u_char *)&S387->st_space, offset)) 759
757 return -1; 760 RE_ENTRANT_CHECK_ON;
758 RE_ENTRANT_CHECK_ON; 761
759 762 return ret;
760 return 1;
761} 763}
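
Both regset handlers copy the eight 10-byte registers of st_space "in stack order": the slot holding st(0) is located from ftop, the run from that slot to the end of the array is copied first, and the wrapped-around remainder second. A minimal sketch of that split copy on an ordinary byte array, with invented names; only the 10-byte-per-register, 8-register layout is taken from the code above:

#include <string.h>
#include <stdio.h>

#define REG_BYTES   10
#define NUM_REGS    8
#define SPACE_BYTES (REG_BYTES * NUM_REGS)

/* Copy the register file so that st(0) (physical slot 'top') comes
   first in the destination, st(1) second, and so on -- the same
   split used by fpregs_soft_get()/fpregs_soft_set() above. */
static void copy_in_stack_order(unsigned char *dst,
                                const unsigned char *space, int top)
{
        int offset = (top & 7) * REG_BYTES;     /* start of st(0) */
        int other  = SPACE_BYTES - offset;      /* bytes up to the end */

        memcpy(dst, space + offset, other);     /* st(0) .. end of array */
        if (offset)
                memcpy(dst + other, space, offset);     /* wrapped part */
}

int main(void)
{
        unsigned char space[SPACE_BYTES], dst[SPACE_BYTES];
        int i;

        for (i = 0; i < SPACE_BYTES; i++)
                space[i] = (unsigned char)(i / REG_BYTES);      /* tag each slot */

        copy_in_stack_order(dst, space, 5);     /* pretend ftop == 5 */
        printf("first register copied came from slot %d\n", dst[0]);
        return 0;
}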
diff --git a/arch/x86/math-emu/fpu_etc.c b/arch/x86/math-emu/fpu_etc.c
index e3b5d465587f..233e5af566f5 100644
--- a/arch/x86/math-emu/fpu_etc.c
+++ b/arch/x86/math-emu/fpu_etc.c
@@ -16,128 +16,115 @@
16#include "status_w.h" 16#include "status_w.h"
17#include "reg_constant.h" 17#include "reg_constant.h"
18 18
19
20static void fchs(FPU_REG *st0_ptr, u_char st0tag) 19static void fchs(FPU_REG *st0_ptr, u_char st0tag)
21{ 20{
22 if ( st0tag ^ TAG_Empty ) 21 if (st0tag ^ TAG_Empty) {
23 { 22 signbyte(st0_ptr) ^= SIGN_NEG;
24 signbyte(st0_ptr) ^= SIGN_NEG; 23 clear_C1();
25 clear_C1(); 24 } else
26 } 25 FPU_stack_underflow();
27 else
28 FPU_stack_underflow();
29} 26}
30 27
31
32static void fabs(FPU_REG *st0_ptr, u_char st0tag) 28static void fabs(FPU_REG *st0_ptr, u_char st0tag)
33{ 29{
34 if ( st0tag ^ TAG_Empty ) 30 if (st0tag ^ TAG_Empty) {
35 { 31 setpositive(st0_ptr);
36 setpositive(st0_ptr); 32 clear_C1();
37 clear_C1(); 33 } else
38 } 34 FPU_stack_underflow();
39 else
40 FPU_stack_underflow();
41} 35}
42 36
43
44static void ftst_(FPU_REG *st0_ptr, u_char st0tag) 37static void ftst_(FPU_REG *st0_ptr, u_char st0tag)
45{ 38{
46 switch (st0tag) 39 switch (st0tag) {
47 { 40 case TAG_Zero:
48 case TAG_Zero:
49 setcc(SW_C3);
50 break;
51 case TAG_Valid:
52 if (getsign(st0_ptr) == SIGN_POS)
53 setcc(0);
54 else
55 setcc(SW_C0);
56 break;
57 case TAG_Special:
58 switch ( FPU_Special(st0_ptr) )
59 {
60 case TW_Denormal:
61 if (getsign(st0_ptr) == SIGN_POS)
62 setcc(0);
63 else
64 setcc(SW_C0);
65 if ( denormal_operand() < 0 )
66 {
67#ifdef PECULIAR_486
68 /* This is weird! */
69 if (getsign(st0_ptr) == SIGN_POS)
70 setcc(SW_C3); 41 setcc(SW_C3);
42 break;
43 case TAG_Valid:
44 if (getsign(st0_ptr) == SIGN_POS)
45 setcc(0);
46 else
47 setcc(SW_C0);
48 break;
49 case TAG_Special:
50 switch (FPU_Special(st0_ptr)) {
51 case TW_Denormal:
52 if (getsign(st0_ptr) == SIGN_POS)
53 setcc(0);
54 else
55 setcc(SW_C0);
56 if (denormal_operand() < 0) {
57#ifdef PECULIAR_486
58 /* This is weird! */
59 if (getsign(st0_ptr) == SIGN_POS)
60 setcc(SW_C3);
71#endif /* PECULIAR_486 */ 61#endif /* PECULIAR_486 */
72 return; 62 return;
73 } 63 }
74 break; 64 break;
75 case TW_NaN: 65 case TW_NaN:
76 setcc(SW_C0|SW_C2|SW_C3); /* Operand is not comparable */ 66 setcc(SW_C0 | SW_C2 | SW_C3); /* Operand is not comparable */
77 EXCEPTION(EX_Invalid); 67 EXCEPTION(EX_Invalid);
78 break; 68 break;
79 case TW_Infinity: 69 case TW_Infinity:
80 if (getsign(st0_ptr) == SIGN_POS) 70 if (getsign(st0_ptr) == SIGN_POS)
81 setcc(0); 71 setcc(0);
82 else 72 else
83 setcc(SW_C0); 73 setcc(SW_C0);
84 break; 74 break;
85 default: 75 default:
86 setcc(SW_C0|SW_C2|SW_C3); /* Operand is not comparable */ 76 setcc(SW_C0 | SW_C2 | SW_C3); /* Operand is not comparable */
87 EXCEPTION(EX_INTERNAL|0x14); 77 EXCEPTION(EX_INTERNAL | 0x14);
88 break; 78 break;
79 }
80 break;
81 case TAG_Empty:
82 setcc(SW_C0 | SW_C2 | SW_C3);
83 EXCEPTION(EX_StackUnder);
84 break;
89 } 85 }
90 break;
91 case TAG_Empty:
92 setcc(SW_C0|SW_C2|SW_C3);
93 EXCEPTION(EX_StackUnder);
94 break;
95 }
96} 86}
97 87
98
99static void fxam(FPU_REG *st0_ptr, u_char st0tag) 88static void fxam(FPU_REG *st0_ptr, u_char st0tag)
100{ 89{
101 int c = 0; 90 int c = 0;
102 switch (st0tag) 91 switch (st0tag) {
103 { 92 case TAG_Empty:
104 case TAG_Empty: 93 c = SW_C3 | SW_C0;
105 c = SW_C3|SW_C0; 94 break;
106 break; 95 case TAG_Zero:
107 case TAG_Zero: 96 c = SW_C3;
108 c = SW_C3; 97 break;
109 break; 98 case TAG_Valid:
110 case TAG_Valid: 99 c = SW_C2;
111 c = SW_C2; 100 break;
112 break; 101 case TAG_Special:
113 case TAG_Special: 102 switch (FPU_Special(st0_ptr)) {
114 switch ( FPU_Special(st0_ptr) ) 103 case TW_Denormal:
115 { 104 c = SW_C2 | SW_C3; /* Denormal */
116 case TW_Denormal: 105 break;
117 c = SW_C2|SW_C3; /* Denormal */ 106 case TW_NaN:
118 break; 107 /* We also use NaN for unsupported types. */
119 case TW_NaN: 108 if ((st0_ptr->sigh & 0x80000000)
120 /* We also use NaN for unsupported types. */ 109 && (exponent(st0_ptr) == EXP_OVER))
121 if ( (st0_ptr->sigh & 0x80000000) && (exponent(st0_ptr) == EXP_OVER) ) 110 c = SW_C0;
122 c = SW_C0; 111 break;
123 break; 112 case TW_Infinity:
124 case TW_Infinity: 113 c = SW_C2 | SW_C0;
125 c = SW_C2|SW_C0; 114 break;
126 break; 115 }
127 } 116 }
128 } 117 if (getsign(st0_ptr) == SIGN_NEG)
129 if ( getsign(st0_ptr) == SIGN_NEG ) 118 c |= SW_C1;
130 c |= SW_C1; 119 setcc(c);
131 setcc(c);
132} 120}
133 121
134
135static FUNC_ST0 const fp_etc_table[] = { 122static FUNC_ST0 const fp_etc_table[] = {
136 fchs, fabs, (FUNC_ST0)FPU_illegal, (FUNC_ST0)FPU_illegal, 123 fchs, fabs, (FUNC_ST0) FPU_illegal, (FUNC_ST0) FPU_illegal,
137 ftst_, fxam, (FUNC_ST0)FPU_illegal, (FUNC_ST0)FPU_illegal 124 ftst_, fxam, (FUNC_ST0) FPU_illegal, (FUNC_ST0) FPU_illegal
138}; 125};
139 126
140void FPU_etc(void) 127void FPU_etc(void)
141{ 128{
142 (fp_etc_table[FPU_rm])(&st(0), FPU_gettag0()); 129 (fp_etc_table[FPU_rm]) (&st(0), FPU_gettag0());
143} 130}
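
FPU_etc() is table driven: fp_etc_table[] holds one FUNC_ST0 pointer per value of the ModR/M reg field, with FPU_illegal cast into the unused slots. A reduced stand-alone analog of the same dispatch pattern, using invented handlers on a plain double rather than the emulator's FPU_REG:

#include <stdio.h>

typedef void (*op_fn)(double *st0);

static void op_chs(double *st0) { *st0 = -*st0; }                 /* like fchs */
static void op_abs(double *st0) { if (*st0 < 0) *st0 = -*st0; }   /* like fabs */
static void op_illegal(double *st0) { (void)st0; puts("illegal opcode"); }

/* Unused encodings fall through to the illegal handler, the same way
   fp_etc_table[] fills its spare slots with FPU_illegal. */
static op_fn const op_table[8] = {
        op_chs, op_abs, op_illegal, op_illegal,
        op_illegal, op_illegal, op_illegal, op_illegal
};

int main(void)
{
        double st0 = -2.5;
        unsigned rm = 1;                /* reg field decoded from ModR/M */

        op_table[rm & 7](&st0);         /* dispatch, like FPU_etc() */
        printf("st0 = %g\n", st0);
        return 0;
}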
diff --git a/arch/x86/math-emu/fpu_proto.h b/arch/x86/math-emu/fpu_proto.h
index 37a8a7fe7e2b..aa49b6a0d850 100644
--- a/arch/x86/math-emu/fpu_proto.h
+++ b/arch/x86/math-emu/fpu_proto.h
@@ -66,7 +66,7 @@ extern int FPU_Special(FPU_REG const *ptr);
66extern int isNaN(FPU_REG const *ptr); 66extern int isNaN(FPU_REG const *ptr);
67extern void FPU_pop(void); 67extern void FPU_pop(void);
68extern int FPU_empty_i(int stnr); 68extern int FPU_empty_i(int stnr);
69extern int FPU_stackoverflow(FPU_REG **st_new_ptr); 69extern int FPU_stackoverflow(FPU_REG ** st_new_ptr);
70extern void FPU_copy_to_regi(FPU_REG const *r, u_char tag, int stnr); 70extern void FPU_copy_to_regi(FPU_REG const *r, u_char tag, int stnr);
71extern void FPU_copy_to_reg1(FPU_REG const *r, u_char tag); 71extern void FPU_copy_to_reg1(FPU_REG const *r, u_char tag);
72extern void FPU_copy_to_reg0(FPU_REG const *r, u_char tag); 72extern void FPU_copy_to_reg0(FPU_REG const *r, u_char tag);
@@ -75,21 +75,23 @@ extern void FPU_triga(void);
75extern void FPU_trigb(void); 75extern void FPU_trigb(void);
76/* get_address.c */ 76/* get_address.c */
77extern void __user *FPU_get_address(u_char FPU_modrm, unsigned long *fpu_eip, 77extern void __user *FPU_get_address(u_char FPU_modrm, unsigned long *fpu_eip,
78 struct address *addr, fpu_addr_modes addr_modes); 78 struct address *addr,
79 fpu_addr_modes addr_modes);
79extern void __user *FPU_get_address_16(u_char FPU_modrm, unsigned long *fpu_eip, 80extern void __user *FPU_get_address_16(u_char FPU_modrm, unsigned long *fpu_eip,
80 struct address *addr, fpu_addr_modes addr_modes); 81 struct address *addr,
82 fpu_addr_modes addr_modes);
81/* load_store.c */ 83/* load_store.c */
82extern int FPU_load_store(u_char type, fpu_addr_modes addr_modes, 84extern int FPU_load_store(u_char type, fpu_addr_modes addr_modes,
83 void __user *data_address); 85 void __user * data_address);
84/* poly_2xm1.c */ 86/* poly_2xm1.c */
85extern int poly_2xm1(u_char sign, FPU_REG *arg, FPU_REG *result); 87extern int poly_2xm1(u_char sign, FPU_REG * arg, FPU_REG *result);
86/* poly_atan.c */ 88/* poly_atan.c */
87extern void poly_atan(FPU_REG *st0_ptr, u_char st0_tag, FPU_REG *st1_ptr, 89extern void poly_atan(FPU_REG * st0_ptr, u_char st0_tag, FPU_REG *st1_ptr,
88 u_char st1_tag); 90 u_char st1_tag);
89/* poly_l2.c */ 91/* poly_l2.c */
90extern void poly_l2(FPU_REG *st0_ptr, FPU_REG *st1_ptr, u_char st1_sign); 92extern void poly_l2(FPU_REG *st0_ptr, FPU_REG *st1_ptr, u_char st1_sign);
91extern int poly_l2p1(u_char s0, u_char s1, FPU_REG *r0, FPU_REG *r1, 93extern int poly_l2p1(u_char s0, u_char s1, FPU_REG *r0, FPU_REG *r1,
92 FPU_REG *d); 94 FPU_REG * d);
93/* poly_sin.c */ 95/* poly_sin.c */
94extern void poly_sine(FPU_REG *st0_ptr); 96extern void poly_sine(FPU_REG *st0_ptr);
95extern void poly_cos(FPU_REG *st0_ptr); 97extern void poly_cos(FPU_REG *st0_ptr);
@@ -117,10 +119,13 @@ extern int FPU_load_int32(long __user *_s, FPU_REG *loaded_data);
117extern int FPU_load_int16(short __user *_s, FPU_REG *loaded_data); 119extern int FPU_load_int16(short __user *_s, FPU_REG *loaded_data);
118extern int FPU_load_bcd(u_char __user *s); 120extern int FPU_load_bcd(u_char __user *s);
119extern int FPU_store_extended(FPU_REG *st0_ptr, u_char st0_tag, 121extern int FPU_store_extended(FPU_REG *st0_ptr, u_char st0_tag,
120 long double __user *d); 122 long double __user * d);
121extern int FPU_store_double(FPU_REG *st0_ptr, u_char st0_tag, double __user *dfloat); 123extern int FPU_store_double(FPU_REG *st0_ptr, u_char st0_tag,
122extern int FPU_store_single(FPU_REG *st0_ptr, u_char st0_tag, float __user *single); 124 double __user * dfloat);
123extern int FPU_store_int64(FPU_REG *st0_ptr, u_char st0_tag, long long __user *d); 125extern int FPU_store_single(FPU_REG *st0_ptr, u_char st0_tag,
126 float __user * single);
127extern int FPU_store_int64(FPU_REG *st0_ptr, u_char st0_tag,
128 long long __user * d);
124extern int FPU_store_int32(FPU_REG *st0_ptr, u_char st0_tag, long __user *d); 129extern int FPU_store_int32(FPU_REG *st0_ptr, u_char st0_tag, long __user *d);
125extern int FPU_store_int16(FPU_REG *st0_ptr, u_char st0_tag, short __user *d); 130extern int FPU_store_int16(FPU_REG *st0_ptr, u_char st0_tag, short __user *d);
126extern int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d); 131extern int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d);
@@ -137,4 +142,3 @@ extern int FPU_div(int flags, int regrm, int control_w);
137/* reg_convert.c */ 142/* reg_convert.c */
138extern int FPU_to_exp16(FPU_REG const *a, FPU_REG *x); 143extern int FPU_to_exp16(FPU_REG const *a, FPU_REG *x);
139#endif /* _FPU_PROTO_H */ 144#endif /* _FPU_PROTO_H */
140
diff --git a/arch/x86/math-emu/fpu_tags.c b/arch/x86/math-emu/fpu_tags.c
index cb436fe20e4c..d9c657cd7746 100644
--- a/arch/x86/math-emu/fpu_tags.c
+++ b/arch/x86/math-emu/fpu_tags.c
@@ -14,114 +14,102 @@
14#include "fpu_system.h" 14#include "fpu_system.h"
15#include "exception.h" 15#include "exception.h"
16 16
17
18void FPU_pop(void) 17void FPU_pop(void)
19{ 18{
20 fpu_tag_word |= 3 << ((top & 7)*2); 19 fpu_tag_word |= 3 << ((top & 7) * 2);
21 top++; 20 top++;
22} 21}
23 22
24
25int FPU_gettag0(void) 23int FPU_gettag0(void)
26{ 24{
27 return (fpu_tag_word >> ((top & 7)*2)) & 3; 25 return (fpu_tag_word >> ((top & 7) * 2)) & 3;
28} 26}
29 27
30
31int FPU_gettagi(int stnr) 28int FPU_gettagi(int stnr)
32{ 29{
33 return (fpu_tag_word >> (((top+stnr) & 7)*2)) & 3; 30 return (fpu_tag_word >> (((top + stnr) & 7) * 2)) & 3;
34} 31}
35 32
36
37int FPU_gettag(int regnr) 33int FPU_gettag(int regnr)
38{ 34{
39 return (fpu_tag_word >> ((regnr & 7)*2)) & 3; 35 return (fpu_tag_word >> ((regnr & 7) * 2)) & 3;
40} 36}
41 37
42
43void FPU_settag0(int tag) 38void FPU_settag0(int tag)
44{ 39{
45 int regnr = top; 40 int regnr = top;
46 regnr &= 7; 41 regnr &= 7;
47 fpu_tag_word &= ~(3 << (regnr*2)); 42 fpu_tag_word &= ~(3 << (regnr * 2));
48 fpu_tag_word |= (tag & 3) << (regnr*2); 43 fpu_tag_word |= (tag & 3) << (regnr * 2);
49} 44}
50 45
51
52void FPU_settagi(int stnr, int tag) 46void FPU_settagi(int stnr, int tag)
53{ 47{
54 int regnr = stnr+top; 48 int regnr = stnr + top;
55 regnr &= 7; 49 regnr &= 7;
56 fpu_tag_word &= ~(3 << (regnr*2)); 50 fpu_tag_word &= ~(3 << (regnr * 2));
57 fpu_tag_word |= (tag & 3) << (regnr*2); 51 fpu_tag_word |= (tag & 3) << (regnr * 2);
58} 52}
59 53
60
61void FPU_settag(int regnr, int tag) 54void FPU_settag(int regnr, int tag)
62{ 55{
63 regnr &= 7; 56 regnr &= 7;
64 fpu_tag_word &= ~(3 << (regnr*2)); 57 fpu_tag_word &= ~(3 << (regnr * 2));
65 fpu_tag_word |= (tag & 3) << (regnr*2); 58 fpu_tag_word |= (tag & 3) << (regnr * 2);
66} 59}
67 60
68
69int FPU_Special(FPU_REG const *ptr) 61int FPU_Special(FPU_REG const *ptr)
70{ 62{
71 int exp = exponent(ptr); 63 int exp = exponent(ptr);
72 64
73 if ( exp == EXP_BIAS+EXP_UNDER ) 65 if (exp == EXP_BIAS + EXP_UNDER)
74 return TW_Denormal; 66 return TW_Denormal;
75 else if ( exp != EXP_BIAS+EXP_OVER ) 67 else if (exp != EXP_BIAS + EXP_OVER)
76 return TW_NaN; 68 return TW_NaN;
77 else if ( (ptr->sigh == 0x80000000) && (ptr->sigl == 0) ) 69 else if ((ptr->sigh == 0x80000000) && (ptr->sigl == 0))
78 return TW_Infinity; 70 return TW_Infinity;
79 return TW_NaN; 71 return TW_NaN;
80} 72}
81 73
82
83int isNaN(FPU_REG const *ptr) 74int isNaN(FPU_REG const *ptr)
84{ 75{
85 return ( (exponent(ptr) == EXP_BIAS+EXP_OVER) 76 return ((exponent(ptr) == EXP_BIAS + EXP_OVER)
86 && !((ptr->sigh == 0x80000000) && (ptr->sigl == 0)) ); 77 && !((ptr->sigh == 0x80000000) && (ptr->sigl == 0)));
87} 78}
88 79
89
90int FPU_empty_i(int stnr) 80int FPU_empty_i(int stnr)
91{ 81{
92 int regnr = (top+stnr) & 7; 82 int regnr = (top + stnr) & 7;
93 83
94 return ((fpu_tag_word >> (regnr*2)) & 3) == TAG_Empty; 84 return ((fpu_tag_word >> (regnr * 2)) & 3) == TAG_Empty;
95} 85}
96 86
97 87int FPU_stackoverflow(FPU_REG ** st_new_ptr)
98int FPU_stackoverflow(FPU_REG **st_new_ptr)
99{ 88{
100 *st_new_ptr = &st(-1); 89 *st_new_ptr = &st(-1);
101 90
102 return ((fpu_tag_word >> (((top - 1) & 7)*2)) & 3) != TAG_Empty; 91 return ((fpu_tag_word >> (((top - 1) & 7) * 2)) & 3) != TAG_Empty;
103} 92}
104 93
105
106void FPU_copy_to_regi(FPU_REG const *r, u_char tag, int stnr) 94void FPU_copy_to_regi(FPU_REG const *r, u_char tag, int stnr)
107{ 95{
108 reg_copy(r, &st(stnr)); 96 reg_copy(r, &st(stnr));
109 FPU_settagi(stnr, tag); 97 FPU_settagi(stnr, tag);
110} 98}
111 99
112void FPU_copy_to_reg1(FPU_REG const *r, u_char tag) 100void FPU_copy_to_reg1(FPU_REG const *r, u_char tag)
113{ 101{
114 reg_copy(r, &st(1)); 102 reg_copy(r, &st(1));
115 FPU_settagi(1, tag); 103 FPU_settagi(1, tag);
116} 104}
117 105
118void FPU_copy_to_reg0(FPU_REG const *r, u_char tag) 106void FPU_copy_to_reg0(FPU_REG const *r, u_char tag)
119{ 107{
120 int regnr = top; 108 int regnr = top;
121 regnr &= 7; 109 regnr &= 7;
122 110
123 reg_copy(r, &st(0)); 111 reg_copy(r, &st(0));
124 112
125 fpu_tag_word &= ~(3 << (regnr*2)); 113 fpu_tag_word &= ~(3 << (regnr * 2));
126 fpu_tag_word |= (tag & 3) << (regnr*2); 114 fpu_tag_word |= (tag & 3) << (regnr * 2);
127} 115}
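
fpu_tag_word keeps two bits per physical register, and st(i) lives in physical slot (top + i) & 7, which is why FPU_pop() both marks the departing slot empty and increments top. A stand-alone sketch of the same bookkeeping on a 16-bit word, assuming the usual x87 tag encoding (00 valid, 01 zero, 10 special, 11 empty); the function names are illustrative only:

#include <stdio.h>

enum { TAG_VALID = 0, TAG_ZERO = 1, TAG_SPECIAL = 2, TAG_EMPTY = 3 };

static unsigned short tag_word = 0xffff;        /* all registers empty */
static int top;                                 /* physical slot of st(0) */

static int get_tag(int slot)
{
        return (tag_word >> ((slot & 7) * 2)) & 3;
}

static void set_tag(int slot, int tag)
{
        slot &= 7;
        tag_word &= ~(3 << (slot * 2));
        tag_word |= (tag & 3) << (slot * 2);
}

/* st(i) lives in physical slot (top + i) & 7, as in FPU_gettagi(). */
static int get_tag_st(int i)
{
        return get_tag(top + i);
}

/* A pop marks the departing slot empty and advances top, as FPU_pop() does. */
static void pop(void)
{
        set_tag(top, TAG_EMPTY);
        top++;
}

int main(void)
{
        top = 7;                        /* pretend one value was pushed */
        set_tag(top, TAG_VALID);
        printf("st(0) tag before pop: %d\n", get_tag_st(0));
        pop();
        printf("st(0) tag after pop:  %d\n", get_tag_st(0));
        return 0;
}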
diff --git a/arch/x86/math-emu/fpu_trig.c b/arch/x86/math-emu/fpu_trig.c
index 403cbde1d425..ecd06680581c 100644
--- a/arch/x86/math-emu/fpu_trig.c
+++ b/arch/x86/math-emu/fpu_trig.c
@@ -15,11 +15,10 @@
15#include "fpu_emu.h" 15#include "fpu_emu.h"
16#include "status_w.h" 16#include "status_w.h"
17#include "control_w.h" 17#include "control_w.h"
18#include "reg_constant.h" 18#include "reg_constant.h"
19 19
20static void rem_kernel(unsigned long long st0, unsigned long long *y, 20static void rem_kernel(unsigned long long st0, unsigned long long *y,
21 unsigned long long st1, 21 unsigned long long st1, unsigned long long q, int n);
22 unsigned long long q, int n);
23 22
24#define BETTER_THAN_486 23#define BETTER_THAN_486
25 24
@@ -33,788 +32,706 @@ static void rem_kernel(unsigned long long st0, unsigned long long *y,
33 precision of the result sometimes degrades to about 63.9 bits */ 32 precision of the result sometimes degrades to about 63.9 bits */
34static int trig_arg(FPU_REG *st0_ptr, int even) 33static int trig_arg(FPU_REG *st0_ptr, int even)
35{ 34{
36 FPU_REG tmp; 35 FPU_REG tmp;
37 u_char tmptag; 36 u_char tmptag;
38 unsigned long long q; 37 unsigned long long q;
39 int old_cw = control_word, saved_status = partial_status; 38 int old_cw = control_word, saved_status = partial_status;
40 int tag, st0_tag = TAG_Valid; 39 int tag, st0_tag = TAG_Valid;
41 40
42 if ( exponent(st0_ptr) >= 63 ) 41 if (exponent(st0_ptr) >= 63) {
43 { 42 partial_status |= SW_C2; /* Reduction incomplete. */
44 partial_status |= SW_C2; /* Reduction incomplete. */ 43 return -1;
45 return -1; 44 }
46 }
47
48 control_word &= ~CW_RC;
49 control_word |= RC_CHOP;
50
51 setpositive(st0_ptr);
52 tag = FPU_u_div(st0_ptr, &CONST_PI2, &tmp, PR_64_BITS | RC_CHOP | 0x3f,
53 SIGN_POS);
54
55 FPU_round_to_int(&tmp, tag); /* Fortunately, this can't overflow
56 to 2^64 */
57 q = significand(&tmp);
58 if ( q )
59 {
60 rem_kernel(significand(st0_ptr),
61 &significand(&tmp),
62 significand(&CONST_PI2),
63 q, exponent(st0_ptr) - exponent(&CONST_PI2));
64 setexponent16(&tmp, exponent(&CONST_PI2));
65 st0_tag = FPU_normalize(&tmp);
66 FPU_copy_to_reg0(&tmp, st0_tag);
67 }
68
69 if ( (even && !(q & 1)) || (!even && (q & 1)) )
70 {
71 st0_tag = FPU_sub(REV|LOADED|TAG_Valid, (int)&CONST_PI2, FULL_PRECISION);
72 45
73#ifdef BETTER_THAN_486 46 control_word &= ~CW_RC;
74 /* So far, the results are exact but based upon a 64 bit 47 control_word |= RC_CHOP;
75 precision approximation to pi/2. The technique used 48
76 now is equivalent to using an approximation to pi/2 which 49 setpositive(st0_ptr);
77 is accurate to about 128 bits. */ 50 tag = FPU_u_div(st0_ptr, &CONST_PI2, &tmp, PR_64_BITS | RC_CHOP | 0x3f,
78 if ( (exponent(st0_ptr) <= exponent(&CONST_PI2extra) + 64) || (q > 1) ) 51 SIGN_POS);
79 { 52
80 /* This code gives the effect of having pi/2 to better than 53 FPU_round_to_int(&tmp, tag); /* Fortunately, this can't overflow
81 128 bits precision. */ 54 to 2^64 */
82 55 q = significand(&tmp);
83 significand(&tmp) = q + 1; 56 if (q) {
84 setexponent16(&tmp, 63); 57 rem_kernel(significand(st0_ptr),
85 FPU_normalize(&tmp); 58 &significand(&tmp),
86 tmptag = 59 significand(&CONST_PI2),
87 FPU_u_mul(&CONST_PI2extra, &tmp, &tmp, FULL_PRECISION, SIGN_POS, 60 q, exponent(st0_ptr) - exponent(&CONST_PI2));
88 exponent(&CONST_PI2extra) + exponent(&tmp)); 61 setexponent16(&tmp, exponent(&CONST_PI2));
89 setsign(&tmp, getsign(&CONST_PI2extra)); 62 st0_tag = FPU_normalize(&tmp);
90 st0_tag = FPU_add(&tmp, tmptag, 0, FULL_PRECISION); 63 FPU_copy_to_reg0(&tmp, st0_tag);
91 if ( signnegative(st0_ptr) )
92 {
93 /* CONST_PI2extra is negative, so the result of the addition
94 can be negative. This means that the argument is actually
95 in a different quadrant. The correction is always < pi/2,
96 so it can't overflow into yet another quadrant. */
97 setpositive(st0_ptr);
98 q++;
99 }
100 } 64 }
65
66 if ((even && !(q & 1)) || (!even && (q & 1))) {
67 st0_tag =
68 FPU_sub(REV | LOADED | TAG_Valid, (int)&CONST_PI2,
69 FULL_PRECISION);
70
71#ifdef BETTER_THAN_486
72 /* So far, the results are exact but based upon a 64 bit
73 precision approximation to pi/2. The technique used
74 now is equivalent to using an approximation to pi/2 which
75 is accurate to about 128 bits. */
76 if ((exponent(st0_ptr) <= exponent(&CONST_PI2extra) + 64)
77 || (q > 1)) {
78 /* This code gives the effect of having pi/2 to better than
79 128 bits precision. */
80
81 significand(&tmp) = q + 1;
82 setexponent16(&tmp, 63);
83 FPU_normalize(&tmp);
84 tmptag =
85 FPU_u_mul(&CONST_PI2extra, &tmp, &tmp,
86 FULL_PRECISION, SIGN_POS,
87 exponent(&CONST_PI2extra) +
88 exponent(&tmp));
89 setsign(&tmp, getsign(&CONST_PI2extra));
90 st0_tag = FPU_add(&tmp, tmptag, 0, FULL_PRECISION);
91 if (signnegative(st0_ptr)) {
92 /* CONST_PI2extra is negative, so the result of the addition
93 can be negative. This means that the argument is actually
94 in a different quadrant. The correction is always < pi/2,
95 so it can't overflow into yet another quadrant. */
96 setpositive(st0_ptr);
97 q++;
98 }
99 }
101#endif /* BETTER_THAN_486 */ 100#endif /* BETTER_THAN_486 */
102 } 101 }
103#ifdef BETTER_THAN_486 102#ifdef BETTER_THAN_486
104 else 103 else {
105 { 104 /* So far, the results are exact but based upon a 64 bit
106 /* So far, the results are exact but based upon a 64 bit 105 precision approximation to pi/2. The technique used
107 precision approximation to pi/2. The technique used 106 now is equivalent to using an approximation to pi/2 which
108 now is equivalent to using an approximation to pi/2 which 107 is accurate to about 128 bits. */
109 is accurate to about 128 bits. */ 108 if (((q > 0)
110 if ( ((q > 0) && (exponent(st0_ptr) <= exponent(&CONST_PI2extra) + 64)) 109 && (exponent(st0_ptr) <= exponent(&CONST_PI2extra) + 64))
111 || (q > 1) ) 110 || (q > 1)) {
 112 { 111 /* This code gives the effect of having pi/2 to better than
 113 /* This code gives the effect of having pi/2 to better than 112 128 bits precision. */
114 128 bits precision. */ 113
115 114 significand(&tmp) = q;
116 significand(&tmp) = q; 115 setexponent16(&tmp, 63);
117 setexponent16(&tmp, 63); 116 FPU_normalize(&tmp); /* This must return TAG_Valid */
118 FPU_normalize(&tmp); /* This must return TAG_Valid */ 117 tmptag =
119 tmptag = FPU_u_mul(&CONST_PI2extra, &tmp, &tmp, FULL_PRECISION, 118 FPU_u_mul(&CONST_PI2extra, &tmp, &tmp,
120 SIGN_POS, 119 FULL_PRECISION, SIGN_POS,
121 exponent(&CONST_PI2extra) + exponent(&tmp)); 120 exponent(&CONST_PI2extra) +
122 setsign(&tmp, getsign(&CONST_PI2extra)); 121 exponent(&tmp));
123 st0_tag = FPU_sub(LOADED|(tmptag & 0x0f), (int)&tmp, 122 setsign(&tmp, getsign(&CONST_PI2extra));
124 FULL_PRECISION); 123 st0_tag = FPU_sub(LOADED | (tmptag & 0x0f), (int)&tmp,
125 if ( (exponent(st0_ptr) == exponent(&CONST_PI2)) && 124 FULL_PRECISION);
126 ((st0_ptr->sigh > CONST_PI2.sigh) 125 if ((exponent(st0_ptr) == exponent(&CONST_PI2)) &&
127 || ((st0_ptr->sigh == CONST_PI2.sigh) 126 ((st0_ptr->sigh > CONST_PI2.sigh)
128 && (st0_ptr->sigl > CONST_PI2.sigl))) ) 127 || ((st0_ptr->sigh == CONST_PI2.sigh)
129 { 128 && (st0_ptr->sigl > CONST_PI2.sigl)))) {
130 /* CONST_PI2extra is negative, so the result of the 129 /* CONST_PI2extra is negative, so the result of the
131 subtraction can be larger than pi/2. This means 130 subtraction can be larger than pi/2. This means
132 that the argument is actually in a different quadrant. 131 that the argument is actually in a different quadrant.
133 The correction is always < pi/2, so it can't overflow 132 The correction is always < pi/2, so it can't overflow
134 into yet another quadrant. */ 133 into yet another quadrant. */
135 st0_tag = FPU_sub(REV|LOADED|TAG_Valid, (int)&CONST_PI2, 134 st0_tag =
136 FULL_PRECISION); 135 FPU_sub(REV | LOADED | TAG_Valid,
137 q++; 136 (int)&CONST_PI2, FULL_PRECISION);
138 } 137 q++;
138 }
139 }
139 } 140 }
140 }
141#endif /* BETTER_THAN_486 */ 141#endif /* BETTER_THAN_486 */
142 142
143 FPU_settag0(st0_tag); 143 FPU_settag0(st0_tag);
144 control_word = old_cw; 144 control_word = old_cw;
145 partial_status = saved_status & ~SW_C2; /* Reduction complete. */ 145 partial_status = saved_status & ~SW_C2; /* Reduction complete. */
146 146
147 return (q & 3) | even; 147 return (q & 3) | even;
148} 148}
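
trig_arg() computes q = |st(0)| / (pi/2) with chop rounding, removes q multiples of pi/2 via rem_kernel(), and, under BETTER_THAN_486, folds in CONST_PI2extra, the next chunk of pi/2 below the 64-bit constant, so the reduction behaves as if pi/2 were known to roughly 128 bits. The same head/tail idea, shown as a hedged analogy in ordinary double precision rather than the emulator's format; the two constants below are the standard double-precision split of pi/2, not CONST_PI2/CONST_PI2extra:

#include <math.h>
#include <stdio.h>

/* pi/2 split into a leading double and a correction term, so that
   pio2_hi + pio2_lo carries roughly twice the precision of a single
   double -- the same role CONST_PI2 and CONST_PI2extra play above. */
static const double pio2_hi = 1.5707963267948966;
static const double pio2_lo = 6.123233995736766e-17;

/* Reduce x >= 0 into (roughly) [0, pi/2) and return the quadrant count. */
static double reduce_pio2(double x, long *q)
{
        *q = (long)floor(x / pio2_hi);
        /* Subtract the head and the tail separately to limit cancellation. */
        return (x - *q * pio2_hi) - *q * pio2_lo;
}

int main(void)
{
        long q;
        double r = reduce_pio2(10.0, &q);

        /* sin(x) should match +/- sin(r) or cos(r) depending on the quadrant. */
        printf("q = %ld, r = %.17g, residual = %.3g\n", q, r,
               sin(10.0) - (q % 4 >= 2 ? -1 : 1) * ((q & 1) ? cos(r) : sin(r)));
        return 0;
}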
149 149
150
151/* Convert a long to register */ 150/* Convert a long to register */
152static void convert_l2reg(long const *arg, int deststnr) 151static void convert_l2reg(long const *arg, int deststnr)
153{ 152{
154 int tag; 153 int tag;
155 long num = *arg; 154 long num = *arg;
156 u_char sign; 155 u_char sign;
157 FPU_REG *dest = &st(deststnr); 156 FPU_REG *dest = &st(deststnr);
158
159 if (num == 0)
160 {
161 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
162 return;
163 }
164
165 if (num > 0)
166 { sign = SIGN_POS; }
167 else
168 { num = -num; sign = SIGN_NEG; }
169
170 dest->sigh = num;
171 dest->sigl = 0;
172 setexponent16(dest, 31);
173 tag = FPU_normalize(dest);
174 FPU_settagi(deststnr, tag);
175 setsign(dest, sign);
176 return;
177}
178 157
158 if (num == 0) {
159 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
160 return;
161 }
162
163 if (num > 0) {
164 sign = SIGN_POS;
165 } else {
166 num = -num;
167 sign = SIGN_NEG;
168 }
169
170 dest->sigh = num;
171 dest->sigl = 0;
172 setexponent16(dest, 31);
173 tag = FPU_normalize(dest);
174 FPU_settagi(deststnr, tag);
175 setsign(dest, sign);
176 return;
177}
179 178
180static void single_arg_error(FPU_REG *st0_ptr, u_char st0_tag) 179static void single_arg_error(FPU_REG *st0_ptr, u_char st0_tag)
181{ 180{
182 if ( st0_tag == TAG_Empty ) 181 if (st0_tag == TAG_Empty)
183 FPU_stack_underflow(); /* Puts a QNaN in st(0) */ 182 FPU_stack_underflow(); /* Puts a QNaN in st(0) */
184 else if ( st0_tag == TW_NaN ) 183 else if (st0_tag == TW_NaN)
185 real_1op_NaN(st0_ptr); /* return with a NaN in st(0) */ 184 real_1op_NaN(st0_ptr); /* return with a NaN in st(0) */
186#ifdef PARANOID 185#ifdef PARANOID
187 else 186 else
188 EXCEPTION(EX_INTERNAL|0x0112); 187 EXCEPTION(EX_INTERNAL | 0x0112);
189#endif /* PARANOID */ 188#endif /* PARANOID */
190} 189}
191 190
192
193static void single_arg_2_error(FPU_REG *st0_ptr, u_char st0_tag) 191static void single_arg_2_error(FPU_REG *st0_ptr, u_char st0_tag)
194{ 192{
195 int isNaN; 193 int isNaN;
196 194
197 switch ( st0_tag ) 195 switch (st0_tag) {
198 { 196 case TW_NaN:
199 case TW_NaN: 197 isNaN = (exponent(st0_ptr) == EXP_OVER)
200 isNaN = (exponent(st0_ptr) == EXP_OVER) && (st0_ptr->sigh & 0x80000000); 198 && (st0_ptr->sigh & 0x80000000);
201 if ( isNaN && !(st0_ptr->sigh & 0x40000000) ) /* Signaling ? */ 199 if (isNaN && !(st0_ptr->sigh & 0x40000000)) { /* Signaling ? */
202 { 200 EXCEPTION(EX_Invalid);
203 EXCEPTION(EX_Invalid); 201 if (control_word & CW_Invalid) {
204 if ( control_word & CW_Invalid ) 202 /* The masked response */
205 { 203 /* Convert to a QNaN */
206 /* The masked response */ 204 st0_ptr->sigh |= 0x40000000;
207 /* Convert to a QNaN */ 205 push();
208 st0_ptr->sigh |= 0x40000000; 206 FPU_copy_to_reg0(st0_ptr, TAG_Special);
209 push(); 207 }
210 FPU_copy_to_reg0(st0_ptr, TAG_Special); 208 } else if (isNaN) {
211 } 209 /* A QNaN */
212 } 210 push();
213 else if ( isNaN ) 211 FPU_copy_to_reg0(st0_ptr, TAG_Special);
214 { 212 } else {
215 /* A QNaN */ 213 /* pseudoNaN or other unsupported */
216 push(); 214 EXCEPTION(EX_Invalid);
217 FPU_copy_to_reg0(st0_ptr, TAG_Special); 215 if (control_word & CW_Invalid) {
218 } 216 /* The masked response */
219 else 217 FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
220 { 218 push();
221 /* pseudoNaN or other unsupported */ 219 FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
222 EXCEPTION(EX_Invalid); 220 }
223 if ( control_word & CW_Invalid ) 221 }
224 { 222 break; /* return with a NaN in st(0) */
225 /* The masked response */
226 FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
227 push();
228 FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
229 }
230 }
231 break; /* return with a NaN in st(0) */
232#ifdef PARANOID 223#ifdef PARANOID
233 default: 224 default:
234 EXCEPTION(EX_INTERNAL|0x0112); 225 EXCEPTION(EX_INTERNAL | 0x0112);
235#endif /* PARANOID */ 226#endif /* PARANOID */
236 } 227 }
237} 228}
238 229
239
240/*---------------------------------------------------------------------------*/ 230/*---------------------------------------------------------------------------*/
241 231
242static void f2xm1(FPU_REG *st0_ptr, u_char tag) 232static void f2xm1(FPU_REG *st0_ptr, u_char tag)
243{ 233{
244 FPU_REG a; 234 FPU_REG a;
245 235
246 clear_C1(); 236 clear_C1();
247 237
248 if ( tag == TAG_Valid ) 238 if (tag == TAG_Valid) {
249 { 239 /* For an 80486 FPU, the result is undefined if the arg is >= 1.0 */
250 /* For an 80486 FPU, the result is undefined if the arg is >= 1.0 */ 240 if (exponent(st0_ptr) < 0) {
251 if ( exponent(st0_ptr) < 0 ) 241 denormal_arg:
252 {
253 denormal_arg:
254 242
255 FPU_to_exp16(st0_ptr, &a); 243 FPU_to_exp16(st0_ptr, &a);
256 244
257 /* poly_2xm1(x) requires 0 < st(0) < 1. */ 245 /* poly_2xm1(x) requires 0 < st(0) < 1. */
258 poly_2xm1(getsign(st0_ptr), &a, st0_ptr); 246 poly_2xm1(getsign(st0_ptr), &a, st0_ptr);
247 }
248 set_precision_flag_up(); /* 80486 appears to always do this */
249 return;
259 } 250 }
260 set_precision_flag_up(); /* 80486 appears to always do this */
261 return;
262 }
263 251
264 if ( tag == TAG_Zero ) 252 if (tag == TAG_Zero)
265 return; 253 return;
266 254
267 if ( tag == TAG_Special ) 255 if (tag == TAG_Special)
268 tag = FPU_Special(st0_ptr); 256 tag = FPU_Special(st0_ptr);
269 257
270 switch ( tag ) 258 switch (tag) {
271 { 259 case TW_Denormal:
272 case TW_Denormal: 260 if (denormal_operand() < 0)
273 if ( denormal_operand() < 0 ) 261 return;
274 return; 262 goto denormal_arg;
275 goto denormal_arg; 263 case TW_Infinity:
276 case TW_Infinity: 264 if (signnegative(st0_ptr)) {
277 if ( signnegative(st0_ptr) ) 265 /* -infinity gives -1 (p16-10) */
278 { 266 FPU_copy_to_reg0(&CONST_1, TAG_Valid);
279 /* -infinity gives -1 (p16-10) */ 267 setnegative(st0_ptr);
280 FPU_copy_to_reg0(&CONST_1, TAG_Valid); 268 }
281 setnegative(st0_ptr); 269 return;
270 default:
271 single_arg_error(st0_ptr, tag);
282 } 272 }
283 return;
284 default:
285 single_arg_error(st0_ptr, tag);
286 }
287} 273}
288 274
289
290static void fptan(FPU_REG *st0_ptr, u_char st0_tag) 275static void fptan(FPU_REG *st0_ptr, u_char st0_tag)
291{ 276{
292 FPU_REG *st_new_ptr; 277 FPU_REG *st_new_ptr;
293 int q; 278 int q;
294 u_char arg_sign = getsign(st0_ptr); 279 u_char arg_sign = getsign(st0_ptr);
295 280
296 /* Stack underflow has higher priority */ 281 /* Stack underflow has higher priority */
297 if ( st0_tag == TAG_Empty ) 282 if (st0_tag == TAG_Empty) {
298 { 283 FPU_stack_underflow(); /* Puts a QNaN in st(0) */
299 FPU_stack_underflow(); /* Puts a QNaN in st(0) */ 284 if (control_word & CW_Invalid) {
300 if ( control_word & CW_Invalid ) 285 st_new_ptr = &st(-1);
301 { 286 push();
302 st_new_ptr = &st(-1); 287 FPU_stack_underflow(); /* Puts a QNaN in the new st(0) */
303 push(); 288 }
304 FPU_stack_underflow(); /* Puts a QNaN in the new st(0) */ 289 return;
305 } 290 }
306 return; 291
307 } 292 if (STACK_OVERFLOW) {
308 293 FPU_stack_overflow();
309 if ( STACK_OVERFLOW ) 294 return;
310 { FPU_stack_overflow(); return; }
311
312 if ( st0_tag == TAG_Valid )
313 {
314 if ( exponent(st0_ptr) > -40 )
315 {
316 if ( (q = trig_arg(st0_ptr, 0)) == -1 )
317 {
318 /* Operand is out of range */
319 return;
320 }
321
322 poly_tan(st0_ptr);
323 setsign(st0_ptr, (q & 1) ^ (arg_sign != 0));
324 set_precision_flag_up(); /* We do not really know if up or down */
325 } 295 }
326 else
327 {
328 /* For a small arg, the result == the argument */
329 /* Underflow may happen */
330 296
331 denormal_arg: 297 if (st0_tag == TAG_Valid) {
298 if (exponent(st0_ptr) > -40) {
299 if ((q = trig_arg(st0_ptr, 0)) == -1) {
300 /* Operand is out of range */
301 return;
302 }
303
304 poly_tan(st0_ptr);
305 setsign(st0_ptr, (q & 1) ^ (arg_sign != 0));
306 set_precision_flag_up(); /* We do not really know if up or down */
307 } else {
308 /* For a small arg, the result == the argument */
309 /* Underflow may happen */
310
311 denormal_arg:
312
313 FPU_to_exp16(st0_ptr, st0_ptr);
332 314
333 FPU_to_exp16(st0_ptr, st0_ptr); 315 st0_tag =
334 316 FPU_round(st0_ptr, 1, 0, FULL_PRECISION, arg_sign);
335 st0_tag = FPU_round(st0_ptr, 1, 0, FULL_PRECISION, arg_sign); 317 FPU_settag0(st0_tag);
336 FPU_settag0(st0_tag); 318 }
319 push();
320 FPU_copy_to_reg0(&CONST_1, TAG_Valid);
321 return;
337 } 322 }
338 push();
339 FPU_copy_to_reg0(&CONST_1, TAG_Valid);
340 return;
341 }
342
343 if ( st0_tag == TAG_Zero )
344 {
345 push();
346 FPU_copy_to_reg0(&CONST_1, TAG_Valid);
347 setcc(0);
348 return;
349 }
350
351 if ( st0_tag == TAG_Special )
352 st0_tag = FPU_Special(st0_ptr);
353
354 if ( st0_tag == TW_Denormal )
355 {
356 if ( denormal_operand() < 0 )
357 return;
358 323
359 goto denormal_arg; 324 if (st0_tag == TAG_Zero) {
360 } 325 push();
361 326 FPU_copy_to_reg0(&CONST_1, TAG_Valid);
362 if ( st0_tag == TW_Infinity ) 327 setcc(0);
363 { 328 return;
364 /* The 80486 treats infinity as an invalid operand */ 329 }
365 if ( arith_invalid(0) >= 0 ) 330
366 { 331 if (st0_tag == TAG_Special)
367 st_new_ptr = &st(-1); 332 st0_tag = FPU_Special(st0_ptr);
368 push(); 333
369 arith_invalid(0); 334 if (st0_tag == TW_Denormal) {
335 if (denormal_operand() < 0)
336 return;
337
338 goto denormal_arg;
370 } 339 }
371 return;
372 }
373 340
374 single_arg_2_error(st0_ptr, st0_tag); 341 if (st0_tag == TW_Infinity) {
375} 342 /* The 80486 treats infinity as an invalid operand */
343 if (arith_invalid(0) >= 0) {
344 st_new_ptr = &st(-1);
345 push();
346 arith_invalid(0);
347 }
348 return;
349 }
376 350
351 single_arg_2_error(st0_ptr, st0_tag);
352}
377 353
378static void fxtract(FPU_REG *st0_ptr, u_char st0_tag) 354static void fxtract(FPU_REG *st0_ptr, u_char st0_tag)
379{ 355{
380 FPU_REG *st_new_ptr; 356 FPU_REG *st_new_ptr;
381 u_char sign; 357 u_char sign;
382 register FPU_REG *st1_ptr = st0_ptr; /* anticipate */ 358 register FPU_REG *st1_ptr = st0_ptr; /* anticipate */
383
384 if ( STACK_OVERFLOW )
385 { FPU_stack_overflow(); return; }
386
387 clear_C1();
388
389 if ( st0_tag == TAG_Valid )
390 {
391 long e;
392
393 push();
394 sign = getsign(st1_ptr);
395 reg_copy(st1_ptr, st_new_ptr);
396 setexponent16(st_new_ptr, exponent(st_new_ptr));
397
398 denormal_arg:
399
400 e = exponent16(st_new_ptr);
401 convert_l2reg(&e, 1);
402 setexponentpos(st_new_ptr, 0);
403 setsign(st_new_ptr, sign);
404 FPU_settag0(TAG_Valid); /* Needed if arg was a denormal */
405 return;
406 }
407 else if ( st0_tag == TAG_Zero )
408 {
409 sign = getsign(st0_ptr);
410
411 if ( FPU_divide_by_zero(0, SIGN_NEG) < 0 )
412 return;
413 359
414 push(); 360 if (STACK_OVERFLOW) {
415 FPU_copy_to_reg0(&CONST_Z, TAG_Zero); 361 FPU_stack_overflow();
416 setsign(st_new_ptr, sign); 362 return;
417 return; 363 }
418 }
419 364
420 if ( st0_tag == TAG_Special ) 365 clear_C1();
421 st0_tag = FPU_Special(st0_ptr);
422 366
423 if ( st0_tag == TW_Denormal ) 367 if (st0_tag == TAG_Valid) {
424 { 368 long e;
425 if (denormal_operand() < 0 )
426 return;
427 369
428 push(); 370 push();
429 sign = getsign(st1_ptr); 371 sign = getsign(st1_ptr);
430 FPU_to_exp16(st1_ptr, st_new_ptr); 372 reg_copy(st1_ptr, st_new_ptr);
431 goto denormal_arg; 373 setexponent16(st_new_ptr, exponent(st_new_ptr));
432 } 374
433 else if ( st0_tag == TW_Infinity ) 375 denormal_arg:
434 { 376
435 sign = getsign(st0_ptr); 377 e = exponent16(st_new_ptr);
436 setpositive(st0_ptr); 378 convert_l2reg(&e, 1);
437 push(); 379 setexponentpos(st_new_ptr, 0);
438 FPU_copy_to_reg0(&CONST_INF, TAG_Special); 380 setsign(st_new_ptr, sign);
439 setsign(st_new_ptr, sign); 381 FPU_settag0(TAG_Valid); /* Needed if arg was a denormal */
440 return; 382 return;
441 } 383 } else if (st0_tag == TAG_Zero) {
442 else if ( st0_tag == TW_NaN ) 384 sign = getsign(st0_ptr);
443 { 385
444 if ( real_1op_NaN(st0_ptr) < 0 ) 386 if (FPU_divide_by_zero(0, SIGN_NEG) < 0)
445 return; 387 return;
446 388
447 push(); 389 push();
448 FPU_copy_to_reg0(st0_ptr, TAG_Special); 390 FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
449 return; 391 setsign(st_new_ptr, sign);
450 } 392 return;
451 else if ( st0_tag == TAG_Empty ) 393 }
452 { 394
453 /* Is this the correct behaviour? */ 395 if (st0_tag == TAG_Special)
454 if ( control_word & EX_Invalid ) 396 st0_tag = FPU_Special(st0_ptr);
455 { 397
456 FPU_stack_underflow(); 398 if (st0_tag == TW_Denormal) {
457 push(); 399 if (denormal_operand() < 0)
458 FPU_stack_underflow(); 400 return;
401
402 push();
403 sign = getsign(st1_ptr);
404 FPU_to_exp16(st1_ptr, st_new_ptr);
405 goto denormal_arg;
406 } else if (st0_tag == TW_Infinity) {
407 sign = getsign(st0_ptr);
408 setpositive(st0_ptr);
409 push();
410 FPU_copy_to_reg0(&CONST_INF, TAG_Special);
411 setsign(st_new_ptr, sign);
412 return;
413 } else if (st0_tag == TW_NaN) {
414 if (real_1op_NaN(st0_ptr) < 0)
415 return;
416
417 push();
418 FPU_copy_to_reg0(st0_ptr, TAG_Special);
419 return;
420 } else if (st0_tag == TAG_Empty) {
421 /* Is this the correct behaviour? */
422 if (control_word & EX_Invalid) {
423 FPU_stack_underflow();
424 push();
425 FPU_stack_underflow();
426 } else
427 EXCEPTION(EX_StackUnder);
459 } 428 }
460 else
461 EXCEPTION(EX_StackUnder);
462 }
463#ifdef PARANOID 429#ifdef PARANOID
464 else 430 else
465 EXCEPTION(EX_INTERNAL | 0x119); 431 EXCEPTION(EX_INTERNAL | 0x119);
466#endif /* PARANOID */ 432#endif /* PARANOID */
467} 433}
468 434
469
470static void fdecstp(void) 435static void fdecstp(void)
471{ 436{
472 clear_C1(); 437 clear_C1();
473 top--; 438 top--;
474} 439}
475 440
476static void fincstp(void) 441static void fincstp(void)
477{ 442{
478 clear_C1(); 443 clear_C1();
479 top++; 444 top++;
480} 445}
481 446
482
483static void fsqrt_(FPU_REG *st0_ptr, u_char st0_tag) 447static void fsqrt_(FPU_REG *st0_ptr, u_char st0_tag)
484{ 448{
485 int expon; 449 int expon;
486 450
487 clear_C1(); 451 clear_C1();
488
489 if ( st0_tag == TAG_Valid )
490 {
491 u_char tag;
492
493 if (signnegative(st0_ptr))
494 {
495 arith_invalid(0); /* sqrt(negative) is invalid */
496 return;
497 }
498 452
499 /* make st(0) in [1.0 .. 4.0) */ 453 if (st0_tag == TAG_Valid) {
500 expon = exponent(st0_ptr); 454 u_char tag;
501 455
502 denormal_arg: 456 if (signnegative(st0_ptr)) {
503 457 arith_invalid(0); /* sqrt(negative) is invalid */
504 setexponent16(st0_ptr, (expon & 1)); 458 return;
505 459 }
506 /* Do the computation, the sign of the result will be positive. */ 460
507 tag = wm_sqrt(st0_ptr, 0, 0, control_word, SIGN_POS); 461 /* make st(0) in [1.0 .. 4.0) */
508 addexponent(st0_ptr, expon >> 1); 462 expon = exponent(st0_ptr);
509 FPU_settag0(tag); 463
510 return; 464 denormal_arg:
511 } 465
512 466 setexponent16(st0_ptr, (expon & 1));
513 if ( st0_tag == TAG_Zero ) 467
514 return; 468 /* Do the computation, the sign of the result will be positive. */
515 469 tag = wm_sqrt(st0_ptr, 0, 0, control_word, SIGN_POS);
516 if ( st0_tag == TAG_Special ) 470 addexponent(st0_ptr, expon >> 1);
517 st0_tag = FPU_Special(st0_ptr); 471 FPU_settag0(tag);
518 472 return;
519 if ( st0_tag == TW_Infinity )
520 {
521 if ( signnegative(st0_ptr) )
522 arith_invalid(0); /* sqrt(-Infinity) is invalid */
523 return;
524 }
525 else if ( st0_tag == TW_Denormal )
526 {
527 if (signnegative(st0_ptr))
528 {
529 arith_invalid(0); /* sqrt(negative) is invalid */
530 return;
531 } 473 }
532 474
533 if ( denormal_operand() < 0 ) 475 if (st0_tag == TAG_Zero)
534 return; 476 return;
535 477
536 FPU_to_exp16(st0_ptr, st0_ptr); 478 if (st0_tag == TAG_Special)
479 st0_tag = FPU_Special(st0_ptr);
537 480
538 expon = exponent16(st0_ptr); 481 if (st0_tag == TW_Infinity) {
482 if (signnegative(st0_ptr))
483 arith_invalid(0); /* sqrt(-Infinity) is invalid */
484 return;
485 } else if (st0_tag == TW_Denormal) {
486 if (signnegative(st0_ptr)) {
487 arith_invalid(0); /* sqrt(negative) is invalid */
488 return;
489 }
539 490
540 goto denormal_arg; 491 if (denormal_operand() < 0)
541 } 492 return;
542 493
543 single_arg_error(st0_ptr, st0_tag); 494 FPU_to_exp16(st0_ptr, st0_ptr);
544 495
545} 496 expon = exponent16(st0_ptr);
497
498 goto denormal_arg;
499 }
546 500
501 single_arg_error(st0_ptr, st0_tag);
502
503}
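
fsqrt_() reduces the argument so that only exponent & 1 remains, takes the root of the reduced value, then adds exponent >> 1 back, relying on the identity sqrt(m * 2^e) = sqrt(m * 2^(e & 1)) * 2^(e >> 1), where the arithmetic shift gives floor(e/2) even for negative e. A double-precision check of the same identity with frexp()/ldexp(); this is a numeric illustration, not the emulator's code:

#include <math.h>
#include <stdio.h>

/* sqrt via explicit exponent halving: split x into m * 2^e, fold the
   odd bit of e into the mantissa, take the root of the reduced value,
   then restore half of the exponent -- the same shape as fsqrt_(). */
static double sqrt_by_halving(double x)
{
        int e;
        double m = frexp(x, &e);        /* x = m * 2^e, m in [0.5, 1) */
        int half = e >> 1;              /* floor(e / 2), also for negative e */
        int odd  = e & 1;               /* 0 or 1 */

        return ldexp(sqrt(ldexp(m, odd)), half);
}

int main(void)
{
        double xs[] = { 2.0, 1e-12, 123456.789 };
        int i;

        for (i = 0; i < 3; i++)
                printf("%g: libm %.17g  halved %.17g\n",
                       xs[i], sqrt(xs[i]), sqrt_by_halving(xs[i]));
        return 0;
}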
547 504
548static void frndint_(FPU_REG *st0_ptr, u_char st0_tag) 505static void frndint_(FPU_REG *st0_ptr, u_char st0_tag)
549{ 506{
550 int flags, tag; 507 int flags, tag;
551 508
552 if ( st0_tag == TAG_Valid ) 509 if (st0_tag == TAG_Valid) {
553 { 510 u_char sign;
554 u_char sign;
555 511
556 denormal_arg: 512 denormal_arg:
557 513
558 sign = getsign(st0_ptr); 514 sign = getsign(st0_ptr);
559 515
560 if (exponent(st0_ptr) > 63) 516 if (exponent(st0_ptr) > 63)
561 return; 517 return;
518
519 if (st0_tag == TW_Denormal) {
520 if (denormal_operand() < 0)
521 return;
522 }
523
524 /* Fortunately, this can't overflow to 2^64 */
525 if ((flags = FPU_round_to_int(st0_ptr, st0_tag)))
526 set_precision_flag(flags);
562 527
563 if ( st0_tag == TW_Denormal ) 528 setexponent16(st0_ptr, 63);
564 { 529 tag = FPU_normalize(st0_ptr);
565 if (denormal_operand() < 0 ) 530 setsign(st0_ptr, sign);
566 return; 531 FPU_settag0(tag);
532 return;
567 } 533 }
568 534
569 /* Fortunately, this can't overflow to 2^64 */ 535 if (st0_tag == TAG_Zero)
570 if ( (flags = FPU_round_to_int(st0_ptr, st0_tag)) ) 536 return;
571 set_precision_flag(flags);
572
573 setexponent16(st0_ptr, 63);
574 tag = FPU_normalize(st0_ptr);
575 setsign(st0_ptr, sign);
576 FPU_settag0(tag);
577 return;
578 }
579
580 if ( st0_tag == TAG_Zero )
581 return;
582
583 if ( st0_tag == TAG_Special )
584 st0_tag = FPU_Special(st0_ptr);
585
586 if ( st0_tag == TW_Denormal )
587 goto denormal_arg;
588 else if ( st0_tag == TW_Infinity )
589 return;
590 else
591 single_arg_error(st0_ptr, st0_tag);
592}
593 537
538 if (st0_tag == TAG_Special)
539 st0_tag = FPU_Special(st0_ptr);
540
541 if (st0_tag == TW_Denormal)
542 goto denormal_arg;
543 else if (st0_tag == TW_Infinity)
544 return;
545 else
546 single_arg_error(st0_ptr, st0_tag);
547}
594 548
595static int fsin(FPU_REG *st0_ptr, u_char tag) 549static int fsin(FPU_REG *st0_ptr, u_char tag)
596{ 550{
597 u_char arg_sign = getsign(st0_ptr); 551 u_char arg_sign = getsign(st0_ptr);
598 552
599 if ( tag == TAG_Valid ) 553 if (tag == TAG_Valid) {
600 { 554 int q;
601 int q; 555
602 556 if (exponent(st0_ptr) > -40) {
603 if ( exponent(st0_ptr) > -40 ) 557 if ((q = trig_arg(st0_ptr, 0)) == -1) {
604 { 558 /* Operand is out of range */
605 if ( (q = trig_arg(st0_ptr, 0)) == -1 ) 559 return 1;
606 { 560 }
607 /* Operand is out of range */ 561
608 return 1; 562 poly_sine(st0_ptr);
609 } 563
610 564 if (q & 2)
611 poly_sine(st0_ptr); 565 changesign(st0_ptr);
612 566
613 if (q & 2) 567 setsign(st0_ptr, getsign(st0_ptr) ^ arg_sign);
614 changesign(st0_ptr); 568
615 569 /* We do not really know if up or down */
616 setsign(st0_ptr, getsign(st0_ptr) ^ arg_sign); 570 set_precision_flag_up();
617 571 return 0;
618 /* We do not really know if up or down */ 572 } else {
619 set_precision_flag_up(); 573 /* For a small arg, the result == the argument */
620 return 0; 574 set_precision_flag_up(); /* Must be up. */
575 return 0;
576 }
621 } 577 }
622 else 578
623 { 579 if (tag == TAG_Zero) {
624 /* For a small arg, the result == the argument */ 580 setcc(0);
625 set_precision_flag_up(); /* Must be up. */ 581 return 0;
626 return 0;
627 } 582 }
628 }
629
630 if ( tag == TAG_Zero )
631 {
632 setcc(0);
633 return 0;
634 }
635
636 if ( tag == TAG_Special )
637 tag = FPU_Special(st0_ptr);
638
639 if ( tag == TW_Denormal )
640 {
641 if ( denormal_operand() < 0 )
642 return 1;
643
644 /* For a small arg, the result == the argument */
645 /* Underflow may happen */
646 FPU_to_exp16(st0_ptr, st0_ptr);
647
648 tag = FPU_round(st0_ptr, 1, 0, FULL_PRECISION, arg_sign);
649
650 FPU_settag0(tag);
651
652 return 0;
653 }
654 else if ( tag == TW_Infinity )
655 {
656 /* The 80486 treats infinity as an invalid operand */
657 arith_invalid(0);
658 return 1;
659 }
660 else
661 {
662 single_arg_error(st0_ptr, tag);
663 return 1;
664 }
665}
666 583
584 if (tag == TAG_Special)
585 tag = FPU_Special(st0_ptr);
586
587 if (tag == TW_Denormal) {
588 if (denormal_operand() < 0)
589 return 1;
590
591 /* For a small arg, the result == the argument */
592 /* Underflow may happen */
593 FPU_to_exp16(st0_ptr, st0_ptr);
594
595 tag = FPU_round(st0_ptr, 1, 0, FULL_PRECISION, arg_sign);
596
597 FPU_settag0(tag);
598
599 return 0;
600 } else if (tag == TW_Infinity) {
601 /* The 80486 treats infinity as an invalid operand */
602 arith_invalid(0);
603 return 1;
604 } else {
605 single_arg_error(st0_ptr, tag);
606 return 1;
607 }
608}
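
fsin() only performs argument reduction when exponent(st(0)) > -40; for smaller arguments it simply rounds the argument itself, since |sin x - x| <= |x|^3 / 6 < 2^-80 * |x| there, far below half a unit in the last place of a 64-bit significand, so x is already the correctly rounded sine. The same effect is easy to observe in double precision (53-bit significand), assuming an ordinary libm:

#include <math.h>
#include <stdio.h>

int main(void)
{
        /* For |x| this small the cubic term of sin is far below the
           rounding granularity of x itself, so sin(x) rounds back to x. */
        double x = ldexp(1.0, -41);     /* 2^-41 */

        printf("x      = %.17g\n", x);
        printf("sin(x) = %.17g\n", sin(x));
        printf("equal?   %d\n", sin(x) == x);
        return 0;
}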
667 609
668static int f_cos(FPU_REG *st0_ptr, u_char tag) 610static int f_cos(FPU_REG *st0_ptr, u_char tag)
669{ 611{
670 u_char st0_sign; 612 u_char st0_sign;
671 613
672 st0_sign = getsign(st0_ptr); 614 st0_sign = getsign(st0_ptr);
673
674 if ( tag == TAG_Valid )
675 {
676 int q;
677
678 if ( exponent(st0_ptr) > -40 )
679 {
680 if ( (exponent(st0_ptr) < 0)
681 || ((exponent(st0_ptr) == 0)
682 && (significand(st0_ptr) <= 0xc90fdaa22168c234LL)) )
683 {
684 poly_cos(st0_ptr);
685
686 /* We do not really know if up or down */
687 set_precision_flag_down();
688
689 return 0;
690 }
691 else if ( (q = trig_arg(st0_ptr, FCOS)) != -1 )
692 {
693 poly_sine(st0_ptr);
694
695 if ((q+1) & 2)
696 changesign(st0_ptr);
697
698 /* We do not really know if up or down */
699 set_precision_flag_down();
700
701 return 0;
702 }
703 else
704 {
705 /* Operand is out of range */
706 return 1;
707 }
708 }
709 else
710 {
711 denormal_arg:
712 615
713 setcc(0); 616 if (tag == TAG_Valid) {
714 FPU_copy_to_reg0(&CONST_1, TAG_Valid); 617 int q;
618
619 if (exponent(st0_ptr) > -40) {
620 if ((exponent(st0_ptr) < 0)
621 || ((exponent(st0_ptr) == 0)
622 && (significand(st0_ptr) <=
623 0xc90fdaa22168c234LL))) {
624 poly_cos(st0_ptr);
625
626 /* We do not really know if up or down */
627 set_precision_flag_down();
628
629 return 0;
630 } else if ((q = trig_arg(st0_ptr, FCOS)) != -1) {
631 poly_sine(st0_ptr);
632
633 if ((q + 1) & 2)
634 changesign(st0_ptr);
635
636 /* We do not really know if up or down */
637 set_precision_flag_down();
638
639 return 0;
640 } else {
641 /* Operand is out of range */
642 return 1;
643 }
644 } else {
645 denormal_arg:
646
647 setcc(0);
648 FPU_copy_to_reg0(&CONST_1, TAG_Valid);
715#ifdef PECULIAR_486 649#ifdef PECULIAR_486
716 set_precision_flag_down(); /* 80486 appears to do this. */ 650 set_precision_flag_down(); /* 80486 appears to do this. */
717#else 651#else
718 set_precision_flag_up(); /* Must be up. */ 652 set_precision_flag_up(); /* Must be up. */
719#endif /* PECULIAR_486 */ 653#endif /* PECULIAR_486 */
720 return 0; 654 return 0;
655 }
656 } else if (tag == TAG_Zero) {
657 FPU_copy_to_reg0(&CONST_1, TAG_Valid);
658 setcc(0);
659 return 0;
721 } 660 }
722 }
723 else if ( tag == TAG_Zero )
724 {
725 FPU_copy_to_reg0(&CONST_1, TAG_Valid);
726 setcc(0);
727 return 0;
728 }
729
730 if ( tag == TAG_Special )
731 tag = FPU_Special(st0_ptr);
732
733 if ( tag == TW_Denormal )
734 {
735 if ( denormal_operand() < 0 )
736 return 1;
737
738 goto denormal_arg;
739 }
740 else if ( tag == TW_Infinity )
741 {
742 /* The 80486 treats infinity as an invalid operand */
743 arith_invalid(0);
744 return 1;
745 }
746 else
747 {
748 single_arg_error(st0_ptr, tag); /* requires st0_ptr == &st(0) */
749 return 1;
750 }
751}
752 661
662 if (tag == TAG_Special)
663 tag = FPU_Special(st0_ptr);
664
665 if (tag == TW_Denormal) {
666 if (denormal_operand() < 0)
667 return 1;
668
669 goto denormal_arg;
670 } else if (tag == TW_Infinity) {
671 /* The 80486 treats infinity as an invalid operand */
672 arith_invalid(0);
673 return 1;
674 } else {
675 single_arg_error(st0_ptr, tag); /* requires st0_ptr == &st(0) */
676 return 1;
677 }
678}
753 679
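The f_cos() path above leans on quadrant reduction: for large arguments trig_arg() reduces st(0) modulo pi/2 and returns a quadrant count q, after which poly_sine() plus a conditional sign flip ((q + 1) & 2) yields the cosine. A rough standalone illustration of the same identity, in plain C with libm rather than the emulator's extended-precision polynomials (cos_by_reduction is a made-up name; M_PI from <math.h> and x >= 0 are assumed for simplicity, cos being even):

#include <math.h>

/* cos(r + q*pi/2) expressed through sin/cos of the reduced argument r.
 * Naive reduction like this loses precision for large x, which is exactly
 * why the emulator performs it on 64-bit significands instead. */
double cos_by_reduction(double x)
{
        double q = floor(x / (M_PI / 2));
        double r = x - q * (M_PI / 2);

        switch ((long)q & 3) {
        case 0:  return  cos(r);
        case 1:  return -sin(r);
        case 2:  return -cos(r);
        default: return  sin(r);
        }
}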
754static void fcos(FPU_REG *st0_ptr, u_char st0_tag) 680static void fcos(FPU_REG *st0_ptr, u_char st0_tag)
755{ 681{
756 f_cos(st0_ptr, st0_tag); 682 f_cos(st0_ptr, st0_tag);
757} 683}
758 684
759
760static void fsincos(FPU_REG *st0_ptr, u_char st0_tag) 685static void fsincos(FPU_REG *st0_ptr, u_char st0_tag)
761{ 686{
762 FPU_REG *st_new_ptr; 687 FPU_REG *st_new_ptr;
763 FPU_REG arg; 688 FPU_REG arg;
764 u_char tag; 689 u_char tag;
765 690
766 /* Stack underflow has higher priority */ 691 /* Stack underflow has higher priority */
767 if ( st0_tag == TAG_Empty ) 692 if (st0_tag == TAG_Empty) {
768 { 693 FPU_stack_underflow(); /* Puts a QNaN in st(0) */
769 FPU_stack_underflow(); /* Puts a QNaN in st(0) */ 694 if (control_word & CW_Invalid) {
770 if ( control_word & CW_Invalid ) 695 st_new_ptr = &st(-1);
771 { 696 push();
772 st_new_ptr = &st(-1); 697 FPU_stack_underflow(); /* Puts a QNaN in the new st(0) */
773 push(); 698 }
774 FPU_stack_underflow(); /* Puts a QNaN in the new st(0) */ 699 return;
775 } 700 }
776 return; 701
777 } 702 if (STACK_OVERFLOW) {
778 703 FPU_stack_overflow();
779 if ( STACK_OVERFLOW ) 704 return;
780 { FPU_stack_overflow(); return; }
781
782 if ( st0_tag == TAG_Special )
783 tag = FPU_Special(st0_ptr);
784 else
785 tag = st0_tag;
786
787 if ( tag == TW_NaN )
788 {
789 single_arg_2_error(st0_ptr, TW_NaN);
790 return;
791 }
792 else if ( tag == TW_Infinity )
793 {
794 /* The 80486 treats infinity as an invalid operand */
795 if ( arith_invalid(0) >= 0 )
796 {
797 /* Masked response */
798 push();
799 arith_invalid(0);
800 } 705 }
801 return;
802 }
803
804 reg_copy(st0_ptr, &arg);
805 if ( !fsin(st0_ptr, st0_tag) )
806 {
807 push();
808 FPU_copy_to_reg0(&arg, st0_tag);
809 f_cos(&st(0), st0_tag);
810 }
811 else
812 {
813 /* An error, so restore st(0) */
814 FPU_copy_to_reg0(&arg, st0_tag);
815 }
816}
817 706
707 if (st0_tag == TAG_Special)
708 tag = FPU_Special(st0_ptr);
709 else
710 tag = st0_tag;
711
712 if (tag == TW_NaN) {
713 single_arg_2_error(st0_ptr, TW_NaN);
714 return;
715 } else if (tag == TW_Infinity) {
716 /* The 80486 treats infinity as an invalid operand */
717 if (arith_invalid(0) >= 0) {
718 /* Masked response */
719 push();
720 arith_invalid(0);
721 }
722 return;
723 }
724
725 reg_copy(st0_ptr, &arg);
726 if (!fsin(st0_ptr, st0_tag)) {
727 push();
728 FPU_copy_to_reg0(&arg, st0_tag);
729 f_cos(&st(0), st0_tag);
730 } else {
731 /* An error, so restore st(0) */
732 FPU_copy_to_reg0(&arg, st0_tag);
733 }
734}
818 735
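For reference, the net stack effect of the fsincos() emulation above matches the architectural FSINCOS: st(0) is replaced by its sine, then the cosine is pushed, so the instruction ends with st(0) = cos and st(1) = sin of the original operand. A minimal sketch of that effect with a toy double-based stack (fsincos_effect, stack and top are illustrative names, not emulator API):

#include <math.h>

/* Toy model of the FSINCOS stack effect; the real code operates on FPU_REG
 * entries and handles the NaN/infinity/underflow cases shown above. */
void fsincos_effect(double *stack, int *top)
{
        double x = stack[*top];

        stack[*top] = sin(x);     /* becomes st(1) after the push */
        stack[++*top] = cos(x);   /* pushed value is the new st(0) */
}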
819/*---------------------------------------------------------------------------*/ 736/*---------------------------------------------------------------------------*/
820/* The following all require two arguments: st(0) and st(1) */ 737/* The following all require two arguments: st(0) and st(1) */
@@ -826,1020 +743,901 @@ static void fsincos(FPU_REG *st0_ptr, u_char st0_tag)
826 result must be zero. 743 result must be zero.
827 */ 744 */
828static void rem_kernel(unsigned long long st0, unsigned long long *y, 745static void rem_kernel(unsigned long long st0, unsigned long long *y,
829 unsigned long long st1, 746 unsigned long long st1, unsigned long long q, int n)
830 unsigned long long q, int n)
831{ 747{
832 int dummy; 748 int dummy;
833 unsigned long long x; 749 unsigned long long x;
834 750
835 x = st0 << n; 751 x = st0 << n;
836 752
837 /* Do the required multiplication and subtraction in the one operation */ 753 /* Do the required multiplication and subtraction in the one operation */
838 754
839 /* lsw x -= lsw st1 * lsw q */ 755 /* lsw x -= lsw st1 * lsw q */
840 asm volatile ("mull %4; subl %%eax,%0; sbbl %%edx,%1" 756 asm volatile ("mull %4; subl %%eax,%0; sbbl %%edx,%1":"=m"
841 :"=m" (((unsigned *)&x)[0]), "=m" (((unsigned *)&x)[1]), 757 (((unsigned *)&x)[0]), "=m"(((unsigned *)&x)[1]),
842 "=a" (dummy) 758 "=a"(dummy)
843 :"2" (((unsigned *)&st1)[0]), "m" (((unsigned *)&q)[0]) 759 :"2"(((unsigned *)&st1)[0]), "m"(((unsigned *)&q)[0])
844 :"%dx"); 760 :"%dx");
845 /* msw x -= msw st1 * lsw q */ 761 /* msw x -= msw st1 * lsw q */
846 asm volatile ("mull %3; subl %%eax,%0" 762 asm volatile ("mull %3; subl %%eax,%0":"=m" (((unsigned *)&x)[1]),
847 :"=m" (((unsigned *)&x)[1]), "=a" (dummy) 763 "=a"(dummy)
848 :"1" (((unsigned *)&st1)[1]), "m" (((unsigned *)&q)[0]) 764 :"1"(((unsigned *)&st1)[1]), "m"(((unsigned *)&q)[0])
849 :"%dx"); 765 :"%dx");
850 /* msw x -= lsw st1 * msw q */ 766 /* msw x -= lsw st1 * msw q */
851 asm volatile ("mull %3; subl %%eax,%0" 767 asm volatile ("mull %3; subl %%eax,%0":"=m" (((unsigned *)&x)[1]),
852 :"=m" (((unsigned *)&x)[1]), "=a" (dummy) 768 "=a"(dummy)
853 :"1" (((unsigned *)&st1)[0]), "m" (((unsigned *)&q)[1]) 769 :"1"(((unsigned *)&st1)[0]), "m"(((unsigned *)&q)[1])
854 :"%dx"); 770 :"%dx");
855 771
856 *y = x; 772 *y = x;
857} 773}
858 774
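The inline assembly in rem_kernel() computes the low 64 bits of (st0 << n) - st1 * q: the three mull/sub sequences accumulate the 32x32 partial products of st1 * q that can affect bits 0..63, and the high*high partial product is deliberately dropped. Modulo 2^64 the result is simply the following (a sketch only; rem_kernel_sketch is a made-up name, and the real code avoids a 64x64 multiply because it targets i386):

/* Same value as rem_kernel() produces, expressed with plain C arithmetic
 * (all operations are modulo 2^64, matching the dropped carries above). */
static unsigned long long rem_kernel_sketch(unsigned long long st0,
                                            unsigned long long st1,
                                            unsigned long long q, int n)
{
        return (st0 << n) - st1 * q;
}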
859
860/* Remainder of st(0) / st(1) */ 775/* Remainder of st(0) / st(1) */
861/* This routine produces exact results, i.e. there is never any 776/* This routine produces exact results, i.e. there is never any
862 rounding or truncation, etc of the result. */ 777 rounding or truncation, etc of the result. */
863static void do_fprem(FPU_REG *st0_ptr, u_char st0_tag, int round) 778static void do_fprem(FPU_REG *st0_ptr, u_char st0_tag, int round)
864{ 779{
865 FPU_REG *st1_ptr = &st(1); 780 FPU_REG *st1_ptr = &st(1);
866 u_char st1_tag = FPU_gettagi(1); 781 u_char st1_tag = FPU_gettagi(1);
867 782
868 if ( !((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid)) ) 783 if (!((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid))) {
869 { 784 FPU_REG tmp, st0, st1;
870 FPU_REG tmp, st0, st1; 785 u_char st0_sign, st1_sign;
871 u_char st0_sign, st1_sign; 786 u_char tmptag;
872 u_char tmptag; 787 int tag;
873 int tag; 788 int old_cw;
874 int old_cw; 789 int expdif;
875 int expdif; 790 long long q;
876 long long q; 791 unsigned short saved_status;
877 unsigned short saved_status; 792 int cc;
878 int cc; 793
879 794 fprem_valid:
880 fprem_valid: 795 /* Convert registers for internal use. */
881 /* Convert registers for internal use. */ 796 st0_sign = FPU_to_exp16(st0_ptr, &st0);
882 st0_sign = FPU_to_exp16(st0_ptr, &st0); 797 st1_sign = FPU_to_exp16(st1_ptr, &st1);
883 st1_sign = FPU_to_exp16(st1_ptr, &st1); 798 expdif = exponent16(&st0) - exponent16(&st1);
884 expdif = exponent16(&st0) - exponent16(&st1); 799
885 800 old_cw = control_word;
886 old_cw = control_word; 801 cc = 0;
887 cc = 0; 802
888 803 /* We want the status following the denorm tests, but don't want
889 /* We want the status following the denorm tests, but don't want 804 the status changed by the arithmetic operations. */
890 the status changed by the arithmetic operations. */ 805 saved_status = partial_status;
891 saved_status = partial_status; 806 control_word &= ~CW_RC;
892 control_word &= ~CW_RC; 807 control_word |= RC_CHOP;
893 control_word |= RC_CHOP; 808
894 809 if (expdif < 64) {
895 if ( expdif < 64 ) 810 /* This should be the most common case */
896 { 811
897 /* This should be the most common case */ 812 if (expdif > -2) {
898 813 u_char sign = st0_sign ^ st1_sign;
899 if ( expdif > -2 ) 814 tag = FPU_u_div(&st0, &st1, &tmp,
900 { 815 PR_64_BITS | RC_CHOP | 0x3f,
901 u_char sign = st0_sign ^ st1_sign; 816 sign);
902 tag = FPU_u_div(&st0, &st1, &tmp, 817 setsign(&tmp, sign);
903 PR_64_BITS | RC_CHOP | 0x3f, 818
904 sign); 819 if (exponent(&tmp) >= 0) {
905 setsign(&tmp, sign); 820 FPU_round_to_int(&tmp, tag); /* Fortunately, this can't
906 821 overflow to 2^64 */
907 if ( exponent(&tmp) >= 0 ) 822 q = significand(&tmp);
908 { 823
909 FPU_round_to_int(&tmp, tag); /* Fortunately, this can't 824 rem_kernel(significand(&st0),
910 overflow to 2^64 */ 825 &significand(&tmp),
911 q = significand(&tmp); 826 significand(&st1),
912 827 q, expdif);
913 rem_kernel(significand(&st0), 828
914 &significand(&tmp), 829 setexponent16(&tmp, exponent16(&st1));
915 significand(&st1), 830 } else {
916 q, expdif); 831 reg_copy(&st0, &tmp);
917 832 q = 0;
918 setexponent16(&tmp, exponent16(&st1)); 833 }
919 } 834
920 else 835 if ((round == RC_RND)
921 { 836 && (tmp.sigh & 0xc0000000)) {
922 reg_copy(&st0, &tmp); 837 /* We may need to subtract st(1) once more,
923 q = 0; 838 to get a result <= 1/2 of st(1). */
924 } 839 unsigned long long x;
925 840 expdif =
926 if ( (round == RC_RND) && (tmp.sigh & 0xc0000000) ) 841 exponent16(&st1) - exponent16(&tmp);
927 { 842 if (expdif <= 1) {
928 /* We may need to subtract st(1) once more, 843 if (expdif == 0)
929 to get a result <= 1/2 of st(1). */ 844 x = significand(&st1) -
930 unsigned long long x; 845 significand(&tmp);
931 expdif = exponent16(&st1) - exponent16(&tmp); 846 else /* expdif is 1 */
932 if ( expdif <= 1 ) 847 x = (significand(&st1)
933 { 848 << 1) -
934 if ( expdif == 0 ) 849 significand(&tmp);
935 x = significand(&st1) - significand(&tmp); 850 if ((x < significand(&tmp)) ||
936 else /* expdif is 1 */ 851 /* or equi-distant (from 0 & st(1)) and q is odd */
937 x = (significand(&st1) << 1) - significand(&tmp); 852 ((x == significand(&tmp))
938 if ( (x < significand(&tmp)) || 853 && (q & 1))) {
939 /* or equi-distant (from 0 & st(1)) and q is odd */ 854 st0_sign = !st0_sign;
940 ((x == significand(&tmp)) && (q & 1) ) ) 855 significand(&tmp) = x;
941 { 856 q++;
942 st0_sign = ! st0_sign; 857 }
943 significand(&tmp) = x; 858 }
944 q++; 859 }
860
861 if (q & 4)
862 cc |= SW_C0;
863 if (q & 2)
864 cc |= SW_C3;
865 if (q & 1)
866 cc |= SW_C1;
867 } else {
868 control_word = old_cw;
869 setcc(0);
870 return;
945 } 871 }
946 } 872 } else {
947 } 873 /* There is a large exponent difference ( >= 64 ) */
948 874 /* To make much sense, the code in this section should
949 if (q & 4) cc |= SW_C0; 875 be done at high precision. */
950 if (q & 2) cc |= SW_C3; 876 int exp_1, N;
951 if (q & 1) cc |= SW_C1; 877 u_char sign;
952 } 878
953 else 879 /* prevent overflow here */
954 { 880 /* N is 'a number between 32 and 63' (p26-113) */
955 control_word = old_cw; 881 reg_copy(&st0, &tmp);
956 setcc(0); 882 tmptag = st0_tag;
957 return; 883 N = (expdif & 0x0000001f) + 32; /* This choice gives results
958 } 884 identical to an AMD 486 */
959 } 885 setexponent16(&tmp, N);
960 else 886 exp_1 = exponent16(&st1);
961 { 887 setexponent16(&st1, 0);
962 /* There is a large exponent difference ( >= 64 ) */ 888 expdif -= N;
963 /* To make much sense, the code in this section should 889
964 be done at high precision. */ 890 sign = getsign(&tmp) ^ st1_sign;
965 int exp_1, N; 891 tag =
966 u_char sign; 892 FPU_u_div(&tmp, &st1, &tmp,
967 893 PR_64_BITS | RC_CHOP | 0x3f, sign);
968 /* prevent overflow here */ 894 setsign(&tmp, sign);
969 /* N is 'a number between 32 and 63' (p26-113) */ 895
970 reg_copy(&st0, &tmp); 896 FPU_round_to_int(&tmp, tag); /* Fortunately, this can't
971 tmptag = st0_tag; 897 overflow to 2^64 */
972 N = (expdif & 0x0000001f) + 32; /* This choice gives results 898
973 identical to an AMD 486 */ 899 rem_kernel(significand(&st0),
974 setexponent16(&tmp, N); 900 &significand(&tmp),
975 exp_1 = exponent16(&st1); 901 significand(&st1),
976 setexponent16(&st1, 0); 902 significand(&tmp), exponent(&tmp)
977 expdif -= N; 903 );
978 904 setexponent16(&tmp, exp_1 + expdif);
979 sign = getsign(&tmp) ^ st1_sign; 905
980 tag = FPU_u_div(&tmp, &st1, &tmp, PR_64_BITS | RC_CHOP | 0x3f, 906 /* It is possible for the operation to be complete here.
981 sign); 907 What does the IEEE standard say? The Intel 80486 manual
982 setsign(&tmp, sign); 908 implies that the operation will never be completed at this
983 909 point, and the behaviour of a real 80486 confirms this.
984 FPU_round_to_int(&tmp, tag); /* Fortunately, this can't 910 */
985 overflow to 2^64 */ 911 if (!(tmp.sigh | tmp.sigl)) {
986 912 /* The result is zero */
987 rem_kernel(significand(&st0), 913 control_word = old_cw;
988 &significand(&tmp), 914 partial_status = saved_status;
989 significand(&st1), 915 FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
990 significand(&tmp), 916 setsign(&st0, st0_sign);
991 exponent(&tmp)
992 );
993 setexponent16(&tmp, exp_1 + expdif);
994
995 /* It is possible for the operation to be complete here.
996 What does the IEEE standard say? The Intel 80486 manual
997 implies that the operation will never be completed at this
998 point, and the behaviour of a real 80486 confirms this.
999 */
1000 if ( !(tmp.sigh | tmp.sigl) )
1001 {
1002 /* The result is zero */
1003 control_word = old_cw;
1004 partial_status = saved_status;
1005 FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
1006 setsign(&st0, st0_sign);
1007#ifdef PECULIAR_486 917#ifdef PECULIAR_486
1008 setcc(SW_C2); 918 setcc(SW_C2);
1009#else 919#else
1010 setcc(0); 920 setcc(0);
1011#endif /* PECULIAR_486 */ 921#endif /* PECULIAR_486 */
1012 return; 922 return;
1013 } 923 }
1014 cc = SW_C2; 924 cc = SW_C2;
1015 } 925 }
1016 926
1017 control_word = old_cw; 927 control_word = old_cw;
1018 partial_status = saved_status; 928 partial_status = saved_status;
1019 tag = FPU_normalize_nuo(&tmp); 929 tag = FPU_normalize_nuo(&tmp);
1020 reg_copy(&tmp, st0_ptr); 930 reg_copy(&tmp, st0_ptr);
1021 931
1022 /* The only condition to be looked for is underflow, 932 /* The only condition to be looked for is underflow,
1023 and it can occur here only if underflow is unmasked. */ 933 and it can occur here only if underflow is unmasked. */
1024 if ( (exponent16(&tmp) <= EXP_UNDER) && (tag != TAG_Zero) 934 if ((exponent16(&tmp) <= EXP_UNDER) && (tag != TAG_Zero)
1025 && !(control_word & CW_Underflow) ) 935 && !(control_word & CW_Underflow)) {
1026 { 936 setcc(cc);
1027 setcc(cc); 937 tag = arith_underflow(st0_ptr);
1028 tag = arith_underflow(st0_ptr); 938 setsign(st0_ptr, st0_sign);
1029 setsign(st0_ptr, st0_sign); 939 FPU_settag0(tag);
1030 FPU_settag0(tag); 940 return;
1031 return; 941 } else if ((exponent16(&tmp) > EXP_UNDER) || (tag == TAG_Zero)) {
1032 } 942 stdexp(st0_ptr);
1033 else if ( (exponent16(&tmp) > EXP_UNDER) || (tag == TAG_Zero) ) 943 setsign(st0_ptr, st0_sign);
1034 { 944 } else {
1035 stdexp(st0_ptr); 945 tag =
1036 setsign(st0_ptr, st0_sign); 946 FPU_round(st0_ptr, 0, 0, FULL_PRECISION, st0_sign);
1037 } 947 }
1038 else 948 FPU_settag0(tag);
1039 { 949 setcc(cc);
1040 tag = FPU_round(st0_ptr, 0, 0, FULL_PRECISION, st0_sign);
1041 }
1042 FPU_settag0(tag);
1043 setcc(cc);
1044 950
1045 return; 951 return;
1046 } 952 }
1047 953
1048 if ( st0_tag == TAG_Special ) 954 if (st0_tag == TAG_Special)
1049 st0_tag = FPU_Special(st0_ptr); 955 st0_tag = FPU_Special(st0_ptr);
1050 if ( st1_tag == TAG_Special ) 956 if (st1_tag == TAG_Special)
1051 st1_tag = FPU_Special(st1_ptr); 957 st1_tag = FPU_Special(st1_ptr);
1052 958
1053 if ( ((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal)) 959 if (((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal))
1054 || ((st0_tag == TW_Denormal) && (st1_tag == TAG_Valid)) 960 || ((st0_tag == TW_Denormal) && (st1_tag == TAG_Valid))
1055 || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal)) ) 961 || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal))) {
1056 { 962 if (denormal_operand() < 0)
1057 if ( denormal_operand() < 0 ) 963 return;
1058 return; 964 goto fprem_valid;
1059 goto fprem_valid; 965 } else if ((st0_tag == TAG_Empty) || (st1_tag == TAG_Empty)) {
1060 } 966 FPU_stack_underflow();
1061 else if ( (st0_tag == TAG_Empty) || (st1_tag == TAG_Empty) ) 967 return;
1062 { 968 } else if (st0_tag == TAG_Zero) {
1063 FPU_stack_underflow(); 969 if (st1_tag == TAG_Valid) {
1064 return; 970 setcc(0);
1065 } 971 return;
1066 else if ( st0_tag == TAG_Zero ) 972 } else if (st1_tag == TW_Denormal) {
1067 { 973 if (denormal_operand() < 0)
1068 if ( st1_tag == TAG_Valid ) 974 return;
1069 { 975 setcc(0);
1070 setcc(0); return; 976 return;
1071 } 977 } else if (st1_tag == TAG_Zero) {
1072 else if ( st1_tag == TW_Denormal ) 978 arith_invalid(0);
1073 { 979 return;
1074 if ( denormal_operand() < 0 ) 980 } /* fprem(?,0) always invalid */
1075 return; 981 else if (st1_tag == TW_Infinity) {
1076 setcc(0); return; 982 setcc(0);
1077 } 983 return;
1078 else if ( st1_tag == TAG_Zero ) 984 }
1079 { arith_invalid(0); return; } /* fprem(?,0) always invalid */ 985 } else if ((st0_tag == TAG_Valid) || (st0_tag == TW_Denormal)) {
1080 else if ( st1_tag == TW_Infinity ) 986 if (st1_tag == TAG_Zero) {
1081 { setcc(0); return; } 987 arith_invalid(0); /* fprem(Valid,Zero) is invalid */
1082 } 988 return;
1083 else if ( (st0_tag == TAG_Valid) || (st0_tag == TW_Denormal) ) 989 } else if (st1_tag != TW_NaN) {
1084 { 990 if (((st0_tag == TW_Denormal)
1085 if ( st1_tag == TAG_Zero ) 991 || (st1_tag == TW_Denormal))
1086 { 992 && (denormal_operand() < 0))
1087 arith_invalid(0); /* fprem(Valid,Zero) is invalid */ 993 return;
1088 return; 994
1089 } 995 if (st1_tag == TW_Infinity) {
1090 else if ( st1_tag != TW_NaN ) 996 /* fprem(Valid,Infinity) is o.k. */
1091 { 997 setcc(0);
1092 if ( ((st0_tag == TW_Denormal) || (st1_tag == TW_Denormal)) 998 return;
1093 && (denormal_operand() < 0) ) 999 }
1094 return; 1000 }
1095 1001 } else if (st0_tag == TW_Infinity) {
1096 if ( st1_tag == TW_Infinity ) 1002 if (st1_tag != TW_NaN) {
1097 { 1003 arith_invalid(0); /* fprem(Infinity,?) is invalid */
1098 /* fprem(Valid,Infinity) is o.k. */ 1004 return;
1099 setcc(0); return; 1005 }
1100 }
1101 }
1102 }
1103 else if ( st0_tag == TW_Infinity )
1104 {
1105 if ( st1_tag != TW_NaN )
1106 {
1107 arith_invalid(0); /* fprem(Infinity,?) is invalid */
1108 return;
1109 } 1006 }
1110 }
1111 1007
1112 /* One of the registers must contain a NaN if we got here. */ 1008 /* One of the registers must contain a NaN if we got here. */
1113 1009
1114#ifdef PARANOID 1010#ifdef PARANOID
1115 if ( (st0_tag != TW_NaN) && (st1_tag != TW_NaN) ) 1011 if ((st0_tag != TW_NaN) && (st1_tag != TW_NaN))
1116 EXCEPTION(EX_INTERNAL | 0x118); 1012 EXCEPTION(EX_INTERNAL | 0x118);
1117#endif /* PARANOID */ 1013#endif /* PARANOID */
1118 1014
1119 real_2op_NaN(st1_ptr, st1_tag, 0, st1_ptr); 1015 real_2op_NaN(st1_ptr, st1_tag, 0, st1_ptr);
1120 1016
1121} 1017}
1122 1018
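In terms of observable behaviour, do_fprem() implements the FPREM/FPREM1 family: st(0) becomes st(0) - N*st(1), where N is the quotient truncated toward zero (FPREM, RC_CHOP) or rounded to nearest (FPREM1, RC_RND); the low three bits of N land in C0/C3/C1, and when the exponent gap is 64 or more only a partial reduction is done and C2 is set so software re-executes the instruction. A rough double-precision sketch of the completed case (fprem_sketch is an illustrative name; it ignores the partial-reduction and exception paths and assumes the default round-to-nearest mode for nearbyint()):

#include <math.h>

/* Completed FPREM/FPREM1 result; the emulator above computes this exactly
 * on 64-bit significands rather than in double precision. */
double fprem_sketch(double st0, double st1, int round_nearest, int *c2)
{
        double n = st0 / st1;

        n = round_nearest ? nearbyint(n) : trunc(n);   /* FPREM1 vs FPREM */
        *c2 = 0;                                       /* reduction complete */
        return st0 - n * st1;
}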
1123
1124/* ST(1) <- ST(1) * log ST; pop ST */ 1019/* ST(1) <- ST(1) * log ST; pop ST */
1125static void fyl2x(FPU_REG *st0_ptr, u_char st0_tag) 1020static void fyl2x(FPU_REG *st0_ptr, u_char st0_tag)
1126{ 1021{
1127 FPU_REG *st1_ptr = &st(1), exponent; 1022 FPU_REG *st1_ptr = &st(1), exponent;
1128 u_char st1_tag = FPU_gettagi(1); 1023 u_char st1_tag = FPU_gettagi(1);
1129 u_char sign; 1024 u_char sign;
1130 int e, tag; 1025 int e, tag;
1131 1026
1132 clear_C1(); 1027 clear_C1();
1133 1028
1134 if ( (st0_tag == TAG_Valid) && (st1_tag == TAG_Valid) ) 1029 if ((st0_tag == TAG_Valid) && (st1_tag == TAG_Valid)) {
1135 { 1030 both_valid:
1136 both_valid: 1031 /* Both regs are Valid or Denormal */
1137 /* Both regs are Valid or Denormal */ 1032 if (signpositive(st0_ptr)) {
1138 if ( signpositive(st0_ptr) ) 1033 if (st0_tag == TW_Denormal)
1139 { 1034 FPU_to_exp16(st0_ptr, st0_ptr);
1140 if ( st0_tag == TW_Denormal ) 1035 else
1141 FPU_to_exp16(st0_ptr, st0_ptr); 1036 /* Convert st(0) for internal use. */
1142 else 1037 setexponent16(st0_ptr, exponent(st0_ptr));
1143 /* Convert st(0) for internal use. */ 1038
1144 setexponent16(st0_ptr, exponent(st0_ptr)); 1039 if ((st0_ptr->sigh == 0x80000000)
1145 1040 && (st0_ptr->sigl == 0)) {
1146 if ( (st0_ptr->sigh == 0x80000000) && (st0_ptr->sigl == 0) ) 1041 /* Special case. The result can be precise. */
1147 { 1042 u_char esign;
1148 /* Special case. The result can be precise. */ 1043 e = exponent16(st0_ptr);
1149 u_char esign; 1044 if (e >= 0) {
1150 e = exponent16(st0_ptr); 1045 exponent.sigh = e;
1151 if ( e >= 0 ) 1046 esign = SIGN_POS;
1152 { 1047 } else {
1153 exponent.sigh = e; 1048 exponent.sigh = -e;
1154 esign = SIGN_POS; 1049 esign = SIGN_NEG;
1155 } 1050 }
1156 else 1051 exponent.sigl = 0;
1157 { 1052 setexponent16(&exponent, 31);
1158 exponent.sigh = -e; 1053 tag = FPU_normalize_nuo(&exponent);
1159 esign = SIGN_NEG; 1054 stdexp(&exponent);
1055 setsign(&exponent, esign);
1056 tag =
1057 FPU_mul(&exponent, tag, 1, FULL_PRECISION);
1058 if (tag >= 0)
1059 FPU_settagi(1, tag);
1060 } else {
1061 /* The usual case */
1062 sign = getsign(st1_ptr);
1063 if (st1_tag == TW_Denormal)
1064 FPU_to_exp16(st1_ptr, st1_ptr);
1065 else
1066 /* Convert st(1) for internal use. */
1067 setexponent16(st1_ptr,
1068 exponent(st1_ptr));
1069 poly_l2(st0_ptr, st1_ptr, sign);
1070 }
1071 } else {
1072 /* negative */
1073 if (arith_invalid(1) < 0)
1074 return;
1160 } 1075 }
1161 exponent.sigl = 0;
1162 setexponent16(&exponent, 31);
1163 tag = FPU_normalize_nuo(&exponent);
1164 stdexp(&exponent);
1165 setsign(&exponent, esign);
1166 tag = FPU_mul(&exponent, tag, 1, FULL_PRECISION);
1167 if ( tag >= 0 )
1168 FPU_settagi(1, tag);
1169 }
1170 else
1171 {
1172 /* The usual case */
1173 sign = getsign(st1_ptr);
1174 if ( st1_tag == TW_Denormal )
1175 FPU_to_exp16(st1_ptr, st1_ptr);
1176 else
1177 /* Convert st(1) for internal use. */
1178 setexponent16(st1_ptr, exponent(st1_ptr));
1179 poly_l2(st0_ptr, st1_ptr, sign);
1180 }
1181 }
1182 else
1183 {
1184 /* negative */
1185 if ( arith_invalid(1) < 0 )
1186 return;
1187 }
1188 1076
1189 FPU_pop(); 1077 FPU_pop();
1190
1191 return;
1192 }
1193
1194 if ( st0_tag == TAG_Special )
1195 st0_tag = FPU_Special(st0_ptr);
1196 if ( st1_tag == TAG_Special )
1197 st1_tag = FPU_Special(st1_ptr);
1198
1199 if ( (st0_tag == TAG_Empty) || (st1_tag == TAG_Empty) )
1200 {
1201 FPU_stack_underflow_pop(1);
1202 return;
1203 }
1204 else if ( (st0_tag <= TW_Denormal) && (st1_tag <= TW_Denormal) )
1205 {
1206 if ( st0_tag == TAG_Zero )
1207 {
1208 if ( st1_tag == TAG_Zero )
1209 {
1210 /* Both args zero is invalid */
1211 if ( arith_invalid(1) < 0 )
1212 return;
1213 }
1214 else
1215 {
1216 u_char sign;
1217 sign = getsign(st1_ptr)^SIGN_NEG;
1218 if ( FPU_divide_by_zero(1, sign) < 0 )
1219 return;
1220 1078
1221 setsign(st1_ptr, sign);
1222 }
1223 }
1224 else if ( st1_tag == TAG_Zero )
1225 {
1226 /* st(1) contains zero, st(0) valid <> 0 */
1227 /* Zero is the valid answer */
1228 sign = getsign(st1_ptr);
1229
1230 if ( signnegative(st0_ptr) )
1231 {
1232 /* log(negative) */
1233 if ( arith_invalid(1) < 0 )
1234 return; 1079 return;
1235 }
1236 else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1237 return;
1238 else
1239 {
1240 if ( exponent(st0_ptr) < 0 )
1241 sign ^= SIGN_NEG;
1242
1243 FPU_copy_to_reg1(&CONST_Z, TAG_Zero);
1244 setsign(st1_ptr, sign);
1245 }
1246 } 1080 }
1247 else
1248 {
1249 /* One or both operands are denormals. */
1250 if ( denormal_operand() < 0 )
1251 return;
1252 goto both_valid;
1253 }
1254 }
1255 else if ( (st0_tag == TW_NaN) || (st1_tag == TW_NaN) )
1256 {
1257 if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 )
1258 return;
1259 }
1260 /* One or both arg must be an infinity */
1261 else if ( st0_tag == TW_Infinity )
1262 {
1263 if ( (signnegative(st0_ptr)) || (st1_tag == TAG_Zero) )
1264 {
1265 /* log(-infinity) or 0*log(infinity) */
1266 if ( arith_invalid(1) < 0 )
1267 return;
1268 }
1269 else
1270 {
1271 u_char sign = getsign(st1_ptr);
1272 1081
1273 if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) ) 1082 if (st0_tag == TAG_Special)
1274 return; 1083 st0_tag = FPU_Special(st0_ptr);
1084 if (st1_tag == TAG_Special)
1085 st1_tag = FPU_Special(st1_ptr);
1275 1086
1276 FPU_copy_to_reg1(&CONST_INF, TAG_Special); 1087 if ((st0_tag == TAG_Empty) || (st1_tag == TAG_Empty)) {
1277 setsign(st1_ptr, sign); 1088 FPU_stack_underflow_pop(1);
1278 }
1279 }
1280 /* st(1) must be infinity here */
1281 else if ( ((st0_tag == TAG_Valid) || (st0_tag == TW_Denormal))
1282 && ( signpositive(st0_ptr) ) )
1283 {
1284 if ( exponent(st0_ptr) >= 0 )
1285 {
1286 if ( (exponent(st0_ptr) == 0) &&
1287 (st0_ptr->sigh == 0x80000000) &&
1288 (st0_ptr->sigl == 0) )
1289 {
1290 /* st(0) holds 1.0 */
1291 /* infinity*log(1) */
1292 if ( arith_invalid(1) < 0 )
1293 return; 1089 return;
1294 } 1090 } else if ((st0_tag <= TW_Denormal) && (st1_tag <= TW_Denormal)) {
1295 /* else st(0) is positive and > 1.0 */ 1091 if (st0_tag == TAG_Zero) {
1092 if (st1_tag == TAG_Zero) {
1093 /* Both args zero is invalid */
1094 if (arith_invalid(1) < 0)
1095 return;
1096 } else {
1097 u_char sign;
1098 sign = getsign(st1_ptr) ^ SIGN_NEG;
1099 if (FPU_divide_by_zero(1, sign) < 0)
1100 return;
1101
1102 setsign(st1_ptr, sign);
1103 }
1104 } else if (st1_tag == TAG_Zero) {
1105 /* st(1) contains zero, st(0) valid <> 0 */
1106 /* Zero is the valid answer */
1107 sign = getsign(st1_ptr);
1108
1109 if (signnegative(st0_ptr)) {
1110 /* log(negative) */
1111 if (arith_invalid(1) < 0)
1112 return;
1113 } else if ((st0_tag == TW_Denormal)
1114 && (denormal_operand() < 0))
1115 return;
1116 else {
1117 if (exponent(st0_ptr) < 0)
1118 sign ^= SIGN_NEG;
1119
1120 FPU_copy_to_reg1(&CONST_Z, TAG_Zero);
1121 setsign(st1_ptr, sign);
1122 }
1123 } else {
1124 /* One or both operands are denormals. */
1125 if (denormal_operand() < 0)
1126 return;
1127 goto both_valid;
1128 }
1129 } else if ((st0_tag == TW_NaN) || (st1_tag == TW_NaN)) {
1130 if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0)
1131 return;
1132 }
1133 /* One or both arg must be an infinity */
1134 else if (st0_tag == TW_Infinity) {
1135 if ((signnegative(st0_ptr)) || (st1_tag == TAG_Zero)) {
1136 /* log(-infinity) or 0*log(infinity) */
1137 if (arith_invalid(1) < 0)
1138 return;
1139 } else {
1140 u_char sign = getsign(st1_ptr);
1141
1142 if ((st1_tag == TW_Denormal)
1143 && (denormal_operand() < 0))
1144 return;
1145
1146 FPU_copy_to_reg1(&CONST_INF, TAG_Special);
1147 setsign(st1_ptr, sign);
1148 }
1296 } 1149 }
1297 else 1150 /* st(1) must be infinity here */
1298 { 1151 else if (((st0_tag == TAG_Valid) || (st0_tag == TW_Denormal))
1299 /* st(0) is positive and < 1.0 */ 1152 && (signpositive(st0_ptr))) {
1153 if (exponent(st0_ptr) >= 0) {
1154 if ((exponent(st0_ptr) == 0) &&
1155 (st0_ptr->sigh == 0x80000000) &&
1156 (st0_ptr->sigl == 0)) {
1157 /* st(0) holds 1.0 */
1158 /* infinity*log(1) */
1159 if (arith_invalid(1) < 0)
1160 return;
1161 }
1162 /* else st(0) is positive and > 1.0 */
1163 } else {
1164 /* st(0) is positive and < 1.0 */
1300 1165
1301 if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) 1166 if ((st0_tag == TW_Denormal)
1302 return; 1167 && (denormal_operand() < 0))
1168 return;
1303 1169
1304 changesign(st1_ptr); 1170 changesign(st1_ptr);
1305 } 1171 }
1306 } 1172 } else {
1307 else 1173 /* st(0) must be zero or negative */
1308 { 1174 if (st0_tag == TAG_Zero) {
1309 /* st(0) must be zero or negative */ 1175 /* This should be invalid, but a real 80486 is happy with it. */
1310 if ( st0_tag == TAG_Zero )
1311 {
1312 /* This should be invalid, but a real 80486 is happy with it. */
1313 1176
1314#ifndef PECULIAR_486 1177#ifndef PECULIAR_486
1315 sign = getsign(st1_ptr); 1178 sign = getsign(st1_ptr);
1316 if ( FPU_divide_by_zero(1, sign) < 0 ) 1179 if (FPU_divide_by_zero(1, sign) < 0)
1317 return; 1180 return;
1318#endif /* PECULIAR_486 */ 1181#endif /* PECULIAR_486 */
1319 1182
1320 changesign(st1_ptr); 1183 changesign(st1_ptr);
1184 } else if (arith_invalid(1) < 0) /* log(negative) */
1185 return;
1321 } 1186 }
1322 else if ( arith_invalid(1) < 0 ) /* log(negative) */
1323 return;
1324 }
1325 1187
1326 FPU_pop(); 1188 FPU_pop();
1327} 1189}
1328 1190
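The overall effect of fyl2x() is st(1) <- st(1) * log2(st(0)) followed by a pop, with an exact special path when st(0) is a power of two (significand 0x80000000:00000000), in which case log2(st(0)) is just the exponent. Sketched with doubles (fyl2x_sketch is a made-up name; st0 must be positive, otherwise the invalid-operation handling above applies):

#include <math.h>

/* st(1) * log2(st(0)); the emulator's poly_l2() does this in extended
 * precision, and the power-of-two case via integer exponent handling. */
double fyl2x_sketch(double st0, double st1)
{
        return st1 * log2(st0);
}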
1329
1330static void fpatan(FPU_REG *st0_ptr, u_char st0_tag) 1191static void fpatan(FPU_REG *st0_ptr, u_char st0_tag)
1331{ 1192{
1332 FPU_REG *st1_ptr = &st(1); 1193 FPU_REG *st1_ptr = &st(1);
1333 u_char st1_tag = FPU_gettagi(1); 1194 u_char st1_tag = FPU_gettagi(1);
1334 int tag; 1195 int tag;
1335 1196
1336 clear_C1(); 1197 clear_C1();
1337 if ( !((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid)) ) 1198 if (!((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid))) {
1338 { 1199 valid_atan:
1339 valid_atan:
1340 1200
1341 poly_atan(st0_ptr, st0_tag, st1_ptr, st1_tag); 1201 poly_atan(st0_ptr, st0_tag, st1_ptr, st1_tag);
1342 1202
1343 FPU_pop(); 1203 FPU_pop();
1344 1204
1345 return; 1205 return;
1346 } 1206 }
1347 1207
1348 if ( st0_tag == TAG_Special ) 1208 if (st0_tag == TAG_Special)
1349 st0_tag = FPU_Special(st0_ptr); 1209 st0_tag = FPU_Special(st0_ptr);
1350 if ( st1_tag == TAG_Special ) 1210 if (st1_tag == TAG_Special)
1351 st1_tag = FPU_Special(st1_ptr); 1211 st1_tag = FPU_Special(st1_ptr);
1352 1212
1353 if ( ((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal)) 1213 if (((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal))
1354 || ((st0_tag == TW_Denormal) && (st1_tag == TAG_Valid)) 1214 || ((st0_tag == TW_Denormal) && (st1_tag == TAG_Valid))
1355 || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal)) ) 1215 || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal))) {
1356 { 1216 if (denormal_operand() < 0)
1357 if ( denormal_operand() < 0 ) 1217 return;
1358 return;
1359 1218
1360 goto valid_atan; 1219 goto valid_atan;
1361 } 1220 } else if ((st0_tag == TAG_Empty) || (st1_tag == TAG_Empty)) {
1362 else if ( (st0_tag == TAG_Empty) || (st1_tag == TAG_Empty) ) 1221 FPU_stack_underflow_pop(1);
1363 { 1222 return;
1364 FPU_stack_underflow_pop(1); 1223 } else if ((st0_tag == TW_NaN) || (st1_tag == TW_NaN)) {
1365 return; 1224 if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) >= 0)
1366 } 1225 FPU_pop();
1367 else if ( (st0_tag == TW_NaN) || (st1_tag == TW_NaN) )
1368 {
1369 if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) >= 0 )
1370 FPU_pop();
1371 return;
1372 }
1373 else if ( (st0_tag == TW_Infinity) || (st1_tag == TW_Infinity) )
1374 {
1375 u_char sign = getsign(st1_ptr);
1376 if ( st0_tag == TW_Infinity )
1377 {
1378 if ( st1_tag == TW_Infinity )
1379 {
1380 if ( signpositive(st0_ptr) )
1381 {
1382 FPU_copy_to_reg1(&CONST_PI4, TAG_Valid);
1383 }
1384 else
1385 {
1386 setpositive(st1_ptr);
1387 tag = FPU_u_add(&CONST_PI4, &CONST_PI2, st1_ptr,
1388 FULL_PRECISION, SIGN_POS,
1389 exponent(&CONST_PI4), exponent(&CONST_PI2));
1390 if ( tag >= 0 )
1391 FPU_settagi(1, tag);
1392 }
1393 }
1394 else
1395 {
1396 if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) )
1397 return; 1226 return;
1227 } else if ((st0_tag == TW_Infinity) || (st1_tag == TW_Infinity)) {
1228 u_char sign = getsign(st1_ptr);
1229 if (st0_tag == TW_Infinity) {
1230 if (st1_tag == TW_Infinity) {
1231 if (signpositive(st0_ptr)) {
1232 FPU_copy_to_reg1(&CONST_PI4, TAG_Valid);
1233 } else {
1234 setpositive(st1_ptr);
1235 tag =
1236 FPU_u_add(&CONST_PI4, &CONST_PI2,
1237 st1_ptr, FULL_PRECISION,
1238 SIGN_POS,
1239 exponent(&CONST_PI4),
1240 exponent(&CONST_PI2));
1241 if (tag >= 0)
1242 FPU_settagi(1, tag);
1243 }
1244 } else {
1245 if ((st1_tag == TW_Denormal)
1246 && (denormal_operand() < 0))
1247 return;
1248
1249 if (signpositive(st0_ptr)) {
1250 FPU_copy_to_reg1(&CONST_Z, TAG_Zero);
1251 setsign(st1_ptr, sign); /* An 80486 preserves the sign */
1252 FPU_pop();
1253 return;
1254 } else {
1255 FPU_copy_to_reg1(&CONST_PI, TAG_Valid);
1256 }
1257 }
1258 } else {
1259 /* st(1) is infinity, st(0) not infinity */
1260 if ((st0_tag == TW_Denormal)
1261 && (denormal_operand() < 0))
1262 return;
1398 1263
1399 if ( signpositive(st0_ptr) ) 1264 FPU_copy_to_reg1(&CONST_PI2, TAG_Valid);
1400 {
1401 FPU_copy_to_reg1(&CONST_Z, TAG_Zero);
1402 setsign(st1_ptr, sign); /* An 80486 preserves the sign */
1403 FPU_pop();
1404 return;
1405 } 1265 }
1406 else 1266 setsign(st1_ptr, sign);
1407 { 1267 } else if (st1_tag == TAG_Zero) {
1408 FPU_copy_to_reg1(&CONST_PI, TAG_Valid); 1268 /* st(0) must be valid or zero */
1269 u_char sign = getsign(st1_ptr);
1270
1271 if ((st0_tag == TW_Denormal) && (denormal_operand() < 0))
1272 return;
1273
1274 if (signpositive(st0_ptr)) {
1275 /* An 80486 preserves the sign */
1276 FPU_pop();
1277 return;
1409 } 1278 }
1410 }
1411 }
1412 else
1413 {
1414 /* st(1) is infinity, st(0) not infinity */
1415 if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1416 return;
1417 1279
1418 FPU_copy_to_reg1(&CONST_PI2, TAG_Valid); 1280 FPU_copy_to_reg1(&CONST_PI, TAG_Valid);
1419 } 1281 setsign(st1_ptr, sign);
1420 setsign(st1_ptr, sign); 1282 } else if (st0_tag == TAG_Zero) {
1421 } 1283 /* st(1) must be TAG_Valid here */
1422 else if ( st1_tag == TAG_Zero ) 1284 u_char sign = getsign(st1_ptr);
1423 {
1424 /* st(0) must be valid or zero */
1425 u_char sign = getsign(st1_ptr);
1426
1427 if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1428 return;
1429 1285
1430 if ( signpositive(st0_ptr) ) 1286 if ((st1_tag == TW_Denormal) && (denormal_operand() < 0))
1431 { 1287 return;
1432 /* An 80486 preserves the sign */
1433 FPU_pop();
1434 return;
1435 }
1436 1288
1437 FPU_copy_to_reg1(&CONST_PI, TAG_Valid); 1289 FPU_copy_to_reg1(&CONST_PI2, TAG_Valid);
1438 setsign(st1_ptr, sign); 1290 setsign(st1_ptr, sign);
1439 } 1291 }
1440 else if ( st0_tag == TAG_Zero )
1441 {
1442 /* st(1) must be TAG_Valid here */
1443 u_char sign = getsign(st1_ptr);
1444
1445 if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) )
1446 return;
1447
1448 FPU_copy_to_reg1(&CONST_PI2, TAG_Valid);
1449 setsign(st1_ptr, sign);
1450 }
1451#ifdef PARANOID 1292#ifdef PARANOID
1452 else 1293 else
1453 EXCEPTION(EX_INTERNAL | 0x125); 1294 EXCEPTION(EX_INTERNAL | 0x125);
1454#endif /* PARANOID */ 1295#endif /* PARANOID */
1455 1296
1456 FPU_pop(); 1297 FPU_pop();
1457 set_precision_flag_up(); /* We do not really know if up or down */ 1298 set_precision_flag_up(); /* We do not really know if up or down */
1458} 1299}
1459 1300
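fpatan() reduces to the familiar two-argument arctangent: st(1) <- arctan(st(1)/st(0)) with full quadrant correction, then pop, which is why the infinity cases above resolve to 0, pi/4, pi/2, 3*pi/4 and pi with the sign of st(1) preserved. A one-line double-precision equivalent (fpatan_sketch is an illustrative name):

#include <math.h>

/* atan2() follows the same quadrant and special-case conventions that the
 * emulated FPATAN spells out explicitly above. */
double fpatan_sketch(double st0, double st1)
{
        return atan2(st1, st0);
}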
1460
1461static void fprem(FPU_REG *st0_ptr, u_char st0_tag) 1301static void fprem(FPU_REG *st0_ptr, u_char st0_tag)
1462{ 1302{
1463 do_fprem(st0_ptr, st0_tag, RC_CHOP); 1303 do_fprem(st0_ptr, st0_tag, RC_CHOP);
1464} 1304}
1465 1305
1466
1467static void fprem1(FPU_REG *st0_ptr, u_char st0_tag) 1306static void fprem1(FPU_REG *st0_ptr, u_char st0_tag)
1468{ 1307{
1469 do_fprem(st0_ptr, st0_tag, RC_RND); 1308 do_fprem(st0_ptr, st0_tag, RC_RND);
1470} 1309}
1471 1310
1472
1473static void fyl2xp1(FPU_REG *st0_ptr, u_char st0_tag) 1311static void fyl2xp1(FPU_REG *st0_ptr, u_char st0_tag)
1474{ 1312{
1475 u_char sign, sign1; 1313 u_char sign, sign1;
1476 FPU_REG *st1_ptr = &st(1), a, b; 1314 FPU_REG *st1_ptr = &st(1), a, b;
1477 u_char st1_tag = FPU_gettagi(1); 1315 u_char st1_tag = FPU_gettagi(1);
1478 1316
1479 clear_C1(); 1317 clear_C1();
1480 if ( !((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid)) ) 1318 if (!((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid))) {
1481 { 1319 valid_yl2xp1:
1482 valid_yl2xp1:
1483 1320
1484 sign = getsign(st0_ptr); 1321 sign = getsign(st0_ptr);
1485 sign1 = getsign(st1_ptr); 1322 sign1 = getsign(st1_ptr);
1486 1323
1487 FPU_to_exp16(st0_ptr, &a); 1324 FPU_to_exp16(st0_ptr, &a);
1488 FPU_to_exp16(st1_ptr, &b); 1325 FPU_to_exp16(st1_ptr, &b);
1489 1326
1490 if ( poly_l2p1(sign, sign1, &a, &b, st1_ptr) ) 1327 if (poly_l2p1(sign, sign1, &a, &b, st1_ptr))
1491 return; 1328 return;
1492 1329
1493 FPU_pop(); 1330 FPU_pop();
1494 return; 1331 return;
1495 } 1332 }
1496 1333
1497 if ( st0_tag == TAG_Special ) 1334 if (st0_tag == TAG_Special)
1498 st0_tag = FPU_Special(st0_ptr); 1335 st0_tag = FPU_Special(st0_ptr);
1499 if ( st1_tag == TAG_Special ) 1336 if (st1_tag == TAG_Special)
1500 st1_tag = FPU_Special(st1_ptr); 1337 st1_tag = FPU_Special(st1_ptr);
1501 1338
1502 if ( ((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal)) 1339 if (((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal))
1503 || ((st0_tag == TW_Denormal) && (st1_tag == TAG_Valid)) 1340 || ((st0_tag == TW_Denormal) && (st1_tag == TAG_Valid))
1504 || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal)) ) 1341 || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal))) {
1505 { 1342 if (denormal_operand() < 0)
1506 if ( denormal_operand() < 0 ) 1343 return;
1507 return;
1508
1509 goto valid_yl2xp1;
1510 }
1511 else if ( (st0_tag == TAG_Empty) | (st1_tag == TAG_Empty) )
1512 {
1513 FPU_stack_underflow_pop(1);
1514 return;
1515 }
1516 else if ( st0_tag == TAG_Zero )
1517 {
1518 switch ( st1_tag )
1519 {
1520 case TW_Denormal:
1521 if ( denormal_operand() < 0 )
1522 return;
1523
1524 case TAG_Zero:
1525 case TAG_Valid:
1526 setsign(st0_ptr, getsign(st0_ptr) ^ getsign(st1_ptr));
1527 FPU_copy_to_reg1(st0_ptr, st0_tag);
1528 break;
1529
1530 case TW_Infinity:
1531 /* Infinity*log(1) */
1532 if ( arith_invalid(1) < 0 )
1533 return;
1534 break;
1535 1344
1536 case TW_NaN: 1345 goto valid_yl2xp1;
1537 if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 ) 1346 } else if ((st0_tag == TAG_Empty) | (st1_tag == TAG_Empty)) {
1538 return; 1347 FPU_stack_underflow_pop(1);
1539 break; 1348 return;
1540 1349 } else if (st0_tag == TAG_Zero) {
1541 default: 1350 switch (st1_tag) {
1351 case TW_Denormal:
1352 if (denormal_operand() < 0)
1353 return;
1354
1355 case TAG_Zero:
1356 case TAG_Valid:
1357 setsign(st0_ptr, getsign(st0_ptr) ^ getsign(st1_ptr));
1358 FPU_copy_to_reg1(st0_ptr, st0_tag);
1359 break;
1360
1361 case TW_Infinity:
1362 /* Infinity*log(1) */
1363 if (arith_invalid(1) < 0)
1364 return;
1365 break;
1366
1367 case TW_NaN:
1368 if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0)
1369 return;
1370 break;
1371
1372 default:
1542#ifdef PARANOID 1373#ifdef PARANOID
1543 EXCEPTION(EX_INTERNAL | 0x116); 1374 EXCEPTION(EX_INTERNAL | 0x116);
1544 return; 1375 return;
1545#endif /* PARANOID */ 1376#endif /* PARANOID */
1546 break; 1377 break;
1547 } 1378 }
1548 } 1379 } else if ((st0_tag == TAG_Valid) || (st0_tag == TW_Denormal)) {
1549 else if ( (st0_tag == TAG_Valid) || (st0_tag == TW_Denormal) ) 1380 switch (st1_tag) {
1550 { 1381 case TAG_Zero:
1551 switch ( st1_tag ) 1382 if (signnegative(st0_ptr)) {
1552 { 1383 if (exponent(st0_ptr) >= 0) {
1553 case TAG_Zero: 1384 /* st(0) holds <= -1.0 */
1554 if ( signnegative(st0_ptr) ) 1385#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */
1555 { 1386 changesign(st1_ptr);
1556 if ( exponent(st0_ptr) >= 0 )
1557 {
1558 /* st(0) holds <= -1.0 */
1559#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */
1560 changesign(st1_ptr);
1561#else 1387#else
1562 if ( arith_invalid(1) < 0 ) 1388 if (arith_invalid(1) < 0)
1563 return; 1389 return;
1564#endif /* PECULIAR_486 */ 1390#endif /* PECULIAR_486 */
1565 } 1391 } else if ((st0_tag == TW_Denormal)
1566 else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) 1392 && (denormal_operand() < 0))
1567 return; 1393 return;
1568 else 1394 else
1569 changesign(st1_ptr); 1395 changesign(st1_ptr);
1570 } 1396 } else if ((st0_tag == TW_Denormal)
1571 else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) 1397 && (denormal_operand() < 0))
1572 return; 1398 return;
1573 break; 1399 break;
1574 1400
1575 case TW_Infinity: 1401 case TW_Infinity:
1576 if ( signnegative(st0_ptr) ) 1402 if (signnegative(st0_ptr)) {
1577 { 1403 if ((exponent(st0_ptr) >= 0) &&
1578 if ( (exponent(st0_ptr) >= 0) && 1404 !((st0_ptr->sigh == 0x80000000) &&
1579 !((st0_ptr->sigh == 0x80000000) && 1405 (st0_ptr->sigl == 0))) {
1580 (st0_ptr->sigl == 0)) ) 1406 /* st(0) holds < -1.0 */
1581 { 1407#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */
1582 /* st(0) holds < -1.0 */ 1408 changesign(st1_ptr);
1583#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */
1584 changesign(st1_ptr);
1585#else 1409#else
1586 if ( arith_invalid(1) < 0 ) return; 1410 if (arith_invalid(1) < 0)
1411 return;
1587#endif /* PECULIAR_486 */ 1412#endif /* PECULIAR_486 */
1413 } else if ((st0_tag == TW_Denormal)
1414 && (denormal_operand() < 0))
1415 return;
1416 else
1417 changesign(st1_ptr);
1418 } else if ((st0_tag == TW_Denormal)
1419 && (denormal_operand() < 0))
1420 return;
1421 break;
1422
1423 case TW_NaN:
1424 if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0)
1425 return;
1588 } 1426 }
1589 else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1590 return;
1591 else
1592 changesign(st1_ptr);
1593 }
1594 else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1595 return;
1596 break;
1597
1598 case TW_NaN:
1599 if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 )
1600 return;
1601 }
1602 1427
1603 } 1428 } else if (st0_tag == TW_NaN) {
1604 else if ( st0_tag == TW_NaN ) 1429 if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0)
1605 { 1430 return;
1606 if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 ) 1431 } else if (st0_tag == TW_Infinity) {
1607 return; 1432 if (st1_tag == TW_NaN) {
1608 } 1433 if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0)
1609 else if ( st0_tag == TW_Infinity ) 1434 return;
1610 { 1435 } else if (signnegative(st0_ptr)) {
1611 if ( st1_tag == TW_NaN )
1612 {
1613 if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 )
1614 return;
1615 }
1616 else if ( signnegative(st0_ptr) )
1617 {
1618#ifndef PECULIAR_486 1436#ifndef PECULIAR_486
1619 /* This should have higher priority than denormals, but... */ 1437 /* This should have higher priority than denormals, but... */
1620 if ( arith_invalid(1) < 0 ) /* log(-infinity) */ 1438 if (arith_invalid(1) < 0) /* log(-infinity) */
1621 return; 1439 return;
1622#endif /* PECULIAR_486 */ 1440#endif /* PECULIAR_486 */
1623 if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) ) 1441 if ((st1_tag == TW_Denormal)
1624 return; 1442 && (denormal_operand() < 0))
1443 return;
1625#ifdef PECULIAR_486 1444#ifdef PECULIAR_486
1626 /* Denormal operands actually get higher priority */ 1445 /* Denormal operands actually get higher priority */
1627 if ( arith_invalid(1) < 0 ) /* log(-infinity) */ 1446 if (arith_invalid(1) < 0) /* log(-infinity) */
1628 return; 1447 return;
1629#endif /* PECULIAR_486 */ 1448#endif /* PECULIAR_486 */
1630 } 1449 } else if (st1_tag == TAG_Zero) {
1631 else if ( st1_tag == TAG_Zero ) 1450 /* log(infinity) */
1632 { 1451 if (arith_invalid(1) < 0)
1633 /* log(infinity) */ 1452 return;
1634 if ( arith_invalid(1) < 0 ) 1453 }
1635 return;
1636 }
1637
1638 /* st(1) must be valid here. */
1639 1454
1640 else if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) ) 1455 /* st(1) must be valid here. */
1641 return; 1456
1457 else if ((st1_tag == TW_Denormal) && (denormal_operand() < 0))
1458 return;
1642 1459
1643 /* The Manual says that log(Infinity) is invalid, but a real 1460 /* The Manual says that log(Infinity) is invalid, but a real
1644 80486 sensibly says that it is o.k. */ 1461 80486 sensibly says that it is o.k. */
1645 else 1462 else {
1646 { 1463 u_char sign = getsign(st1_ptr);
1647 u_char sign = getsign(st1_ptr); 1464 FPU_copy_to_reg1(&CONST_INF, TAG_Special);
1648 FPU_copy_to_reg1(&CONST_INF, TAG_Special); 1465 setsign(st1_ptr, sign);
1649 setsign(st1_ptr, sign); 1466 }
1650 } 1467 }
1651 }
1652#ifdef PARANOID 1468#ifdef PARANOID
1653 else 1469 else {
1654 { 1470 EXCEPTION(EX_INTERNAL | 0x117);
1655 EXCEPTION(EX_INTERNAL | 0x117); 1471 return;
1656 return; 1472 }
1657 }
1658#endif /* PARANOID */ 1473#endif /* PARANOID */
1659 1474
1660 FPU_pop(); 1475 FPU_pop();
1661 return; 1476 return;
1662 1477
1663} 1478}
1664 1479
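fyl2xp1() computes st(1) <- st(1) * log2(1 + st(0)) and pops; the point of a separate instruction is accuracy when st(0) is close to zero (the architectural manuals only guarantee the result for |st(0)| smaller than roughly 1 - sqrt(2)/2). A hedged double-precision sketch using log1p() to keep that accuracy (fyl2xp1_sketch is a made-up name):

#include <math.h>

/* st(1) * log2(1 + st(0)); log1p() avoids the cancellation that computing
 * log2(1.0 + st0) directly would suffer for tiny st0. */
double fyl2xp1_sketch(double st0, double st1)
{
        return st1 * log1p(st0) / log(2.0);
}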
1665
1666static void fscale(FPU_REG *st0_ptr, u_char st0_tag) 1480static void fscale(FPU_REG *st0_ptr, u_char st0_tag)
1667{ 1481{
1668 FPU_REG *st1_ptr = &st(1); 1482 FPU_REG *st1_ptr = &st(1);
1669 u_char st1_tag = FPU_gettagi(1); 1483 u_char st1_tag = FPU_gettagi(1);
1670 int old_cw = control_word; 1484 int old_cw = control_word;
1671 u_char sign = getsign(st0_ptr); 1485 u_char sign = getsign(st0_ptr);
1672 1486
1673 clear_C1(); 1487 clear_C1();
1674 if ( !((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid)) ) 1488 if (!((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid))) {
1675 { 1489 long scale;
1676 long scale; 1490 FPU_REG tmp;
1677 FPU_REG tmp; 1491
1678 1492 /* Convert register for internal use. */
1679 /* Convert register for internal use. */ 1493 setexponent16(st0_ptr, exponent(st0_ptr));
1680 setexponent16(st0_ptr, exponent(st0_ptr)); 1494
1681 1495 valid_scale:
1682 valid_scale: 1496
1683 1497 if (exponent(st1_ptr) > 30) {
1684 if ( exponent(st1_ptr) > 30 ) 1498 /* 2^31 is far too large, would require 2^(2^30) or 2^(-2^30) */
1685 { 1499
1686 /* 2^31 is far too large, would require 2^(2^30) or 2^(-2^30) */ 1500 if (signpositive(st1_ptr)) {
1687 1501 EXCEPTION(EX_Overflow);
1688 if ( signpositive(st1_ptr) ) 1502 FPU_copy_to_reg0(&CONST_INF, TAG_Special);
1689 { 1503 } else {
1690 EXCEPTION(EX_Overflow); 1504 EXCEPTION(EX_Underflow);
1691 FPU_copy_to_reg0(&CONST_INF, TAG_Special); 1505 FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
1692 } 1506 }
1693 else 1507 setsign(st0_ptr, sign);
1694 { 1508 return;
1695 EXCEPTION(EX_Underflow); 1509 }
1696 FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
1697 }
1698 setsign(st0_ptr, sign);
1699 return;
1700 }
1701
1702 control_word &= ~CW_RC;
1703 control_word |= RC_CHOP;
1704 reg_copy(st1_ptr, &tmp);
1705 FPU_round_to_int(&tmp, st1_tag); /* This can never overflow here */
1706 control_word = old_cw;
1707 scale = signnegative(st1_ptr) ? -tmp.sigl : tmp.sigl;
1708 scale += exponent16(st0_ptr);
1709
1710 setexponent16(st0_ptr, scale);
1711
1712 /* Use FPU_round() to properly detect under/overflow etc */
1713 FPU_round(st0_ptr, 0, 0, control_word, sign);
1714
1715 return;
1716 }
1717
1718 if ( st0_tag == TAG_Special )
1719 st0_tag = FPU_Special(st0_ptr);
1720 if ( st1_tag == TAG_Special )
1721 st1_tag = FPU_Special(st1_ptr);
1722
1723 if ( (st0_tag == TAG_Valid) || (st0_tag == TW_Denormal) )
1724 {
1725 switch ( st1_tag )
1726 {
1727 case TAG_Valid:
1728 /* st(0) must be a denormal */
1729 if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1730 return;
1731
1732 FPU_to_exp16(st0_ptr, st0_ptr); /* Will not be left on stack */
1733 goto valid_scale;
1734
1735 case TAG_Zero:
1736 if ( st0_tag == TW_Denormal )
1737 denormal_operand();
1738 return;
1739
1740 case TW_Denormal:
1741 denormal_operand();
1742 return;
1743
1744 case TW_Infinity:
1745 if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1746 return;
1747
1748 if ( signpositive(st1_ptr) )
1749 FPU_copy_to_reg0(&CONST_INF, TAG_Special);
1750 else
1751 FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
1752 setsign(st0_ptr, sign);
1753 return;
1754 1510
1755 case TW_NaN: 1511 control_word &= ~CW_RC;
1756 real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr); 1512 control_word |= RC_CHOP;
1757 return; 1513 reg_copy(st1_ptr, &tmp);
1758 } 1514 FPU_round_to_int(&tmp, st1_tag); /* This can never overflow here */
1759 } 1515 control_word = old_cw;
1760 else if ( st0_tag == TAG_Zero ) 1516 scale = signnegative(st1_ptr) ? -tmp.sigl : tmp.sigl;
1761 { 1517 scale += exponent16(st0_ptr);
1762 switch ( st1_tag )
1763 {
1764 case TAG_Valid:
1765 case TAG_Zero:
1766 return;
1767 1518
1768 case TW_Denormal: 1519 setexponent16(st0_ptr, scale);
1769 denormal_operand();
1770 return;
1771 1520
1772 case TW_Infinity: 1521 /* Use FPU_round() to properly detect under/overflow etc */
1773 if ( signpositive(st1_ptr) ) 1522 FPU_round(st0_ptr, 0, 0, control_word, sign);
1774 arith_invalid(0); /* Zero scaled by +Infinity */
1775 return;
1776 1523
1777 case TW_NaN: 1524 return;
1778 real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr);
1779 return;
1780 } 1525 }
1781 }
1782 else if ( st0_tag == TW_Infinity )
1783 {
1784 switch ( st1_tag )
1785 {
1786 case TAG_Valid:
1787 case TAG_Zero:
1788 return;
1789
1790 case TW_Denormal:
1791 denormal_operand();
1792 return;
1793 1526
1794 case TW_Infinity: 1527 if (st0_tag == TAG_Special)
1795 if ( signnegative(st1_ptr) ) 1528 st0_tag = FPU_Special(st0_ptr);
1796 arith_invalid(0); /* Infinity scaled by -Infinity */ 1529 if (st1_tag == TAG_Special)
1797 return; 1530 st1_tag = FPU_Special(st1_ptr);
1798 1531
1799 case TW_NaN: 1532 if ((st0_tag == TAG_Valid) || (st0_tag == TW_Denormal)) {
1800 real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr); 1533 switch (st1_tag) {
1801 return; 1534 case TAG_Valid:
1535 /* st(0) must be a denormal */
1536 if ((st0_tag == TW_Denormal)
1537 && (denormal_operand() < 0))
1538 return;
1539
1540 FPU_to_exp16(st0_ptr, st0_ptr); /* Will not be left on stack */
1541 goto valid_scale;
1542
1543 case TAG_Zero:
1544 if (st0_tag == TW_Denormal)
1545 denormal_operand();
1546 return;
1547
1548 case TW_Denormal:
1549 denormal_operand();
1550 return;
1551
1552 case TW_Infinity:
1553 if ((st0_tag == TW_Denormal)
1554 && (denormal_operand() < 0))
1555 return;
1556
1557 if (signpositive(st1_ptr))
1558 FPU_copy_to_reg0(&CONST_INF, TAG_Special);
1559 else
1560 FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
1561 setsign(st0_ptr, sign);
1562 return;
1563
1564 case TW_NaN:
1565 real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr);
1566 return;
1567 }
1568 } else if (st0_tag == TAG_Zero) {
1569 switch (st1_tag) {
1570 case TAG_Valid:
1571 case TAG_Zero:
1572 return;
1573
1574 case TW_Denormal:
1575 denormal_operand();
1576 return;
1577
1578 case TW_Infinity:
1579 if (signpositive(st1_ptr))
1580 arith_invalid(0); /* Zero scaled by +Infinity */
1581 return;
1582
1583 case TW_NaN:
1584 real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr);
1585 return;
1586 }
1587 } else if (st0_tag == TW_Infinity) {
1588 switch (st1_tag) {
1589 case TAG_Valid:
1590 case TAG_Zero:
1591 return;
1592
1593 case TW_Denormal:
1594 denormal_operand();
1595 return;
1596
1597 case TW_Infinity:
1598 if (signnegative(st1_ptr))
1599 arith_invalid(0); /* Infinity scaled by -Infinity */
1600 return;
1601
1602 case TW_NaN:
1603 real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr);
1604 return;
1605 }
1606 } else if (st0_tag == TW_NaN) {
1607 if (st1_tag != TAG_Empty) {
1608 real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr);
1609 return;
1610 }
1802 } 1611 }
1803 }
1804 else if ( st0_tag == TW_NaN )
1805 {
1806 if ( st1_tag != TAG_Empty )
1807 { real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr); return; }
1808 }
1809
1810#ifdef PARANOID 1612#ifdef PARANOID
1811 if ( !((st0_tag == TAG_Empty) || (st1_tag == TAG_Empty)) ) 1613 if (!((st0_tag == TAG_Empty) || (st1_tag == TAG_Empty))) {
1812 { 1614 EXCEPTION(EX_INTERNAL | 0x115);
1813 EXCEPTION(EX_INTERNAL | 0x115); 1615 return;
1814 return; 1616 }
1815 }
1816#endif 1617#endif
1817 1618
1818 /* At least one of st(0), st(1) must be empty */ 1619 /* At least one of st(0), st(1) must be empty */
1819 FPU_stack_underflow(); 1620 FPU_stack_underflow();
1820 1621
1821} 1622}
1822 1623
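fscale() amounts to st(0) <- st(0) * 2^trunc(st(1)) with st(1) left untouched: the code briefly forces RC_CHOP so FPU_round_to_int() truncates st(1) toward zero, adds the result to st(0)'s exponent, and lets FPU_round() raise overflow or underflow; an st(1) exponent above 30 (|st(1)| >= 2^31) is handled up front as guaranteed overflow or underflow. Sketched with doubles (fscale_sketch is an illustrative name):

#include <math.h>

/* st(0) * 2^trunc(st(1)); ldexp() performs the exponent adjustment that the
 * emulator does by hand on its 16-bit internal exponent. */
double fscale_sketch(double st0, double st1)
{
        return ldexp(st0, (int)trunc(st1));
}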
1823
1824/*---------------------------------------------------------------------------*/ 1624/*---------------------------------------------------------------------------*/
1825 1625
1826static FUNC_ST0 const trig_table_a[] = { 1626static FUNC_ST0 const trig_table_a[] = {
1827 f2xm1, fyl2x, fptan, fpatan, 1627 f2xm1, fyl2x, fptan, fpatan,
1828 fxtract, fprem1, (FUNC_ST0)fdecstp, (FUNC_ST0)fincstp 1628 fxtract, fprem1, (FUNC_ST0) fdecstp, (FUNC_ST0) fincstp
1829}; 1629};
1830 1630
1831void FPU_triga(void) 1631void FPU_triga(void)
1832{ 1632{
1833 (trig_table_a[FPU_rm])(&st(0), FPU_gettag0()); 1633 (trig_table_a[FPU_rm]) (&st(0), FPU_gettag0());
1834} 1634}
1835 1635
1836 1636static FUNC_ST0 const trig_table_b[] = {
1837static FUNC_ST0 const trig_table_b[] = 1637 fprem, fyl2xp1, fsqrt_, fsincos, frndint_, fscale, (FUNC_ST0) fsin, fcos
1838 { 1638};
1839 fprem, fyl2xp1, fsqrt_, fsincos, frndint_, fscale, (FUNC_ST0)fsin, fcos
1840 };
1841 1639
1842void FPU_trigb(void) 1640void FPU_trigb(void)
1843{ 1641{
1844 (trig_table_b[FPU_rm])(&st(0), FPU_gettag0()); 1642 (trig_table_b[FPU_rm]) (&st(0), FPU_gettag0());
1845} 1643}
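The two dispatch tables map the FPU_rm field of the ModR/M byte onto the eight instructions of each D9 opcode group. Reading the table order against the architectural encoding, the intended correspondence appears to be the following (listed for orientation only, not taken from this file):

/*
 * trig_table_a: D9 F0..F7 -> F2XM1,  FYL2X,   FPTAN,   FPATAN,
 *                            FXTRACT, FPREM1,  FDECSTP, FINCSTP
 * trig_table_b: D9 F8..FF -> FPREM,  FYL2XP1, FSQRT,   FSINCOS,
 *                            FRNDINT, FSCALE,  FSIN,    FCOS
 * e.g. D9 FD (FSCALE) reaches fscale() through trig_table_b[5].
 */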
diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c
index 2e2c51a8bd3a..d701e2b39e44 100644
--- a/arch/x86/math-emu/get_address.c
+++ b/arch/x86/math-emu/get_address.c
@@ -17,7 +17,6 @@
17 | other processes using the emulator while swapping is in progress. | 17 | other processes using the emulator while swapping is in progress. |
18 +---------------------------------------------------------------------------*/ 18 +---------------------------------------------------------------------------*/
19 19
20
21#include <linux/stddef.h> 20#include <linux/stddef.h>
22 21
23#include <asm/uaccess.h> 22#include <asm/uaccess.h>
@@ -27,31 +26,30 @@
27#include "exception.h" 26#include "exception.h"
28#include "fpu_emu.h" 27#include "fpu_emu.h"
29 28
30
31#define FPU_WRITE_BIT 0x10 29#define FPU_WRITE_BIT 0x10
32 30
33static int reg_offset[] = { 31static int reg_offset[] = {
34 offsetof(struct info,___eax), 32 offsetof(struct info, ___eax),
35 offsetof(struct info,___ecx), 33 offsetof(struct info, ___ecx),
36 offsetof(struct info,___edx), 34 offsetof(struct info, ___edx),
37 offsetof(struct info,___ebx), 35 offsetof(struct info, ___ebx),
38 offsetof(struct info,___esp), 36 offsetof(struct info, ___esp),
39 offsetof(struct info,___ebp), 37 offsetof(struct info, ___ebp),
40 offsetof(struct info,___esi), 38 offsetof(struct info, ___esi),
41 offsetof(struct info,___edi) 39 offsetof(struct info, ___edi)
42}; 40};
43 41
44#define REG_(x) (*(long *)(reg_offset[(x)]+(u_char *) FPU_info)) 42#define REG_(x) (*(long *)(reg_offset[(x)]+(u_char *) FPU_info))
45 43
46static int reg_offset_vm86[] = { 44static int reg_offset_vm86[] = {
47 offsetof(struct info,___cs), 45 offsetof(struct info, ___cs),
48 offsetof(struct info,___vm86_ds), 46 offsetof(struct info, ___vm86_ds),
49 offsetof(struct info,___vm86_es), 47 offsetof(struct info, ___vm86_es),
50 offsetof(struct info,___vm86_fs), 48 offsetof(struct info, ___vm86_fs),
51 offsetof(struct info,___vm86_gs), 49 offsetof(struct info, ___vm86_gs),
52 offsetof(struct info,___ss), 50 offsetof(struct info, ___ss),
53 offsetof(struct info,___vm86_ds) 51 offsetof(struct info, ___vm86_ds)
54 }; 52};
55 53
56#define VM86_REG_(x) (*(unsigned short *) \ 54#define VM86_REG_(x) (*(unsigned short *) \
57 (reg_offset_vm86[((unsigned)x)]+(u_char *) FPU_info)) 55 (reg_offset_vm86[((unsigned)x)]+(u_char *) FPU_info))
@@ -60,158 +58,141 @@ static int reg_offset_vm86[] = {
60#define ___GS ___ds 58#define ___GS ___ds
61 59
62static int reg_offset_pm[] = { 60static int reg_offset_pm[] = {
63 offsetof(struct info,___cs), 61 offsetof(struct info, ___cs),
64 offsetof(struct info,___ds), 62 offsetof(struct info, ___ds),
65 offsetof(struct info,___es), 63 offsetof(struct info, ___es),
66 offsetof(struct info,___fs), 64 offsetof(struct info, ___fs),
67 offsetof(struct info,___GS), 65 offsetof(struct info, ___GS),
68 offsetof(struct info,___ss), 66 offsetof(struct info, ___ss),
69 offsetof(struct info,___ds) 67 offsetof(struct info, ___ds)
70 }; 68};
71 69
72#define PM_REG_(x) (*(unsigned short *) \ 70#define PM_REG_(x) (*(unsigned short *) \
73 (reg_offset_pm[((unsigned)x)]+(u_char *) FPU_info)) 71 (reg_offset_pm[((unsigned)x)]+(u_char *) FPU_info))
74 72
/* Decode the SIB byte.  This function assumes mod != 0 */
static int sib(int mod, unsigned long *fpu_eip)
{
        u_char ss, index, base;
        long offset;

        RE_ENTRANT_CHECK_OFF;
        FPU_code_access_ok(1);
        FPU_get_user(base, (u_char __user *) (*fpu_eip));       /* The SIB byte */
        RE_ENTRANT_CHECK_ON;
        (*fpu_eip)++;
        ss = base >> 6;
        index = (base >> 3) & 7;
        base &= 7;

        if ((mod == 0) && (base == 5))
                offset = 0;     /* No base register */
        else
                offset = REG_(base);

        if (index == 4) {
                /* No index register */
                /* A non-zero ss is illegal */
                if (ss)
                        EXCEPTION(EX_Invalid);
        } else {
                offset += (REG_(index)) << ss;
        }

        if (mod == 1) {
                /* 8 bit signed displacement */
                long displacement;
                RE_ENTRANT_CHECK_OFF;
                FPU_code_access_ok(1);
                FPU_get_user(displacement, (signed char __user *)(*fpu_eip));
                offset += displacement;
                RE_ENTRANT_CHECK_ON;
                (*fpu_eip)++;
        } else if (mod == 2 || base == 5) { /* The second condition also has mod==0 */
                /* 32 bit displacement */
                long displacement;
                RE_ENTRANT_CHECK_OFF;
                FPU_code_access_ok(4);
                FPU_get_user(displacement, (long __user *)(*fpu_eip));
                offset += displacement;
                RE_ENTRANT_CHECK_ON;
                (*fpu_eip) += 4;
        }

        return offset;
}

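For reference, a minimal standalone sketch of the same scale/index/base decode (illustrative only: the plain register array and the function name are not part of the emulator; the ss-with-no-index exception path and the user-space fetches are omitted here):

#include <stdint.h>

/* Decode a SIB byte against an ordinary register array. "regs" stands in
 * for the emulator's saved CPU registers (REG_()); "disp" is the disp8 or
 * disp32 that the caller has already fetched from the instruction stream. */
static uint32_t sib_effective_address(int mod, uint8_t sib,
                                      const uint32_t regs[8], int32_t disp)
{
        unsigned scale = sib >> 6;          /* bits 7..6: shift count ss   */
        unsigned index = (sib >> 3) & 7;    /* bits 5..3: index register   */
        unsigned base  = sib & 7;           /* bits 2..0: base register    */
        uint32_t addr = 0;

        if (!(mod == 0 && base == 5))       /* mod==0, base==101b: no base */
                addr = regs[base];
        if (index != 4)                     /* index==100b means "no index" */
                addr += regs[index] << scale;
        return addr + disp;
}
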
static unsigned long vm86_segment(u_char segment, struct address *addr)
{
        segment--;
#ifdef PARANOID
        if (segment > PREFIX_SS_) {
                EXCEPTION(EX_INTERNAL | 0x130);
                math_abort(FPU_info, SIGSEGV);
        }
#endif /* PARANOID */
        addr->selector = VM86_REG_(segment);
        return (unsigned long)VM86_REG_(segment) << 4;
}

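The translation above is the usual real-mode/vm86 rule: the 16-bit selector shifted left by four, plus the offset. A tiny illustrative sketch (not kernel code):

/* Illustrative only: real-mode/vm86 linear address formation. */
static unsigned long vm86_linear(unsigned short selector, unsigned short offset)
{
        return ((unsigned long)selector << 4) + offset;
}
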
/* This should work for 16 and 32 bit protected mode. */
static long pm_address(u_char FPU_modrm, u_char segment,
                       struct address *addr, long offset)
{
        struct desc_struct descriptor;
        unsigned long base_address, limit, address, seg_top;

        segment--;

#ifdef PARANOID
        /* segment is unsigned, so this also detects if segment was 0: */
        if (segment > PREFIX_SS_) {
                EXCEPTION(EX_INTERNAL | 0x132);
                math_abort(FPU_info, SIGSEGV);
        }
#endif /* PARANOID */

        switch (segment) {
                /* gs isn't used by the kernel, so it still has its
                   user-space value. */
        case PREFIX_GS_ - 1:
                /* N.B. - movl %seg, mem is a 2 byte write regardless of prefix */
                savesegment(gs, addr->selector);
                break;
        default:
                addr->selector = PM_REG_(segment);
        }

        descriptor = LDT_DESCRIPTOR(PM_REG_(segment));
        base_address = SEG_BASE_ADDR(descriptor);
        address = base_address + offset;
        limit = base_address
            + (SEG_LIMIT(descriptor) + 1) * SEG_GRANULARITY(descriptor) - 1;
        if (limit < base_address)
                limit = 0xffffffff;

        if (SEG_EXPAND_DOWN(descriptor)) {
                if (SEG_G_BIT(descriptor))
                        seg_top = 0xffffffff;
                else {
                        seg_top = base_address + (1 << 20);
                        if (seg_top < base_address)
                                seg_top = 0xffffffff;
                }
                access_limit =
                    (address <= limit) || (address >= seg_top) ? 0 :
                    ((seg_top - address) >= 255 ? 255 : seg_top - address);
        } else {
                access_limit =
                    (address > limit) || (address < base_address) ? 0 :
                    ((limit - address) >= 254 ? 255 : limit - address + 1);
        }
        if (SEG_EXECUTE_ONLY(descriptor) ||
            (!SEG_WRITE_PERM(descriptor) && (FPU_modrm & FPU_WRITE_BIT))) {
                access_limit = 0;
        }
        return address;
}

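The access_limit logic above asks "how many bytes, clamped to 255, may be touched starting at this linear address given the segment's limit and growth direction". A restated sketch with plain parameters instead of the descriptor-access macros (SEG_*); the parameter names are illustrative, not upstream identifiers:

/* Illustrative only: mirrors the expand-up/expand-down cases in pm_address(),
 * including its 254/255 clamping quirk. */
static unsigned int bytes_accessible(unsigned long address,
                                     unsigned long base, unsigned long limit,
                                     unsigned long seg_top, int expand_down)
{
        if (expand_down) {
                /* Valid offsets lie above "limit" and below "seg_top". */
                if (address <= limit || address >= seg_top)
                        return 0;
                return (seg_top - address >= 255) ? 255 : seg_top - address;
        }
        /* Expand-up: valid offsets lie in [base, limit]. */
        if (address > limit || address < base)
                return 0;
        return (limit - address >= 254) ? 255 : limit - address + 1;
}
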
/*
       MOD R/M byte:  MOD == 3 has a special use for the FPU
@@ -221,7 +202,6 @@ static long pm_address(u_char FPU_modrm, u_char segment,
       .....   .........   .........
        MOD    OPCODE(2)     R/M

       SIB byte

       7   6   5   4   3   2   1   0
@@ -231,208 +211,194 @@ static long pm_address(u_char FPU_modrm, u_char segment,
*/

void __user *FPU_get_address(u_char FPU_modrm, unsigned long *fpu_eip,
                             struct address *addr, fpu_addr_modes addr_modes)
{
        u_char mod;
        unsigned rm = FPU_modrm & 7;
        long *cpu_reg_ptr;
        int address = 0;        /* Initialized just to stop compiler warnings. */

        /* Memory accessed via the cs selector is write protected
           in `non-segmented' 32 bit protected mode. */
        if (!addr_modes.default_mode && (FPU_modrm & FPU_WRITE_BIT)
            && (addr_modes.override.segment == PREFIX_CS_)) {
                math_abort(FPU_info, SIGSEGV);
        }

        addr->selector = FPU_DS;        /* Default, for 32 bit non-segmented mode. */

        mod = (FPU_modrm >> 6) & 3;

        if (rm == 4 && mod != 3) {
                address = sib(mod, fpu_eip);
        } else {
                cpu_reg_ptr = &REG_(rm);
                switch (mod) {
                case 0:
                        if (rm == 5) {
                                /* Special case: disp32 */
                                RE_ENTRANT_CHECK_OFF;
                                FPU_code_access_ok(4);
                                FPU_get_user(address,
                                             (unsigned long __user *)(*fpu_eip));
                                (*fpu_eip) += 4;
                                RE_ENTRANT_CHECK_ON;
                                addr->offset = address;
                                return (void __user *)address;
                        } else {
                                address = *cpu_reg_ptr; /* Just return the contents
                                                           of the cpu register */
                                addr->offset = address;
                                return (void __user *)address;
                        }
                case 1:
                        /* 8 bit signed displacement */
                        RE_ENTRANT_CHECK_OFF;
                        FPU_code_access_ok(1);
                        FPU_get_user(address, (signed char __user *)(*fpu_eip));
                        RE_ENTRANT_CHECK_ON;
                        (*fpu_eip)++;
                        break;
                case 2:
                        /* 32 bit displacement */
                        RE_ENTRANT_CHECK_OFF;
                        FPU_code_access_ok(4);
                        FPU_get_user(address, (long __user *)(*fpu_eip));
                        (*fpu_eip) += 4;
                        RE_ENTRANT_CHECK_ON;
                        break;
                case 3:
                        /* Not legal for the FPU */
                        EXCEPTION(EX_Invalid);
                }
                address += *cpu_reg_ptr;
        }

        addr->offset = address;

        switch (addr_modes.default_mode) {
        case 0:
                break;
        case VM86:
                address += vm86_segment(addr_modes.override.segment, addr);
                break;
        case PM16:
        case SEG32:
                address = pm_address(FPU_modrm, addr_modes.override.segment,
                                     addr, address);
                break;
        default:
                EXCEPTION(EX_INTERNAL | 0x133);
        }

        return (void __user *)address;
}

void __user *FPU_get_address_16(u_char FPU_modrm, unsigned long *fpu_eip,
                                struct address *addr, fpu_addr_modes addr_modes)
{
        u_char mod;
        unsigned rm = FPU_modrm & 7;
        int address = 0;        /* Default used for mod == 0 */

        /* Memory accessed via the cs selector is write protected
           in `non-segmented' 32 bit protected mode. */
        if (!addr_modes.default_mode && (FPU_modrm & FPU_WRITE_BIT)
            && (addr_modes.override.segment == PREFIX_CS_)) {
                math_abort(FPU_info, SIGSEGV);
        }

        addr->selector = FPU_DS;        /* Default, for 32 bit non-segmented mode. */

        mod = (FPU_modrm >> 6) & 3;

        switch (mod) {
        case 0:
                if (rm == 6) {
                        /* Special case: disp16 */
                        RE_ENTRANT_CHECK_OFF;
                        FPU_code_access_ok(2);
                        FPU_get_user(address,
                                     (unsigned short __user *)(*fpu_eip));
                        (*fpu_eip) += 2;
                        RE_ENTRANT_CHECK_ON;
                        goto add_segment;
                }
                break;
        case 1:
                /* 8 bit signed displacement */
                RE_ENTRANT_CHECK_OFF;
                FPU_code_access_ok(1);
                FPU_get_user(address, (signed char __user *)(*fpu_eip));
                RE_ENTRANT_CHECK_ON;
                (*fpu_eip)++;
                break;
        case 2:
                /* 16 bit displacement */
                RE_ENTRANT_CHECK_OFF;
                FPU_code_access_ok(2);
                FPU_get_user(address, (unsigned short __user *)(*fpu_eip));
                (*fpu_eip) += 2;
                RE_ENTRANT_CHECK_ON;
                break;
        case 3:
                /* Not legal for the FPU */
                EXCEPTION(EX_Invalid);
                break;
        }
        switch (rm) {
        case 0:
                address += FPU_info->___ebx + FPU_info->___esi;
                break;
        case 1:
                address += FPU_info->___ebx + FPU_info->___edi;
                break;
        case 2:
                address += FPU_info->___ebp + FPU_info->___esi;
                if (addr_modes.override.segment == PREFIX_DEFAULT)
                        addr_modes.override.segment = PREFIX_SS_;
                break;
        case 3:
                address += FPU_info->___ebp + FPU_info->___edi;
                if (addr_modes.override.segment == PREFIX_DEFAULT)
                        addr_modes.override.segment = PREFIX_SS_;
                break;
        case 4:
                address += FPU_info->___esi;
                break;
        case 5:
                address += FPU_info->___edi;
                break;
        case 6:
                address += FPU_info->___ebp;
                if (addr_modes.override.segment == PREFIX_DEFAULT)
                        addr_modes.override.segment = PREFIX_SS_;
                break;
        case 7:
                address += FPU_info->___ebx;
                break;
        }

      add_segment:
        address &= 0xffff;

        addr->offset = address;

        switch (addr_modes.default_mode) {
        case 0:
                break;
        case VM86:
                address += vm86_segment(addr_modes.override.segment, addr);
                break;
        case PM16:
        case SEG32:
                address = pm_address(FPU_modrm, addr_modes.override.segment,
                                     addr, address);
                break;
        default:
                EXCEPTION(EX_INTERNAL | 0x131);
        }

        return (void __user *)address;
}
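
The switch (rm) above is the classic 16-bit ModR/M addressing table. As a reference, the base combinations and their default segments, written out as an illustrative array (not part of the kernel source):

/* Illustrative only: 16-bit ModR/M base combinations handled by
 * FPU_get_address_16(). Entries that use BP default to the SS segment
 * unless an explicit segment-override prefix was seen. */
static const char *const modrm16_base[8] = {
        "BX + SI",                          /* rm = 0 */
        "BX + DI",                          /* rm = 1 */
        "BP + SI",                          /* rm = 2, default segment SS */
        "BP + DI",                          /* rm = 3, default segment SS */
        "SI",                               /* rm = 4 */
        "DI",                               /* rm = 5 */
        "BP (or disp16 when mod == 0)",     /* rm = 6, default segment SS */
        "BX",                               /* rm = 7 */
};
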
diff --git a/arch/x86/math-emu/load_store.c b/arch/x86/math-emu/load_store.c
index eebd6fb1c8a8..2931ff355218 100644
--- a/arch/x86/math-emu/load_store.c
+++ b/arch/x86/math-emu/load_store.c
@@ -26,247 +26,257 @@
#include "status_w.h"
#include "control_w.h"

#define _NONE_ 0                /* st0_ptr etc not needed */
#define _REG0_ 1                /* Will be storing st(0) */
#define _PUSH_ 3                /* Need to check for space to push onto stack */
#define _null_ 4                /* Function illegal or not implemented */

#define pop_0() { FPU_settag0(TAG_Empty); top++; }

static u_char const type_table[32] = {
        _PUSH_, _PUSH_, _PUSH_, _PUSH_,
        _null_, _null_, _null_, _null_,
        _REG0_, _REG0_, _REG0_, _REG0_,
        _REG0_, _REG0_, _REG0_, _REG0_,
        _NONE_, _null_, _NONE_, _PUSH_,
        _NONE_, _PUSH_, _null_, _PUSH_,
        _NONE_, _null_, _NONE_, _REG0_,
        _NONE_, _REG0_, _NONE_, _REG0_
};

u_char const data_sizes_16[32] = {
        4, 4, 8, 2, 0, 0, 0, 0,
        4, 4, 8, 2, 4, 4, 8, 2,
        14, 0, 94, 10, 2, 10, 0, 8,
        14, 0, 94, 10, 2, 10, 2, 8
};

static u_char const data_sizes_32[32] = {
        4, 4, 8, 2, 0, 0, 0, 0,
        4, 4, 8, 2, 4, 4, 8, 2,
        28, 0, 108, 10, 2, 10, 0, 8,
        28, 0, 108, 10, 2, 10, 2, 8
};

63int FPU_load_store(u_char type, fpu_addr_modes addr_modes, 61int FPU_load_store(u_char type, fpu_addr_modes addr_modes,
64 void __user *data_address) 62 void __user * data_address)
65{ 63{
66 FPU_REG loaded_data; 64 FPU_REG loaded_data;
67 FPU_REG *st0_ptr; 65 FPU_REG *st0_ptr;
68 u_char st0_tag = TAG_Empty; /* This is just to stop a gcc warning. */ 66 u_char st0_tag = TAG_Empty; /* This is just to stop a gcc warning. */
69 u_char loaded_tag; 67 u_char loaded_tag;
70 68
71 st0_ptr = NULL; /* Initialized just to stop compiler warnings. */ 69 st0_ptr = NULL; /* Initialized just to stop compiler warnings. */
72 70
73 if ( addr_modes.default_mode & PROTECTED ) 71 if (addr_modes.default_mode & PROTECTED) {
74 { 72 if (addr_modes.default_mode == SEG32) {
75 if ( addr_modes.default_mode == SEG32 ) 73 if (access_limit < data_sizes_32[type])
76 { 74 math_abort(FPU_info, SIGSEGV);
77 if ( access_limit < data_sizes_32[type] ) 75 } else if (addr_modes.default_mode == PM16) {
78 math_abort(FPU_info,SIGSEGV); 76 if (access_limit < data_sizes_16[type])
79 } 77 math_abort(FPU_info, SIGSEGV);
80 else if ( addr_modes.default_mode == PM16 ) 78 }
81 {
82 if ( access_limit < data_sizes_16[type] )
83 math_abort(FPU_info,SIGSEGV);
84 }
85#ifdef PARANOID 79#ifdef PARANOID
86 else 80 else
87 EXCEPTION(EX_INTERNAL|0x140); 81 EXCEPTION(EX_INTERNAL | 0x140);
88#endif /* PARANOID */ 82#endif /* PARANOID */
89 } 83 }
90 84
91 switch ( type_table[type] ) 85 switch (type_table[type]) {
92 { 86 case _NONE_:
93 case _NONE_: 87 break;
94 break; 88 case _REG0_:
95 case _REG0_: 89 st0_ptr = &st(0); /* Some of these instructions pop after
96 st0_ptr = &st(0); /* Some of these instructions pop after 90 storing */
97 storing */ 91 st0_tag = FPU_gettag0();
98 st0_tag = FPU_gettag0(); 92 break;
99 break; 93 case _PUSH_:
100 case _PUSH_: 94 {
101 { 95 if (FPU_gettagi(-1) != TAG_Empty) {
102 if ( FPU_gettagi(-1) != TAG_Empty ) 96 FPU_stack_overflow();
103 { FPU_stack_overflow(); return 0; } 97 return 0;
104 top--; 98 }
105 st0_ptr = &st(0); 99 top--;
106 } 100 st0_ptr = &st(0);
107 break; 101 }
108 case _null_: 102 break;
109 FPU_illegal(); 103 case _null_:
110 return 0; 104 FPU_illegal();
105 return 0;
111#ifdef PARANOID 106#ifdef PARANOID
112 default: 107 default:
113 EXCEPTION(EX_INTERNAL|0x141); 108 EXCEPTION(EX_INTERNAL | 0x141);
114 return 0; 109 return 0;
115#endif /* PARANOID */ 110#endif /* PARANOID */
116 }
117
118 switch ( type )
119 {
120 case 000: /* fld m32real */
121 clear_C1();
122 loaded_tag = FPU_load_single((float __user *)data_address, &loaded_data);
123 if ( (loaded_tag == TAG_Special)
124 && isNaN(&loaded_data)
125 && (real_1op_NaN(&loaded_data) < 0) )
126 {
127 top++;
128 break;
129 }
130 FPU_copy_to_reg0(&loaded_data, loaded_tag);
131 break;
132 case 001: /* fild m32int */
133 clear_C1();
134 loaded_tag = FPU_load_int32((long __user *)data_address, &loaded_data);
135 FPU_copy_to_reg0(&loaded_data, loaded_tag);
136 break;
137 case 002: /* fld m64real */
138 clear_C1();
139 loaded_tag = FPU_load_double((double __user *)data_address, &loaded_data);
140 if ( (loaded_tag == TAG_Special)
141 && isNaN(&loaded_data)
142 && (real_1op_NaN(&loaded_data) < 0) )
143 {
144 top++;
145 break;
146 } 111 }
147 FPU_copy_to_reg0(&loaded_data, loaded_tag); 112
148 break; 113 switch (type) {
149 case 003: /* fild m16int */ 114 case 000: /* fld m32real */
150 clear_C1(); 115 clear_C1();
151 loaded_tag = FPU_load_int16((short __user *)data_address, &loaded_data); 116 loaded_tag =
152 FPU_copy_to_reg0(&loaded_data, loaded_tag); 117 FPU_load_single((float __user *)data_address, &loaded_data);
153 break; 118 if ((loaded_tag == TAG_Special)
154 case 010: /* fst m32real */ 119 && isNaN(&loaded_data)
155 clear_C1(); 120 && (real_1op_NaN(&loaded_data) < 0)) {
156 FPU_store_single(st0_ptr, st0_tag, (float __user *)data_address); 121 top++;
157 break; 122 break;
158 case 011: /* fist m32int */ 123 }
159 clear_C1(); 124 FPU_copy_to_reg0(&loaded_data, loaded_tag);
160 FPU_store_int32(st0_ptr, st0_tag, (long __user *)data_address); 125 break;
161 break; 126 case 001: /* fild m32int */
162 case 012: /* fst m64real */ 127 clear_C1();
163 clear_C1(); 128 loaded_tag =
164 FPU_store_double(st0_ptr, st0_tag, (double __user *)data_address); 129 FPU_load_int32((long __user *)data_address, &loaded_data);
165 break; 130 FPU_copy_to_reg0(&loaded_data, loaded_tag);
166 case 013: /* fist m16int */ 131 break;
167 clear_C1(); 132 case 002: /* fld m64real */
168 FPU_store_int16(st0_ptr, st0_tag, (short __user *)data_address); 133 clear_C1();
169 break; 134 loaded_tag =
170 case 014: /* fstp m32real */ 135 FPU_load_double((double __user *)data_address,
171 clear_C1(); 136 &loaded_data);
172 if ( FPU_store_single(st0_ptr, st0_tag, (float __user *)data_address) ) 137 if ((loaded_tag == TAG_Special)
173 pop_0(); /* pop only if the number was actually stored 138 && isNaN(&loaded_data)
174 (see the 80486 manual p16-28) */ 139 && (real_1op_NaN(&loaded_data) < 0)) {
175 break; 140 top++;
176 case 015: /* fistp m32int */ 141 break;
177 clear_C1(); 142 }
178 if ( FPU_store_int32(st0_ptr, st0_tag, (long __user *)data_address) ) 143 FPU_copy_to_reg0(&loaded_data, loaded_tag);
179 pop_0(); /* pop only if the number was actually stored 144 break;
180 (see the 80486 manual p16-28) */ 145 case 003: /* fild m16int */
181 break; 146 clear_C1();
182 case 016: /* fstp m64real */ 147 loaded_tag =
183 clear_C1(); 148 FPU_load_int16((short __user *)data_address, &loaded_data);
184 if ( FPU_store_double(st0_ptr, st0_tag, (double __user *)data_address) ) 149 FPU_copy_to_reg0(&loaded_data, loaded_tag);
185 pop_0(); /* pop only if the number was actually stored 150 break;
186 (see the 80486 manual p16-28) */ 151 case 010: /* fst m32real */
187 break; 152 clear_C1();
188 case 017: /* fistp m16int */ 153 FPU_store_single(st0_ptr, st0_tag,
189 clear_C1(); 154 (float __user *)data_address);
190 if ( FPU_store_int16(st0_ptr, st0_tag, (short __user *)data_address) ) 155 break;
191 pop_0(); /* pop only if the number was actually stored 156 case 011: /* fist m32int */
192 (see the 80486 manual p16-28) */ 157 clear_C1();
193 break; 158 FPU_store_int32(st0_ptr, st0_tag, (long __user *)data_address);
194 case 020: /* fldenv m14/28byte */ 159 break;
195 fldenv(addr_modes, (u_char __user *)data_address); 160 case 012: /* fst m64real */
196 /* Ensure that the values just loaded are not changed by 161 clear_C1();
197 fix-up operations. */ 162 FPU_store_double(st0_ptr, st0_tag,
198 return 1; 163 (double __user *)data_address);
199 case 022: /* frstor m94/108byte */ 164 break;
200 frstor(addr_modes, (u_char __user *)data_address); 165 case 013: /* fist m16int */
201 /* Ensure that the values just loaded are not changed by 166 clear_C1();
202 fix-up operations. */ 167 FPU_store_int16(st0_ptr, st0_tag, (short __user *)data_address);
203 return 1; 168 break;
204 case 023: /* fbld m80dec */ 169 case 014: /* fstp m32real */
205 clear_C1(); 170 clear_C1();
206 loaded_tag = FPU_load_bcd((u_char __user *)data_address); 171 if (FPU_store_single
207 FPU_settag0(loaded_tag); 172 (st0_ptr, st0_tag, (float __user *)data_address))
208 break; 173 pop_0(); /* pop only if the number was actually stored
209 case 024: /* fldcw */ 174 (see the 80486 manual p16-28) */
210 RE_ENTRANT_CHECK_OFF; 175 break;
211 FPU_access_ok(VERIFY_READ, data_address, 2); 176 case 015: /* fistp m32int */
212 FPU_get_user(control_word, (unsigned short __user *) data_address); 177 clear_C1();
213 RE_ENTRANT_CHECK_ON; 178 if (FPU_store_int32
214 if ( partial_status & ~control_word & CW_Exceptions ) 179 (st0_ptr, st0_tag, (long __user *)data_address))
215 partial_status |= (SW_Summary | SW_Backward); 180 pop_0(); /* pop only if the number was actually stored
216 else 181 (see the 80486 manual p16-28) */
217 partial_status &= ~(SW_Summary | SW_Backward); 182 break;
183 case 016: /* fstp m64real */
184 clear_C1();
185 if (FPU_store_double
186 (st0_ptr, st0_tag, (double __user *)data_address))
187 pop_0(); /* pop only if the number was actually stored
188 (see the 80486 manual p16-28) */
189 break;
190 case 017: /* fistp m16int */
191 clear_C1();
192 if (FPU_store_int16
193 (st0_ptr, st0_tag, (short __user *)data_address))
194 pop_0(); /* pop only if the number was actually stored
195 (see the 80486 manual p16-28) */
196 break;
197 case 020: /* fldenv m14/28byte */
198 fldenv(addr_modes, (u_char __user *) data_address);
199 /* Ensure that the values just loaded are not changed by
200 fix-up operations. */
201 return 1;
202 case 022: /* frstor m94/108byte */
203 frstor(addr_modes, (u_char __user *) data_address);
204 /* Ensure that the values just loaded are not changed by
205 fix-up operations. */
206 return 1;
207 case 023: /* fbld m80dec */
208 clear_C1();
209 loaded_tag = FPU_load_bcd((u_char __user *) data_address);
210 FPU_settag0(loaded_tag);
211 break;
212 case 024: /* fldcw */
213 RE_ENTRANT_CHECK_OFF;
214 FPU_access_ok(VERIFY_READ, data_address, 2);
215 FPU_get_user(control_word,
216 (unsigned short __user *)data_address);
217 RE_ENTRANT_CHECK_ON;
218 if (partial_status & ~control_word & CW_Exceptions)
219 partial_status |= (SW_Summary | SW_Backward);
220 else
221 partial_status &= ~(SW_Summary | SW_Backward);
218#ifdef PECULIAR_486 222#ifdef PECULIAR_486
219 control_word |= 0x40; /* An 80486 appears to always set this bit */ 223 control_word |= 0x40; /* An 80486 appears to always set this bit */
220#endif /* PECULIAR_486 */ 224#endif /* PECULIAR_486 */
221 return 1; 225 return 1;
222 case 025: /* fld m80real */ 226 case 025: /* fld m80real */
223 clear_C1(); 227 clear_C1();
224 loaded_tag = FPU_load_extended((long double __user *)data_address, 0); 228 loaded_tag =
225 FPU_settag0(loaded_tag); 229 FPU_load_extended((long double __user *)data_address, 0);
226 break; 230 FPU_settag0(loaded_tag);
227 case 027: /* fild m64int */ 231 break;
228 clear_C1(); 232 case 027: /* fild m64int */
229 loaded_tag = FPU_load_int64((long long __user *)data_address); 233 clear_C1();
230 if (loaded_tag == TAG_Error) 234 loaded_tag = FPU_load_int64((long long __user *)data_address);
235 if (loaded_tag == TAG_Error)
236 return 0;
237 FPU_settag0(loaded_tag);
238 break;
239 case 030: /* fstenv m14/28byte */
240 fstenv(addr_modes, (u_char __user *) data_address);
241 return 1;
242 case 032: /* fsave */
243 fsave(addr_modes, (u_char __user *) data_address);
244 return 1;
245 case 033: /* fbstp m80dec */
246 clear_C1();
247 if (FPU_store_bcd
248 (st0_ptr, st0_tag, (u_char __user *) data_address))
249 pop_0(); /* pop only if the number was actually stored
250 (see the 80486 manual p16-28) */
251 break;
252 case 034: /* fstcw m16int */
253 RE_ENTRANT_CHECK_OFF;
254 FPU_access_ok(VERIFY_WRITE, data_address, 2);
255 FPU_put_user(control_word,
256 (unsigned short __user *)data_address);
257 RE_ENTRANT_CHECK_ON;
258 return 1;
259 case 035: /* fstp m80real */
260 clear_C1();
261 if (FPU_store_extended
262 (st0_ptr, st0_tag, (long double __user *)data_address))
263 pop_0(); /* pop only if the number was actually stored
264 (see the 80486 manual p16-28) */
265 break;
266 case 036: /* fstsw m2byte */
267 RE_ENTRANT_CHECK_OFF;
268 FPU_access_ok(VERIFY_WRITE, data_address, 2);
269 FPU_put_user(status_word(),
270 (unsigned short __user *)data_address);
271 RE_ENTRANT_CHECK_ON;
272 return 1;
273 case 037: /* fistp m64int */
274 clear_C1();
275 if (FPU_store_int64
276 (st0_ptr, st0_tag, (long long __user *)data_address))
277 pop_0(); /* pop only if the number was actually stored
278 (see the 80486 manual p16-28) */
279 break;
280 }
231 return 0; 281 return 0;
232 FPU_settag0(loaded_tag);
233 break;
234 case 030: /* fstenv m14/28byte */
235 fstenv(addr_modes, (u_char __user *)data_address);
236 return 1;
237 case 032: /* fsave */
238 fsave(addr_modes, (u_char __user *)data_address);
239 return 1;
240 case 033: /* fbstp m80dec */
241 clear_C1();
242 if ( FPU_store_bcd(st0_ptr, st0_tag, (u_char __user *)data_address) )
243 pop_0(); /* pop only if the number was actually stored
244 (see the 80486 manual p16-28) */
245 break;
246 case 034: /* fstcw m16int */
247 RE_ENTRANT_CHECK_OFF;
248 FPU_access_ok(VERIFY_WRITE,data_address,2);
249 FPU_put_user(control_word, (unsigned short __user *) data_address);
250 RE_ENTRANT_CHECK_ON;
251 return 1;
252 case 035: /* fstp m80real */
253 clear_C1();
254 if ( FPU_store_extended(st0_ptr, st0_tag, (long double __user *)data_address) )
255 pop_0(); /* pop only if the number was actually stored
256 (see the 80486 manual p16-28) */
257 break;
258 case 036: /* fstsw m2byte */
259 RE_ENTRANT_CHECK_OFF;
260 FPU_access_ok(VERIFY_WRITE,data_address,2);
261 FPU_put_user(status_word(),(unsigned short __user *) data_address);
262 RE_ENTRANT_CHECK_ON;
263 return 1;
264 case 037: /* fistp m64int */
265 clear_C1();
266 if ( FPU_store_int64(st0_ptr, st0_tag, (long long __user *)data_address) )
267 pop_0(); /* pop only if the number was actually stored
268 (see the 80486 manual p16-28) */
269 break;
270 }
271 return 0;
272} 282}
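
Several of the store cases above (fstp, fistp, fbstp, fstp m80real) pop st(0) only when the store actually happened, per the "see the 80486 manual p16-28" comments. A hedged sketch of that pattern with stand-in function pointers (store_fn plays the role of FPU_store_single() and friends, which return non-zero only when the destination was written; pop_fn plays the role of pop_0()):

/* Illustrative only: store-then-conditionally-pop, as used by the
 * fstp/fistp/fbstp cases in FPU_load_store(). */
static void store_and_maybe_pop(int (*store_fn)(void *dst), void *dst,
                                void (*pop_fn)(void))
{
        if (store_fn(dst))
                pop_fn();       /* pop only if the value was really stored */
}
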
diff --git a/arch/x86/math-emu/poly.h b/arch/x86/math-emu/poly.h
index 4db798114923..168eb44c93c8 100644
--- a/arch/x86/math-emu/poly.h
+++ b/arch/x86/math-emu/poly.h
@@ -21,9 +21,9 @@
   allows.  9-byte would probably be sufficient.
 */
typedef struct {
        unsigned long lsw;
        unsigned long midw;
        unsigned long msw;
} Xsig;

asmlinkage void mul64(unsigned long long const *a, unsigned long long const *b,
@@ -49,7 +49,6 @@ asmlinkage void div_Xsig(Xsig *x1, const Xsig *x2, const Xsig *dest);
/* Macro to access the 8 ms bytes of an Xsig as a long long */
#define XSIG_LL(x) (*(unsigned long long *)&x.midw)

/*
   Need to run gcc with optimizations on to get these to
   actually be in-line.
@@ -63,59 +62,53 @@ asmlinkage void div_Xsig(Xsig *x1, const Xsig *x2, const Xsig *dest);
static inline unsigned long mul_32_32(const unsigned long arg1,
                                      const unsigned long arg2)
{
        int retval;
        asm volatile ("mull %2; movl %%edx,%%eax":"=a" (retval)
                      :"0"(arg1), "g"(arg2)
                      :"dx");
        return retval;
}

/* Add the 12 byte Xsig x2 to Xsig dest, with no checks for overflow. */
static inline void add_Xsig_Xsig(Xsig *dest, const Xsig *x2)
{
        asm volatile ("movl %1,%%edi; movl %2,%%esi;\n"
                      "movl (%%esi),%%eax; addl %%eax,(%%edi);\n"
                      "movl 4(%%esi),%%eax; adcl %%eax,4(%%edi);\n"
                      "movl 8(%%esi),%%eax; adcl %%eax,8(%%edi);\n":"=g"
                      (*dest):"g"(dest), "g"(x2)
                      :"ax", "si", "di");
}

/* Add the 12 byte Xsig x2 to Xsig dest, adjust exp if overflow occurs. */
/* Note: the constraints in the asm statement didn't always work properly
   with gcc 2.5.8.  Changing from using edi to using ecx got around the
   problem, but keep fingers crossed! */
static inline void add_two_Xsig(Xsig *dest, const Xsig *x2, long int *exp)
{
        asm volatile ("movl %2,%%ecx; movl %3,%%esi;\n"
                      "movl (%%esi),%%eax; addl %%eax,(%%ecx);\n"
                      "movl 4(%%esi),%%eax; adcl %%eax,4(%%ecx);\n"
                      "movl 8(%%esi),%%eax; adcl %%eax,8(%%ecx);\n"
                      "jnc 0f;\n"
                      "rcrl 8(%%ecx); rcrl 4(%%ecx); rcrl (%%ecx)\n"
                      "movl %4,%%ecx; incl (%%ecx)\n"
                      "movl $1,%%eax; jmp 1f;\n"
                      "0: xorl %%eax,%%eax;\n" "1:\n":"=g" (*exp), "=g"(*dest)
                      :"g"(dest), "g"(x2), "g"(exp)
                      :"cx", "si", "ax");
}

/* Negate (subtract from 1.0) the 12 byte Xsig */
/* This is faster in a loop on my 386 than using the "neg" instruction. */
static inline void negate_Xsig(Xsig *x)
{
        asm volatile ("movl %1,%%esi;\n"
                      "xorl %%ecx,%%ecx;\n"
                      "movl %%ecx,%%eax; subl (%%esi),%%eax; movl %%eax,(%%esi);\n"
                      "movl %%ecx,%%eax; sbbl 4(%%esi),%%eax; movl %%eax,4(%%esi);\n"
                      "movl %%ecx,%%eax; sbbl 8(%%esi),%%eax; movl %%eax,8(%%esi);\n":"=g"
                      (*x):"g"(x):"si", "ax", "cx");
}

#endif /* _POLY_H */
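
For readers who do not want to trace the inline asm, here is what add_Xsig_Xsig() amounts to, restated in portable C: a 96-bit add with carry propagation from lsw through midw into msw, with any carry out of msw dropped. This is an illustrative sketch, not the kernel's helper, and it assumes unsigned long is 32 bits (as on i386, the only place this emulator runs):

/* Illustrative only: portable equivalent of add_Xsig_Xsig(). */
static void add_xsig_portable(Xsig *dest, const Xsig *x2)
{
        unsigned long long t;

        t = (unsigned long long)dest->lsw + x2->lsw;
        dest->lsw = (unsigned long)t;
        t = (unsigned long long)dest->midw + x2->midw + (t >> 32);
        dest->midw = (unsigned long)t;
        dest->msw += x2->msw + (unsigned long)(t >> 32);
}
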
diff --git a/arch/x86/math-emu/poly_2xm1.c b/arch/x86/math-emu/poly_2xm1.c
index 9766ad5e9743..b00e9e10cdce 100644
--- a/arch/x86/math-emu/poly_2xm1.c
+++ b/arch/x86/math-emu/poly_2xm1.c
@@ -17,21 +17,19 @@
#include "control_w.h"
#include "poly.h"

#define HIPOWER 11
static const unsigned long long lterms[HIPOWER] = {
        0x0000000000000000LL,   /* This term done separately as 12 bytes */
        0xf5fdeffc162c7543LL,
        0x1c6b08d704a0bfa6LL,
        0x0276556df749cc21LL,
        0x002bb0ffcf14f6b8LL,
        0x0002861225ef751cLL,
        0x00001ffcbfcd5422LL,
        0x00000162c005d5f1LL,
        0x0000000da96ccb1bLL,
        0x0000000078d1b897LL,
        0x000000000422b029LL
};

static const Xsig hiterm = MK_XSIG(0xb17217f7, 0xd1cf79ab, 0xc8a39194);
@@ -45,112 +43,103 @@ static const Xsig shiftterm2 = MK_XSIG(0xb504f333, 0xf9de6484, 0x597d89b3);
static const Xsig shiftterm3 = MK_XSIG(0xd744fcca, 0xd69d6af4, 0x39a68bb9);

static const Xsig *shiftterm[] = { &shiftterm0, &shiftterm1,
        &shiftterm2, &shiftterm3
};

51/*--- poly_2xm1() -----------------------------------------------------------+ 49/*--- poly_2xm1() -----------------------------------------------------------+
52 | Requires st(0) which is TAG_Valid and < 1. | 50 | Requires st(0) which is TAG_Valid and < 1. |
53 +---------------------------------------------------------------------------*/ 51 +---------------------------------------------------------------------------*/
54int poly_2xm1(u_char sign, FPU_REG *arg, FPU_REG *result) 52int poly_2xm1(u_char sign, FPU_REG *arg, FPU_REG *result)
55{ 53{
56 long int exponent, shift; 54 long int exponent, shift;
57 unsigned long long Xll; 55 unsigned long long Xll;
58 Xsig accumulator, Denom, argSignif; 56 Xsig accumulator, Denom, argSignif;
59 u_char tag; 57 u_char tag;
60 58
61 exponent = exponent16(arg); 59 exponent = exponent16(arg);
62 60
63#ifdef PARANOID 61#ifdef PARANOID
64 if ( exponent >= 0 ) /* Don't want a |number| >= 1.0 */ 62 if (exponent >= 0) { /* Don't want a |number| >= 1.0 */
65 { 63 /* Number negative, too large, or not Valid. */
66 /* Number negative, too large, or not Valid. */ 64 EXCEPTION(EX_INTERNAL | 0x127);
67 EXCEPTION(EX_INTERNAL|0x127); 65 return 1;
68 return 1; 66 }
69 }
70#endif /* PARANOID */ 67#endif /* PARANOID */
71 68
72 argSignif.lsw = 0; 69 argSignif.lsw = 0;
73 XSIG_LL(argSignif) = Xll = significand(arg); 70 XSIG_LL(argSignif) = Xll = significand(arg);
74 71
75 if ( exponent == -1 ) 72 if (exponent == -1) {
76 { 73 shift = (argSignif.msw & 0x40000000) ? 3 : 2;
77 shift = (argSignif.msw & 0x40000000) ? 3 : 2; 74 /* subtract 0.5 or 0.75 */
78 /* subtract 0.5 or 0.75 */ 75 exponent -= 2;
79 exponent -= 2; 76 XSIG_LL(argSignif) <<= 2;
80 XSIG_LL(argSignif) <<= 2; 77 Xll <<= 2;
81 Xll <<= 2; 78 } else if (exponent == -2) {
82 } 79 shift = 1;
83 else if ( exponent == -2 ) 80 /* subtract 0.25 */
84 { 81 exponent--;
85 shift = 1; 82 XSIG_LL(argSignif) <<= 1;
86 /* subtract 0.25 */ 83 Xll <<= 1;
87 exponent--; 84 } else
88 XSIG_LL(argSignif) <<= 1; 85 shift = 0;
89 Xll <<= 1; 86
90 } 87 if (exponent < -2) {
91 else 88 /* Shift the argument right by the required places. */
92 shift = 0; 89 if (FPU_shrx(&Xll, -2 - exponent) >= 0x80000000U)
93 90 Xll++; /* round up */
94 if ( exponent < -2 ) 91 }
95 { 92
96 /* Shift the argument right by the required places. */ 93 accumulator.lsw = accumulator.midw = accumulator.msw = 0;
97 if ( FPU_shrx(&Xll, -2-exponent) >= 0x80000000U ) 94 polynomial_Xsig(&accumulator, &Xll, lterms, HIPOWER - 1);
98 Xll++; /* round up */ 95 mul_Xsig_Xsig(&accumulator, &argSignif);
99 } 96 shr_Xsig(&accumulator, 3);
100 97
101 accumulator.lsw = accumulator.midw = accumulator.msw = 0; 98 mul_Xsig_Xsig(&argSignif, &hiterm); /* The leading term */
102 polynomial_Xsig(&accumulator, &Xll, lterms, HIPOWER-1); 99 add_two_Xsig(&accumulator, &argSignif, &exponent);
103 mul_Xsig_Xsig(&accumulator, &argSignif); 100
104 shr_Xsig(&accumulator, 3); 101 if (shift) {
105 102 /* The argument is large, use the identity:
106 mul_Xsig_Xsig(&argSignif, &hiterm); /* The leading term */ 103 f(x+a) = f(a) * (f(x) + 1) - 1;
107 add_two_Xsig(&accumulator, &argSignif, &exponent); 104 */
108 105 shr_Xsig(&accumulator, -exponent);
109 if ( shift ) 106 accumulator.msw |= 0x80000000; /* add 1.0 */
110 { 107 mul_Xsig_Xsig(&accumulator, shiftterm[shift]);
111 /* The argument is large, use the identity: 108 accumulator.msw &= 0x3fffffff; /* subtract 1.0 */
112 f(x+a) = f(a) * (f(x) + 1) - 1; 109 exponent = 1;
113 */ 110 }
114 shr_Xsig(&accumulator, - exponent); 111
115 accumulator.msw |= 0x80000000; /* add 1.0 */ 112 if (sign != SIGN_POS) {
116 mul_Xsig_Xsig(&accumulator, shiftterm[shift]); 113 /* The argument is negative, use the identity:
117 accumulator.msw &= 0x3fffffff; /* subtract 1.0 */ 114 f(-x) = -f(x) / (1 + f(x))
118 exponent = 1; 115 */
119 } 116 Denom.lsw = accumulator.lsw;
120 117 XSIG_LL(Denom) = XSIG_LL(accumulator);
121 if ( sign != SIGN_POS ) 118 if (exponent < 0)
122 { 119 shr_Xsig(&Denom, -exponent);
123 /* The argument is negative, use the identity: 120 else if (exponent > 0) {
124 f(-x) = -f(x) / (1 + f(x)) 121 /* exponent must be 1 here */
125 */ 122 XSIG_LL(Denom) <<= 1;
126 Denom.lsw = accumulator.lsw; 123 if (Denom.lsw & 0x80000000)
127 XSIG_LL(Denom) = XSIG_LL(accumulator); 124 XSIG_LL(Denom) |= 1;
128 if ( exponent < 0 ) 125 (Denom.lsw) <<= 1;
129 shr_Xsig(&Denom, - exponent); 126 }
130 else if ( exponent > 0 ) 127 Denom.msw |= 0x80000000; /* add 1.0 */
131 { 128 div_Xsig(&accumulator, &Denom, &accumulator);
132 /* exponent must be 1 here */
133 XSIG_LL(Denom) <<= 1;
134 if ( Denom.lsw & 0x80000000 )
135 XSIG_LL(Denom) |= 1;
136 (Denom.lsw) <<= 1;
137 } 129 }
138 Denom.msw |= 0x80000000; /* add 1.0 */
139 div_Xsig(&accumulator, &Denom, &accumulator);
140 }
141 130
142 /* Convert to 64 bit signed-compatible */ 131 /* Convert to 64 bit signed-compatible */
143 exponent += round_Xsig(&accumulator); 132 exponent += round_Xsig(&accumulator);
144 133
145 result = &st(0); 134 result = &st(0);
146 significand(result) = XSIG_LL(accumulator); 135 significand(result) = XSIG_LL(accumulator);
147 setexponent16(result, exponent); 136 setexponent16(result, exponent);
148 137
149 tag = FPU_round(result, 1, 0, FULL_PRECISION, sign); 138 tag = FPU_round(result, 1, 0, FULL_PRECISION, sign);
150 139
151 setsign(result, sign); 140 setsign(result, sign);
152 FPU_settag0(tag); 141 FPU_settag0(tag);
153 142
154 return 0; 143 return 0;
155 144
156} 145}
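
poly_2xm1() evaluates f(x) = 2^x - 1 on a reduced argument and then undoes the reduction with the two identities quoted in its comments: for a shifted argument it multiplies (f(x) + 1) by a power-of-two constant from shiftterm[] (which plays the role of 2^a) and subtracts 1, and for a negative argument it uses f(-x) = -f(x) / (1 + f(x)). A quick double-precision check of both identities (illustrative only, not emulator code; needs <math.h>):

#include <math.h>
#include <stdio.h>

int main(void)
{
        double x = 0.3, a = 0.5;        /* arbitrary test values */
        double fx = exp2(x) - 1.0;

        /* 2^(x+a) - 1  ==  2^a * (f(x) + 1) - 1 */
        printf("%g %g\n", exp2(x + a) - 1.0, exp2(a) * (fx + 1.0) - 1.0);
        /* 2^(-x) - 1   ==  -f(x) / (1 + f(x)) */
        printf("%g %g\n", exp2(-x) - 1.0, -fx / (1.0 + fx));
        return 0;
}
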
diff --git a/arch/x86/math-emu/poly_atan.c b/arch/x86/math-emu/poly_atan.c
index 82f702952f69..20c28e58e2d4 100644
--- a/arch/x86/math-emu/poly_atan.c
+++ b/arch/x86/math-emu/poly_atan.c
@@ -18,28 +18,25 @@
#include "control_w.h"
#include "poly.h"

#define HIPOWERon 6             /* odd poly, negative terms */
static const unsigned long long oddnegterms[HIPOWERon] = {
        0x0000000000000000LL,   /* Dummy (not for - 1.0) */
        0x015328437f756467LL,
        0x0005dda27b73dec6LL,
        0x0000226bf2bfb91aLL,
        0x000000ccc439c5f7LL,
        0x0000000355438407LL
};

#define HIPOWERop 6             /* odd poly, positive terms */
static const unsigned long long oddplterms[HIPOWERop] = {
/*      0xaaaaaaaaaaaaaaabLL,  transferred to fixedpterm[] */
        0x0db55a71875c9ac2LL,
        0x0029fce2d67880b0LL,
        0x0000dfd3908b4596LL,
        0x00000550fd61dab4LL,
        0x0000001c9422b3f9LL,
        0x000000003e3301e1LL
};

static const unsigned long long denomterm = 0xebd9b842c5c53a0eLL;
@@ -48,182 +45,164 @@ static const Xsig fixedpterm = MK_XSIG(0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa);
48 45
49static const Xsig pi_signif = MK_XSIG(0xc90fdaa2, 0x2168c234, 0xc4c6628b); 46static const Xsig pi_signif = MK_XSIG(0xc90fdaa2, 0x2168c234, 0xc4c6628b);
50 47
51
52/*--- poly_atan() -----------------------------------------------------------+ 48/*--- poly_atan() -----------------------------------------------------------+
53 | | 49 | |
54 +---------------------------------------------------------------------------*/ 50 +---------------------------------------------------------------------------*/
55void poly_atan(FPU_REG *st0_ptr, u_char st0_tag, 51void poly_atan(FPU_REG *st0_ptr, u_char st0_tag,
56 FPU_REG *st1_ptr, u_char st1_tag) 52 FPU_REG *st1_ptr, u_char st1_tag)
57{ 53{
58 u_char transformed, inverted, 54 u_char transformed, inverted, sign1, sign2;
59 sign1, sign2; 55 int exponent;
60 int exponent; 56 long int dummy_exp;
61 long int dummy_exp; 57 Xsig accumulator, Numer, Denom, accumulatore, argSignif, argSq, argSqSq;
62 Xsig accumulator, Numer, Denom, accumulatore, argSignif, 58 u_char tag;
63 argSq, argSqSq; 59
64 u_char tag; 60 sign1 = getsign(st0_ptr);
65 61 sign2 = getsign(st1_ptr);
66 sign1 = getsign(st0_ptr); 62 if (st0_tag == TAG_Valid) {
67 sign2 = getsign(st1_ptr); 63 exponent = exponent(st0_ptr);
68 if ( st0_tag == TAG_Valid ) 64 } else {
69 { 65 /* This gives non-compatible stack contents... */
70 exponent = exponent(st0_ptr); 66 FPU_to_exp16(st0_ptr, st0_ptr);
71 } 67 exponent = exponent16(st0_ptr);
72 else 68 }
73 { 69 if (st1_tag == TAG_Valid) {
74 /* This gives non-compatible stack contents... */ 70 exponent -= exponent(st1_ptr);
75 FPU_to_exp16(st0_ptr, st0_ptr); 71 } else {
76 exponent = exponent16(st0_ptr); 72 /* This gives non-compatible stack contents... */
77 } 73 FPU_to_exp16(st1_ptr, st1_ptr);
78 if ( st1_tag == TAG_Valid ) 74 exponent -= exponent16(st1_ptr);
79 { 75 }
80 exponent -= exponent(st1_ptr); 76
81 } 77 if ((exponent < 0) || ((exponent == 0) &&
82 else 78 ((st0_ptr->sigh < st1_ptr->sigh) ||
83 { 79 ((st0_ptr->sigh == st1_ptr->sigh) &&
84 /* This gives non-compatible stack contents... */ 80 (st0_ptr->sigl < st1_ptr->sigl))))) {
85 FPU_to_exp16(st1_ptr, st1_ptr); 81 inverted = 1;
86 exponent -= exponent16(st1_ptr); 82 Numer.lsw = Denom.lsw = 0;
87 } 83 XSIG_LL(Numer) = significand(st0_ptr);
88 84 XSIG_LL(Denom) = significand(st1_ptr);
89 if ( (exponent < 0) || ((exponent == 0) && 85 } else {
90 ((st0_ptr->sigh < st1_ptr->sigh) || 86 inverted = 0;
91 ((st0_ptr->sigh == st1_ptr->sigh) && 87 exponent = -exponent;
92 (st0_ptr->sigl < st1_ptr->sigl))) ) ) 88 Numer.lsw = Denom.lsw = 0;
93 { 89 XSIG_LL(Numer) = significand(st1_ptr);
94 inverted = 1; 90 XSIG_LL(Denom) = significand(st0_ptr);
95 Numer.lsw = Denom.lsw = 0; 91 }
96 XSIG_LL(Numer) = significand(st0_ptr); 92 div_Xsig(&Numer, &Denom, &argSignif);
97 XSIG_LL(Denom) = significand(st1_ptr); 93 exponent += norm_Xsig(&argSignif);
98 } 94
99 else 95 if ((exponent >= -1)
100 { 96 || ((exponent == -2) && (argSignif.msw > 0xd413ccd0))) {
101 inverted = 0; 97 /* The argument is greater than sqrt(2)-1 (=0.414213562...) */
102 exponent = -exponent; 98 /* Convert the argument by an identity for atan */
103 Numer.lsw = Denom.lsw = 0; 99 transformed = 1;
104 XSIG_LL(Numer) = significand(st1_ptr); 100
105 XSIG_LL(Denom) = significand(st0_ptr); 101 if (exponent >= 0) {
106 }
107 div_Xsig(&Numer, &Denom, &argSignif);
108 exponent += norm_Xsig(&argSignif);
109
110 if ( (exponent >= -1)
111 || ((exponent == -2) && (argSignif.msw > 0xd413ccd0)) )
112 {
113 /* The argument is greater than sqrt(2)-1 (=0.414213562...) */
114 /* Convert the argument by an identity for atan */
115 transformed = 1;
116
117 if ( exponent >= 0 )
118 {
119#ifdef PARANOID 102#ifdef PARANOID
120 if ( !( (exponent == 0) && 103 if (!((exponent == 0) &&
121 (argSignif.lsw == 0) && (argSignif.midw == 0) && 104 (argSignif.lsw == 0) && (argSignif.midw == 0) &&
122 (argSignif.msw == 0x80000000) ) ) 105 (argSignif.msw == 0x80000000))) {
123 { 106 EXCEPTION(EX_INTERNAL | 0x104); /* There must be a logic error */
124 EXCEPTION(EX_INTERNAL|0x104); /* There must be a logic error */ 107 return;
125 return; 108 }
126 }
127#endif /* PARANOID */ 109#endif /* PARANOID */
128 argSignif.msw = 0; /* Make the transformed arg -> 0.0 */ 110 argSignif.msw = 0; /* Make the transformed arg -> 0.0 */
111 } else {
112 Numer.lsw = Denom.lsw = argSignif.lsw;
113 XSIG_LL(Numer) = XSIG_LL(Denom) = XSIG_LL(argSignif);
114
115 if (exponent < -1)
116 shr_Xsig(&Numer, -1 - exponent);
117 negate_Xsig(&Numer);
118
119 shr_Xsig(&Denom, -exponent);
120 Denom.msw |= 0x80000000;
121
122 div_Xsig(&Numer, &Denom, &argSignif);
123
124 exponent = -1 + norm_Xsig(&argSignif);
125 }
126 } else {
127 transformed = 0;
128 }
129
130 argSq.lsw = argSignif.lsw;
131 argSq.midw = argSignif.midw;
132 argSq.msw = argSignif.msw;
133 mul_Xsig_Xsig(&argSq, &argSq);
134
135 argSqSq.lsw = argSq.lsw;
136 argSqSq.midw = argSq.midw;
137 argSqSq.msw = argSq.msw;
138 mul_Xsig_Xsig(&argSqSq, &argSqSq);
139
140 accumulatore.lsw = argSq.lsw;
141 XSIG_LL(accumulatore) = XSIG_LL(argSq);
142
143 shr_Xsig(&argSq, 2 * (-1 - exponent - 1));
144 shr_Xsig(&argSqSq, 4 * (-1 - exponent - 1));
145
146 /* Now have argSq etc with binary point at the left
147 .1xxxxxxxx */
148
149 /* Do the basic fixed point polynomial evaluation */
150 accumulator.msw = accumulator.midw = accumulator.lsw = 0;
151 polynomial_Xsig(&accumulator, &XSIG_LL(argSqSq),
152 oddplterms, HIPOWERop - 1);
153 mul64_Xsig(&accumulator, &XSIG_LL(argSq));
154 negate_Xsig(&accumulator);
155 polynomial_Xsig(&accumulator, &XSIG_LL(argSqSq), oddnegterms,
156 HIPOWERon - 1);
157 negate_Xsig(&accumulator);
158 add_two_Xsig(&accumulator, &fixedpterm, &dummy_exp);
159
160 mul64_Xsig(&accumulatore, &denomterm);
161 shr_Xsig(&accumulatore, 1 + 2 * (-1 - exponent));
162 accumulatore.msw |= 0x80000000;
163
164 div_Xsig(&accumulator, &accumulatore, &accumulator);
165
166 mul_Xsig_Xsig(&accumulator, &argSignif);
167 mul_Xsig_Xsig(&accumulator, &argSq);
168
169 shr_Xsig(&accumulator, 3);
170 negate_Xsig(&accumulator);
171 add_Xsig_Xsig(&accumulator, &argSignif);
172
173 if (transformed) {
174 /* compute pi/4 - accumulator */
175 shr_Xsig(&accumulator, -1 - exponent);
176 negate_Xsig(&accumulator);
177 add_Xsig_Xsig(&accumulator, &pi_signif);
178 exponent = -1;
179 }
180
181 if (inverted) {
182 /* compute pi/2 - accumulator */
183 shr_Xsig(&accumulator, -exponent);
184 negate_Xsig(&accumulator);
185 add_Xsig_Xsig(&accumulator, &pi_signif);
186 exponent = 0;
129 } 187 }
130 else 188
131 { 189 if (sign1) {
132 Numer.lsw = Denom.lsw = argSignif.lsw; 190 /* compute pi - accumulator */
133 XSIG_LL(Numer) = XSIG_LL(Denom) = XSIG_LL(argSignif); 191 shr_Xsig(&accumulator, 1 - exponent);
134 192 negate_Xsig(&accumulator);
135 if ( exponent < -1 ) 193 add_Xsig_Xsig(&accumulator, &pi_signif);
136 shr_Xsig(&Numer, -1-exponent); 194 exponent = 1;
137 negate_Xsig(&Numer);
138
139 shr_Xsig(&Denom, -exponent);
140 Denom.msw |= 0x80000000;
141
142 div_Xsig(&Numer, &Denom, &argSignif);
143
144 exponent = -1 + norm_Xsig(&argSignif);
145 } 195 }
146 } 196
147 else 197 exponent += round_Xsig(&accumulator);
148 { 198
149 transformed = 0; 199 significand(st1_ptr) = XSIG_LL(accumulator);
150 } 200 setexponent16(st1_ptr, exponent);
151 201
152 argSq.lsw = argSignif.lsw; argSq.midw = argSignif.midw; 202 tag = FPU_round(st1_ptr, 1, 0, FULL_PRECISION, sign2);
153 argSq.msw = argSignif.msw; 203 FPU_settagi(1, tag);
154 mul_Xsig_Xsig(&argSq, &argSq); 204
155 205 set_precision_flag_up(); /* We do not really know if up or down,
156 argSqSq.lsw = argSq.lsw; argSqSq.midw = argSq.midw; argSqSq.msw = argSq.msw; 206 use this as the default. */
157 mul_Xsig_Xsig(&argSqSq, &argSqSq);
158
159 accumulatore.lsw = argSq.lsw;
160 XSIG_LL(accumulatore) = XSIG_LL(argSq);
161
162 shr_Xsig(&argSq, 2*(-1-exponent-1));
163 shr_Xsig(&argSqSq, 4*(-1-exponent-1));
164
165 /* Now have argSq etc with binary point at the left
166 .1xxxxxxxx */
167
168 /* Do the basic fixed point polynomial evaluation */
169 accumulator.msw = accumulator.midw = accumulator.lsw = 0;
170 polynomial_Xsig(&accumulator, &XSIG_LL(argSqSq),
171 oddplterms, HIPOWERop-1);
172 mul64_Xsig(&accumulator, &XSIG_LL(argSq));
173 negate_Xsig(&accumulator);
174 polynomial_Xsig(&accumulator, &XSIG_LL(argSqSq), oddnegterms, HIPOWERon-1);
175 negate_Xsig(&accumulator);
176 add_two_Xsig(&accumulator, &fixedpterm, &dummy_exp);
177
178 mul64_Xsig(&accumulatore, &denomterm);
179 shr_Xsig(&accumulatore, 1 + 2*(-1-exponent));
180 accumulatore.msw |= 0x80000000;
181
182 div_Xsig(&accumulator, &accumulatore, &accumulator);
183
184 mul_Xsig_Xsig(&accumulator, &argSignif);
185 mul_Xsig_Xsig(&accumulator, &argSq);
186
187 shr_Xsig(&accumulator, 3);
188 negate_Xsig(&accumulator);
189 add_Xsig_Xsig(&accumulator, &argSignif);
190
191 if ( transformed )
192 {
193 /* compute pi/4 - accumulator */
194 shr_Xsig(&accumulator, -1-exponent);
195 negate_Xsig(&accumulator);
196 add_Xsig_Xsig(&accumulator, &pi_signif);
197 exponent = -1;
198 }
199
200 if ( inverted )
201 {
202 /* compute pi/2 - accumulator */
203 shr_Xsig(&accumulator, -exponent);
204 negate_Xsig(&accumulator);
205 add_Xsig_Xsig(&accumulator, &pi_signif);
206 exponent = 0;
207 }
208
209 if ( sign1 )
210 {
211 /* compute pi - accumulator */
212 shr_Xsig(&accumulator, 1 - exponent);
213 negate_Xsig(&accumulator);
214 add_Xsig_Xsig(&accumulator, &pi_signif);
215 exponent = 1;
216 }
217
218 exponent += round_Xsig(&accumulator);
219
220 significand(st1_ptr) = XSIG_LL(accumulator);
221 setexponent16(st1_ptr, exponent);
222
223 tag = FPU_round(st1_ptr, 1, 0, FULL_PRECISION, sign2);
224 FPU_settagi(1, tag);
225
226 set_precision_flag_up(); /* We do not really know if up or down,
227 use this as the default. */
228 207
229} 208}
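
poly_atan() implements fpatan, i.e. atan(st1/st0), by reducing the argument with three identities that correspond to its "transformed", "inverted" and sign1 paths: atan(r) = pi/4 - atan((1 - r)/(1 + r)) when the ratio exceeds sqrt(2)-1, atan(y/x) = pi/2 - atan(x/y) when y > x, and a final pi - result when the x operand is negative. A double-precision sanity check of the three reductions (illustrative only; M_PI comes from <math.h>):

#include <math.h>
#include <stdio.h>

int main(void)
{
        double r = 0.8, x = 1.5, y = 2.5;       /* arbitrary positive values */

        printf("%g %g\n", atan(r), M_PI / 4 - atan((1 - r) / (1 + r)));
        printf("%g %g\n", atan(y / x), M_PI / 2 - atan(x / y));
        printf("%g %g\n", atan2(y, -x), M_PI - atan2(y, x));
        return 0;
}
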
diff --git a/arch/x86/math-emu/poly_l2.c b/arch/x86/math-emu/poly_l2.c
index dd00e1d5b074..8e2ff4b28a0a 100644
--- a/arch/x86/math-emu/poly_l2.c
+++ b/arch/x86/math-emu/poly_l2.c
@@ -10,7 +10,6 @@
 |                                                                           |
 +---------------------------------------------------------------------------*/

#include "exception.h"
#include "reg_constant.h"
#include "fpu_emu.h"
@@ -18,184 +17,163 @@
#include "control_w.h"
#include "poly.h"

static void log2_kernel(FPU_REG const *arg, u_char argsign,
                        Xsig * accum_result, long int *expon);

26/*--- poly_l2() -------------------------------------------------------------+ 23/*--- poly_l2() -------------------------------------------------------------+
27 | Base 2 logarithm by a polynomial approximation. | 24 | Base 2 logarithm by a polynomial approximation. |
28 +---------------------------------------------------------------------------*/ 25 +---------------------------------------------------------------------------*/
29void poly_l2(FPU_REG *st0_ptr, FPU_REG *st1_ptr, u_char st1_sign) 26void poly_l2(FPU_REG *st0_ptr, FPU_REG *st1_ptr, u_char st1_sign)
30{ 27{
31 long int exponent, expon, expon_expon; 28 long int exponent, expon, expon_expon;
32 Xsig accumulator, expon_accum, yaccum; 29 Xsig accumulator, expon_accum, yaccum;
33 u_char sign, argsign; 30 u_char sign, argsign;
34 FPU_REG x; 31 FPU_REG x;
35 int tag; 32 int tag;
36 33
37 exponent = exponent16(st0_ptr); 34 exponent = exponent16(st0_ptr);
38 35
39 /* From st0_ptr, make a number > sqrt(2)/2 and < sqrt(2) */ 36 /* From st0_ptr, make a number > sqrt(2)/2 and < sqrt(2) */
40 if ( st0_ptr->sigh > (unsigned)0xb504f334 ) 37 if (st0_ptr->sigh > (unsigned)0xb504f334) {
41 { 38 /* Treat as sqrt(2)/2 < st0_ptr < 1 */
42 /* Treat as sqrt(2)/2 < st0_ptr < 1 */ 39 significand(&x) = -significand(st0_ptr);
43 significand(&x) = - significand(st0_ptr); 40 setexponent16(&x, -1);
44 setexponent16(&x, -1); 41 exponent++;
45 exponent++; 42 argsign = SIGN_NEG;
46 argsign = SIGN_NEG; 43 } else {
47 } 44 /* Treat as 1 <= st0_ptr < sqrt(2) */
48 else 45 x.sigh = st0_ptr->sigh - 0x80000000;
49 { 46 x.sigl = st0_ptr->sigl;
50 /* Treat as 1 <= st0_ptr < sqrt(2) */ 47 setexponent16(&x, 0);
51 x.sigh = st0_ptr->sigh - 0x80000000; 48 argsign = SIGN_POS;
52 x.sigl = st0_ptr->sigl; 49 }
53 setexponent16(&x, 0); 50 tag = FPU_normalize_nuo(&x);
54 argsign = SIGN_POS;
55 }
56 tag = FPU_normalize_nuo(&x);
57
58 if ( tag == TAG_Zero )
59 {
60 expon = 0;
61 accumulator.msw = accumulator.midw = accumulator.lsw = 0;
62 }
63 else
64 {
65 log2_kernel(&x, argsign, &accumulator, &expon);
66 }
67
68 if ( exponent < 0 )
69 {
70 sign = SIGN_NEG;
71 exponent = -exponent;
72 }
73 else
74 sign = SIGN_POS;
75 expon_accum.msw = exponent; expon_accum.midw = expon_accum.lsw = 0;
76 if ( exponent )
77 {
78 expon_expon = 31 + norm_Xsig(&expon_accum);
79 shr_Xsig(&accumulator, expon_expon - expon);
80
81 if ( sign ^ argsign )
82 negate_Xsig(&accumulator);
83 add_Xsig_Xsig(&accumulator, &expon_accum);
84 }
85 else
86 {
87 expon_expon = expon;
88 sign = argsign;
89 }
90
91 yaccum.lsw = 0; XSIG_LL(yaccum) = significand(st1_ptr);
92 mul_Xsig_Xsig(&accumulator, &yaccum);
93
94 expon_expon += round_Xsig(&accumulator);
95
96 if ( accumulator.msw == 0 )
97 {
98 FPU_copy_to_reg1(&CONST_Z, TAG_Zero);
99 return;
100 }
101
102 significand(st1_ptr) = XSIG_LL(accumulator);
103 setexponent16(st1_ptr, expon_expon + exponent16(st1_ptr) + 1);
104
105 tag = FPU_round(st1_ptr, 1, 0, FULL_PRECISION, sign ^ st1_sign);
106 FPU_settagi(1, tag);
107
108 set_precision_flag_up(); /* 80486 appears to always do this */
109
110 return;
111 51
112} 52 if (tag == TAG_Zero) {
53 expon = 0;
54 accumulator.msw = accumulator.midw = accumulator.lsw = 0;
55 } else {
56 log2_kernel(&x, argsign, &accumulator, &expon);
57 }
58
59 if (exponent < 0) {
60 sign = SIGN_NEG;
61 exponent = -exponent;
62 } else
63 sign = SIGN_POS;
64 expon_accum.msw = exponent;
65 expon_accum.midw = expon_accum.lsw = 0;
66 if (exponent) {
67 expon_expon = 31 + norm_Xsig(&expon_accum);
68 shr_Xsig(&accumulator, expon_expon - expon);
69
70 if (sign ^ argsign)
71 negate_Xsig(&accumulator);
72 add_Xsig_Xsig(&accumulator, &expon_accum);
73 } else {
74 expon_expon = expon;
75 sign = argsign;
76 }
77
78 yaccum.lsw = 0;
79 XSIG_LL(yaccum) = significand(st1_ptr);
80 mul_Xsig_Xsig(&accumulator, &yaccum);
81
82 expon_expon += round_Xsig(&accumulator);
83
84 if (accumulator.msw == 0) {
85 FPU_copy_to_reg1(&CONST_Z, TAG_Zero);
86 return;
87 }
88
89 significand(st1_ptr) = XSIG_LL(accumulator);
90 setexponent16(st1_ptr, expon_expon + exponent16(st1_ptr) + 1);
113 91
92 tag = FPU_round(st1_ptr, 1, 0, FULL_PRECISION, sign ^ st1_sign);
93 FPU_settagi(1, tag);
94
95 set_precision_flag_up(); /* 80486 appears to always do this */
96
97 return;
98
99}
114 100
115/*--- poly_l2p1() -----------------------------------------------------------+ 101/*--- poly_l2p1() -----------------------------------------------------------+
116 | Base 2 logarithm by a polynomial approximation. | 102 | Base 2 logarithm by a polynomial approximation. |
117 | log2(x+1) | 103 | log2(x+1) |
118 +---------------------------------------------------------------------------*/ 104 +---------------------------------------------------------------------------*/
119int poly_l2p1(u_char sign0, u_char sign1, 105int poly_l2p1(u_char sign0, u_char sign1,
120 FPU_REG *st0_ptr, FPU_REG *st1_ptr, FPU_REG *dest) 106 FPU_REG * st0_ptr, FPU_REG * st1_ptr, FPU_REG * dest)
121{ 107{
122 u_char tag; 108 u_char tag;
123 long int exponent; 109 long int exponent;
124 Xsig accumulator, yaccum; 110 Xsig accumulator, yaccum;
125 111
126 if ( exponent16(st0_ptr) < 0 ) 112 if (exponent16(st0_ptr) < 0) {
127 { 113 log2_kernel(st0_ptr, sign0, &accumulator, &exponent);
128 log2_kernel(st0_ptr, sign0, &accumulator, &exponent);
129 114
130 yaccum.lsw = 0; 115 yaccum.lsw = 0;
131 XSIG_LL(yaccum) = significand(st1_ptr); 116 XSIG_LL(yaccum) = significand(st1_ptr);
132 mul_Xsig_Xsig(&accumulator, &yaccum); 117 mul_Xsig_Xsig(&accumulator, &yaccum);
133 118
134 exponent += round_Xsig(&accumulator); 119 exponent += round_Xsig(&accumulator);
135 120
136 exponent += exponent16(st1_ptr) + 1; 121 exponent += exponent16(st1_ptr) + 1;
137 if ( exponent < EXP_WAY_UNDER ) exponent = EXP_WAY_UNDER; 122 if (exponent < EXP_WAY_UNDER)
123 exponent = EXP_WAY_UNDER;
138 124
139 significand(dest) = XSIG_LL(accumulator); 125 significand(dest) = XSIG_LL(accumulator);
140 setexponent16(dest, exponent); 126 setexponent16(dest, exponent);
141 127
142 tag = FPU_round(dest, 1, 0, FULL_PRECISION, sign0 ^ sign1); 128 tag = FPU_round(dest, 1, 0, FULL_PRECISION, sign0 ^ sign1);
143 FPU_settagi(1, tag); 129 FPU_settagi(1, tag);
144 130
145 if ( tag == TAG_Valid ) 131 if (tag == TAG_Valid)
146 set_precision_flag_up(); /* 80486 appears to always do this */ 132 set_precision_flag_up(); /* 80486 appears to always do this */
147 } 133 } else {
148 else 134 /* The magnitude of st0_ptr is far too large. */
149 {
150 /* The magnitude of st0_ptr is far too large. */
151 135
152 if ( sign0 != SIGN_POS ) 136 if (sign0 != SIGN_POS) {
153 { 137 /* Trying to get the log of a negative number. */
154 /* Trying to get the log of a negative number. */ 138#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */
155#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */ 139 changesign(st1_ptr);
156 changesign(st1_ptr);
157#else 140#else
158 if ( arith_invalid(1) < 0 ) 141 if (arith_invalid(1) < 0)
159 return 1; 142 return 1;
160#endif /* PECULIAR_486 */ 143#endif /* PECULIAR_486 */
161 } 144 }
162 145
163 /* 80486 appears to do this */ 146 /* 80486 appears to do this */
164 if ( sign0 == SIGN_NEG ) 147 if (sign0 == SIGN_NEG)
165 set_precision_flag_down(); 148 set_precision_flag_down();
166 else 149 else
167 set_precision_flag_up(); 150 set_precision_flag_up();
168 } 151 }
169 152
170 if ( exponent(dest) <= EXP_UNDER ) 153 if (exponent(dest) <= EXP_UNDER)
171 EXCEPTION(EX_Underflow); 154 EXCEPTION(EX_Underflow);
172 155
173 return 0; 156 return 0;
174 157
175} 158}
176 159
177
178
179
180#undef HIPOWER 160#undef HIPOWER
181#define HIPOWER 10 161#define HIPOWER 10
182static const unsigned long long logterms[HIPOWER] = 162static const unsigned long long logterms[HIPOWER] = {
183{ 163 0x2a8eca5705fc2ef0LL,
184 0x2a8eca5705fc2ef0LL, 164 0xf6384ee1d01febceLL,
185 0xf6384ee1d01febceLL, 165 0x093bb62877cdf642LL,
186 0x093bb62877cdf642LL, 166 0x006985d8a9ec439bLL,
187 0x006985d8a9ec439bLL, 167 0x0005212c4f55a9c8LL,
188 0x0005212c4f55a9c8LL, 168 0x00004326a16927f0LL,
189 0x00004326a16927f0LL, 169 0x0000038d1d80a0e7LL,
190 0x0000038d1d80a0e7LL, 170 0x0000003141cc80c6LL,
191 0x0000003141cc80c6LL, 171 0x00000002b1668c9fLL,
192 0x00000002b1668c9fLL, 172 0x000000002c7a46aaLL
193 0x000000002c7a46aaLL
194}; 173};
195 174
196static const unsigned long leadterm = 0xb8000000; 175static const unsigned long leadterm = 0xb8000000;
197 176
198
199/*--- log2_kernel() ---------------------------------------------------------+ 177/*--- log2_kernel() ---------------------------------------------------------+
200 | Base 2 logarithm by a polynomial approximation. | 178 | Base 2 logarithm by a polynomial approximation. |
201 | log2(x+1) | 179 | log2(x+1) |
@@ -203,70 +181,64 @@ static const unsigned long leadterm = 0xb8000000;
203static void log2_kernel(FPU_REG const *arg, u_char argsign, Xsig *accum_result, 181static void log2_kernel(FPU_REG const *arg, u_char argsign, Xsig *accum_result,
204 long int *expon) 182 long int *expon)
205{ 183{
206 long int exponent, adj; 184 long int exponent, adj;
207 unsigned long long Xsq; 185 unsigned long long Xsq;
208 Xsig accumulator, Numer, Denom, argSignif, arg_signif; 186 Xsig accumulator, Numer, Denom, argSignif, arg_signif;
209 187
210 exponent = exponent16(arg); 188 exponent = exponent16(arg);
211 Numer.lsw = Denom.lsw = 0; 189 Numer.lsw = Denom.lsw = 0;
212 XSIG_LL(Numer) = XSIG_LL(Denom) = significand(arg); 190 XSIG_LL(Numer) = XSIG_LL(Denom) = significand(arg);
213 if ( argsign == SIGN_POS ) 191 if (argsign == SIGN_POS) {
214 { 192 shr_Xsig(&Denom, 2 - (1 + exponent));
215 shr_Xsig(&Denom, 2 - (1 + exponent)); 193 Denom.msw |= 0x80000000;
216 Denom.msw |= 0x80000000; 194 div_Xsig(&Numer, &Denom, &argSignif);
217 div_Xsig(&Numer, &Denom, &argSignif); 195 } else {
218 } 196 shr_Xsig(&Denom, 1 - (1 + exponent));
219 else 197 negate_Xsig(&Denom);
220 { 198 if (Denom.msw & 0x80000000) {
221 shr_Xsig(&Denom, 1 - (1 + exponent)); 199 div_Xsig(&Numer, &Denom, &argSignif);
222 negate_Xsig(&Denom); 200 exponent++;
223 if ( Denom.msw & 0x80000000 ) 201 } else {
224 { 202 /* Denom must be 1.0 */
225 div_Xsig(&Numer, &Denom, &argSignif); 203 argSignif.lsw = Numer.lsw;
226 exponent ++; 204 argSignif.midw = Numer.midw;
227 } 205 argSignif.msw = Numer.msw;
228 else 206 }
229 {
230 /* Denom must be 1.0 */
231 argSignif.lsw = Numer.lsw; argSignif.midw = Numer.midw;
232 argSignif.msw = Numer.msw;
233 } 207 }
234 }
235 208
236#ifndef PECULIAR_486 209#ifndef PECULIAR_486
237 /* Should check here that |local_arg| is within the valid range */ 210 /* Should check here that |local_arg| is within the valid range */
238 if ( exponent >= -2 ) 211 if (exponent >= -2) {
239 { 212 if ((exponent > -2) || (argSignif.msw > (unsigned)0xafb0ccc0)) {
240 if ( (exponent > -2) || 213 /* The argument is too large */
241 (argSignif.msw > (unsigned)0xafb0ccc0) ) 214 }
242 {
243 /* The argument is too large */
244 } 215 }
245 }
246#endif /* PECULIAR_486 */ 216#endif /* PECULIAR_486 */
247 217
248 arg_signif.lsw = argSignif.lsw; XSIG_LL(arg_signif) = XSIG_LL(argSignif); 218 arg_signif.lsw = argSignif.lsw;
249 adj = norm_Xsig(&argSignif); 219 XSIG_LL(arg_signif) = XSIG_LL(argSignif);
250 accumulator.lsw = argSignif.lsw; XSIG_LL(accumulator) = XSIG_LL(argSignif); 220 adj = norm_Xsig(&argSignif);
251 mul_Xsig_Xsig(&accumulator, &accumulator); 221 accumulator.lsw = argSignif.lsw;
252 shr_Xsig(&accumulator, 2*(-1 - (1 + exponent + adj))); 222 XSIG_LL(accumulator) = XSIG_LL(argSignif);
253 Xsq = XSIG_LL(accumulator); 223 mul_Xsig_Xsig(&accumulator, &accumulator);
254 if ( accumulator.lsw & 0x80000000 ) 224 shr_Xsig(&accumulator, 2 * (-1 - (1 + exponent + adj)));
255 Xsq++; 225 Xsq = XSIG_LL(accumulator);
256 226 if (accumulator.lsw & 0x80000000)
257 accumulator.msw = accumulator.midw = accumulator.lsw = 0; 227 Xsq++;
258 /* Do the basic fixed point polynomial evaluation */ 228
259 polynomial_Xsig(&accumulator, &Xsq, logterms, HIPOWER-1); 229 accumulator.msw = accumulator.midw = accumulator.lsw = 0;
260 230 /* Do the basic fixed point polynomial evaluation */
261 mul_Xsig_Xsig(&accumulator, &argSignif); 231 polynomial_Xsig(&accumulator, &Xsq, logterms, HIPOWER - 1);
262 shr_Xsig(&accumulator, 6 - adj); 232
263 233 mul_Xsig_Xsig(&accumulator, &argSignif);
264 mul32_Xsig(&arg_signif, leadterm); 234 shr_Xsig(&accumulator, 6 - adj);
265 add_two_Xsig(&accumulator, &arg_signif, &exponent); 235
266 236 mul32_Xsig(&arg_signif, leadterm);
267 *expon = exponent + 1; 237 add_two_Xsig(&accumulator, &arg_signif, &exponent);
268 accum_result->lsw = accumulator.lsw; 238
269 accum_result->midw = accumulator.midw; 239 *expon = exponent + 1;
270 accum_result->msw = accumulator.msw; 240 accum_result->lsw = accumulator.lsw;
241 accum_result->midw = accumulator.midw;
242 accum_result->msw = accumulator.msw;
271 243
272} 244}
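As a rough illustration of what the poly_l2.c hunks above compute: log2_kernel() forms a ratio Numer/Denom from the significand and then runs a fixed-point series (the ten-entry logterms[] table plus leadterm). A common identity that fits this shape, stated here as an assumption rather than a claim about the exact coefficients, is ln(1+x) = 2*atanh(x/(x+2)). A plain-double sketch with a hypothetical helper name:

/*
 * Sketch only, not part of the patch.
 *   ln(1+x)   = 2 * (t + t^3/3 + t^5/5 + ...),  t = x/(x+2)
 *   log2(1+x) = ln(1+x) / ln(2)
 */
static double log2p1_sketch(double x)
{
	const double ln2 = 0.693147180559945309;
	double t = x / (x + 2.0);
	double t2 = t * t;
	/* first few series terms; the emulator keeps ten 64-bit coefficients */
	double s = t * (1.0 + t2 * (1.0 / 3 + t2 * (1.0 / 5 + t2 * (1.0 / 7))));

	return 2.0 * s / ln2;
}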
diff --git a/arch/x86/math-emu/poly_sin.c b/arch/x86/math-emu/poly_sin.c
index a36313fb06f1..b862039c728e 100644
--- a/arch/x86/math-emu/poly_sin.c
+++ b/arch/x86/math-emu/poly_sin.c
@@ -11,7 +11,6 @@
11 | | 11 | |
12 +---------------------------------------------------------------------------*/ 12 +---------------------------------------------------------------------------*/
13 13
14
15#include "exception.h" 14#include "exception.h"
16#include "reg_constant.h" 15#include "reg_constant.h"
17#include "fpu_emu.h" 16#include "fpu_emu.h"
@@ -19,379 +18,361 @@
19#include "control_w.h" 18#include "control_w.h"
20#include "poly.h" 19#include "poly.h"
21 20
22
23#define N_COEFF_P 4 21#define N_COEFF_P 4
24#define N_COEFF_N 4 22#define N_COEFF_N 4
25 23
26static const unsigned long long pos_terms_l[N_COEFF_P] = 24static const unsigned long long pos_terms_l[N_COEFF_P] = {
27{ 25 0xaaaaaaaaaaaaaaabLL,
28 0xaaaaaaaaaaaaaaabLL, 26 0x00d00d00d00cf906LL,
29 0x00d00d00d00cf906LL, 27 0x000006b99159a8bbLL,
30 0x000006b99159a8bbLL, 28 0x000000000d7392e6LL
31 0x000000000d7392e6LL
32}; 29};
33 30
34static const unsigned long long neg_terms_l[N_COEFF_N] = 31static const unsigned long long neg_terms_l[N_COEFF_N] = {
35{ 32 0x2222222222222167LL,
36 0x2222222222222167LL, 33 0x0002e3bc74aab624LL,
37 0x0002e3bc74aab624LL, 34 0x0000000b09229062LL,
38 0x0000000b09229062LL, 35 0x00000000000c7973LL
39 0x00000000000c7973LL
40}; 36};
41 37
42
43
44#define N_COEFF_PH 4 38#define N_COEFF_PH 4
45#define N_COEFF_NH 4 39#define N_COEFF_NH 4
46static const unsigned long long pos_terms_h[N_COEFF_PH] = 40static const unsigned long long pos_terms_h[N_COEFF_PH] = {
47{ 41 0x0000000000000000LL,
48 0x0000000000000000LL, 42 0x05b05b05b05b0406LL,
49 0x05b05b05b05b0406LL, 43 0x000049f93edd91a9LL,
50 0x000049f93edd91a9LL, 44 0x00000000c9c9ed62LL
51 0x00000000c9c9ed62LL
52}; 45};
53 46
54static const unsigned long long neg_terms_h[N_COEFF_NH] = 47static const unsigned long long neg_terms_h[N_COEFF_NH] = {
55{ 48 0xaaaaaaaaaaaaaa98LL,
56 0xaaaaaaaaaaaaaa98LL, 49 0x001a01a01a019064LL,
57 0x001a01a01a019064LL, 50 0x0000008f76c68a77LL,
58 0x0000008f76c68a77LL, 51 0x0000000000d58f5eLL
59 0x0000000000d58f5eLL
60}; 52};
61 53
62
63/*--- poly_sine() -----------------------------------------------------------+ 54/*--- poly_sine() -----------------------------------------------------------+
64 | | 55 | |
65 +---------------------------------------------------------------------------*/ 56 +---------------------------------------------------------------------------*/
66void poly_sine(FPU_REG *st0_ptr) 57void poly_sine(FPU_REG *st0_ptr)
67{ 58{
68 int exponent, echange; 59 int exponent, echange;
69 Xsig accumulator, argSqrd, argTo4; 60 Xsig accumulator, argSqrd, argTo4;
70 unsigned long fix_up, adj; 61 unsigned long fix_up, adj;
71 unsigned long long fixed_arg; 62 unsigned long long fixed_arg;
72 FPU_REG result; 63 FPU_REG result;
73 64
74 exponent = exponent(st0_ptr); 65 exponent = exponent(st0_ptr);
75 66
76 accumulator.lsw = accumulator.midw = accumulator.msw = 0; 67 accumulator.lsw = accumulator.midw = accumulator.msw = 0;
77 68
78 /* Split into two ranges, for arguments below and above 1.0 */ 69 /* Split into two ranges, for arguments below and above 1.0 */
79 /* The boundary between upper and lower is approx 0.88309101259 */ 70 /* The boundary between upper and lower is approx 0.88309101259 */
80 if ( (exponent < -1) || ((exponent == -1) && (st0_ptr->sigh <= 0xe21240aa)) ) 71 if ((exponent < -1)
81 { 72 || ((exponent == -1) && (st0_ptr->sigh <= 0xe21240aa))) {
82 /* The argument is <= 0.88309101259 */ 73 /* The argument is <= 0.88309101259 */
74
75 argSqrd.msw = st0_ptr->sigh;
76 argSqrd.midw = st0_ptr->sigl;
77 argSqrd.lsw = 0;
78 mul64_Xsig(&argSqrd, &significand(st0_ptr));
79 shr_Xsig(&argSqrd, 2 * (-1 - exponent));
80 argTo4.msw = argSqrd.msw;
81 argTo4.midw = argSqrd.midw;
82 argTo4.lsw = argSqrd.lsw;
83 mul_Xsig_Xsig(&argTo4, &argTo4);
83 84
84 argSqrd.msw = st0_ptr->sigh; argSqrd.midw = st0_ptr->sigl; argSqrd.lsw = 0; 85 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_l,
85 mul64_Xsig(&argSqrd, &significand(st0_ptr)); 86 N_COEFF_N - 1);
86 shr_Xsig(&argSqrd, 2*(-1-exponent)); 87 mul_Xsig_Xsig(&accumulator, &argSqrd);
87 argTo4.msw = argSqrd.msw; argTo4.midw = argSqrd.midw; 88 negate_Xsig(&accumulator);
88 argTo4.lsw = argSqrd.lsw;
89 mul_Xsig_Xsig(&argTo4, &argTo4);
90 89
91 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_l, 90 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_l,
92 N_COEFF_N-1); 91 N_COEFF_P - 1);
93 mul_Xsig_Xsig(&accumulator, &argSqrd);
94 negate_Xsig(&accumulator);
95 92
96 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_l, 93 shr_Xsig(&accumulator, 2); /* Divide by four */
97 N_COEFF_P-1); 94 accumulator.msw |= 0x80000000; /* Add 1.0 */
98 95
99 shr_Xsig(&accumulator, 2); /* Divide by four */ 96 mul64_Xsig(&accumulator, &significand(st0_ptr));
100 accumulator.msw |= 0x80000000; /* Add 1.0 */ 97 mul64_Xsig(&accumulator, &significand(st0_ptr));
98 mul64_Xsig(&accumulator, &significand(st0_ptr));
101 99
102 mul64_Xsig(&accumulator, &significand(st0_ptr)); 100 /* Divide by four, FPU_REG compatible, etc */
103 mul64_Xsig(&accumulator, &significand(st0_ptr)); 101 exponent = 3 * exponent;
104 mul64_Xsig(&accumulator, &significand(st0_ptr));
105 102
106 /* Divide by four, FPU_REG compatible, etc */ 103 /* The minimum exponent difference is 3 */
107 exponent = 3*exponent; 104 shr_Xsig(&accumulator, exponent(st0_ptr) - exponent);
108 105
109 /* The minimum exponent difference is 3 */ 106 negate_Xsig(&accumulator);
110 shr_Xsig(&accumulator, exponent(st0_ptr) - exponent); 107 XSIG_LL(accumulator) += significand(st0_ptr);
111 108
112 negate_Xsig(&accumulator); 109 echange = round_Xsig(&accumulator);
113 XSIG_LL(accumulator) += significand(st0_ptr);
114 110
115 echange = round_Xsig(&accumulator); 111 setexponentpos(&result, exponent(st0_ptr) + echange);
112 } else {
113 /* The argument is > 0.88309101259 */
114 /* We use sin(st(0)) = cos(pi/2-st(0)) */
116 115
117 setexponentpos(&result, exponent(st0_ptr) + echange); 116 fixed_arg = significand(st0_ptr);
118 }
119 else
120 {
121 /* The argument is > 0.88309101259 */
122 /* We use sin(st(0)) = cos(pi/2-st(0)) */
123 117
124 fixed_arg = significand(st0_ptr); 118 if (exponent == 0) {
119 /* The argument is >= 1.0 */
125 120
126 if ( exponent == 0 ) 121 /* Put the binary point at the left. */
127 { 122 fixed_arg <<= 1;
128 /* The argument is >= 1.0 */ 123 }
124 /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */
125 fixed_arg = 0x921fb54442d18469LL - fixed_arg;
126 /* There is a special case which arises due to rounding, to fix here. */
127 if (fixed_arg == 0xffffffffffffffffLL)
128 fixed_arg = 0;
129 129
130 /* Put the binary point at the left. */ 130 XSIG_LL(argSqrd) = fixed_arg;
131 fixed_arg <<= 1; 131 argSqrd.lsw = 0;
132 } 132 mul64_Xsig(&argSqrd, &fixed_arg);
133 /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */
134 fixed_arg = 0x921fb54442d18469LL - fixed_arg;
135 /* There is a special case which arises due to rounding, to fix here. */
136 if ( fixed_arg == 0xffffffffffffffffLL )
137 fixed_arg = 0;
138 133
139 XSIG_LL(argSqrd) = fixed_arg; argSqrd.lsw = 0; 134 XSIG_LL(argTo4) = XSIG_LL(argSqrd);
140 mul64_Xsig(&argSqrd, &fixed_arg); 135 argTo4.lsw = argSqrd.lsw;
136 mul_Xsig_Xsig(&argTo4, &argTo4);
141 137
142 XSIG_LL(argTo4) = XSIG_LL(argSqrd); argTo4.lsw = argSqrd.lsw; 138 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_h,
143 mul_Xsig_Xsig(&argTo4, &argTo4); 139 N_COEFF_NH - 1);
140 mul_Xsig_Xsig(&accumulator, &argSqrd);
141 negate_Xsig(&accumulator);
144 142
145 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_h, 143 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_h,
146 N_COEFF_NH-1); 144 N_COEFF_PH - 1);
147 mul_Xsig_Xsig(&accumulator, &argSqrd); 145 negate_Xsig(&accumulator);
148 negate_Xsig(&accumulator);
149 146
150 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_h, 147 mul64_Xsig(&accumulator, &fixed_arg);
151 N_COEFF_PH-1); 148 mul64_Xsig(&accumulator, &fixed_arg);
152 negate_Xsig(&accumulator);
153 149
154 mul64_Xsig(&accumulator, &fixed_arg); 150 shr_Xsig(&accumulator, 3);
155 mul64_Xsig(&accumulator, &fixed_arg); 151 negate_Xsig(&accumulator);
156 152
157 shr_Xsig(&accumulator, 3); 153 add_Xsig_Xsig(&accumulator, &argSqrd);
158 negate_Xsig(&accumulator);
159 154
160 add_Xsig_Xsig(&accumulator, &argSqrd); 155 shr_Xsig(&accumulator, 1);
161 156
162 shr_Xsig(&accumulator, 1); 157 accumulator.lsw |= 1; /* A zero accumulator here would cause problems */
158 negate_Xsig(&accumulator);
163 159
164 accumulator.lsw |= 1; /* A zero accumulator here would cause problems */ 160 /* The basic computation is complete. Now fix the answer to
165 negate_Xsig(&accumulator); 161 compensate for the error due to the approximation used for
162 pi/2
163 */
166 164
167 /* The basic computation is complete. Now fix the answer to 165 /* This has an exponent of -65 */
168 compensate for the error due to the approximation used for 166 fix_up = 0x898cc517;
169 pi/2 167 /* The fix-up needs to be improved for larger args */
170 */ 168 if (argSqrd.msw & 0xffc00000) {
169 /* Get about 32 bit precision in these: */
170 fix_up -= mul_32_32(0x898cc517, argSqrd.msw) / 6;
171 }
172 fix_up = mul_32_32(fix_up, LL_MSW(fixed_arg));
171 173
172 /* This has an exponent of -65 */ 174 adj = accumulator.lsw; /* temp save */
173 fix_up = 0x898cc517; 175 accumulator.lsw -= fix_up;
174 /* The fix-up needs to be improved for larger args */ 176 if (accumulator.lsw > adj)
175 if ( argSqrd.msw & 0xffc00000 ) 177 XSIG_LL(accumulator)--;
176 {
177 /* Get about 32 bit precision in these: */
178 fix_up -= mul_32_32(0x898cc517, argSqrd.msw) / 6;
179 }
180 fix_up = mul_32_32(fix_up, LL_MSW(fixed_arg));
181 178
182 adj = accumulator.lsw; /* temp save */ 179 echange = round_Xsig(&accumulator);
183 accumulator.lsw -= fix_up;
184 if ( accumulator.lsw > adj )
185 XSIG_LL(accumulator) --;
186 180
187 echange = round_Xsig(&accumulator); 181 setexponentpos(&result, echange - 1);
188 182 }
189 setexponentpos(&result, echange - 1);
190 }
191 183
192 significand(&result) = XSIG_LL(accumulator); 184 significand(&result) = XSIG_LL(accumulator);
193 setsign(&result, getsign(st0_ptr)); 185 setsign(&result, getsign(st0_ptr));
194 FPU_copy_to_reg0(&result, TAG_Valid); 186 FPU_copy_to_reg0(&result, TAG_Valid);
195 187
196#ifdef PARANOID 188#ifdef PARANOID
197 if ( (exponent(&result) >= 0) 189 if ((exponent(&result) >= 0)
198 && (significand(&result) > 0x8000000000000000LL) ) 190 && (significand(&result) > 0x8000000000000000LL)) {
199 { 191 EXCEPTION(EX_INTERNAL | 0x150);
200 EXCEPTION(EX_INTERNAL|0x150); 192 }
201 }
202#endif /* PARANOID */ 193#endif /* PARANOID */
203 194
204} 195}
205 196
206
207
208/*--- poly_cos() ------------------------------------------------------------+ 197/*--- poly_cos() ------------------------------------------------------------+
209 | | 198 | |
210 +---------------------------------------------------------------------------*/ 199 +---------------------------------------------------------------------------*/
211void poly_cos(FPU_REG *st0_ptr) 200void poly_cos(FPU_REG *st0_ptr)
212{ 201{
213 FPU_REG result; 202 FPU_REG result;
214 long int exponent, exp2, echange; 203 long int exponent, exp2, echange;
215 Xsig accumulator, argSqrd, fix_up, argTo4; 204 Xsig accumulator, argSqrd, fix_up, argTo4;
216 unsigned long long fixed_arg; 205 unsigned long long fixed_arg;
217 206
218#ifdef PARANOID 207#ifdef PARANOID
219 if ( (exponent(st0_ptr) > 0) 208 if ((exponent(st0_ptr) > 0)
220 || ((exponent(st0_ptr) == 0) 209 || ((exponent(st0_ptr) == 0)
221 && (significand(st0_ptr) > 0xc90fdaa22168c234LL)) ) 210 && (significand(st0_ptr) > 0xc90fdaa22168c234LL))) {
222 { 211 EXCEPTION(EX_Invalid);
223 EXCEPTION(EX_Invalid); 212 FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
224 FPU_copy_to_reg0(&CONST_QNaN, TAG_Special); 213 return;
225 return;
226 }
227#endif /* PARANOID */
228
229 exponent = exponent(st0_ptr);
230
231 accumulator.lsw = accumulator.midw = accumulator.msw = 0;
232
233 if ( (exponent < -1) || ((exponent == -1) && (st0_ptr->sigh <= 0xb00d6f54)) )
234 {
235 /* arg is < 0.687705 */
236
237 argSqrd.msw = st0_ptr->sigh; argSqrd.midw = st0_ptr->sigl;
238 argSqrd.lsw = 0;
239 mul64_Xsig(&argSqrd, &significand(st0_ptr));
240
241 if ( exponent < -1 )
242 {
243 /* shift the argument right by the required places */
244 shr_Xsig(&argSqrd, 2*(-1-exponent));
245 }
246
247 argTo4.msw = argSqrd.msw; argTo4.midw = argSqrd.midw;
248 argTo4.lsw = argSqrd.lsw;
249 mul_Xsig_Xsig(&argTo4, &argTo4);
250
251 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_h,
252 N_COEFF_NH-1);
253 mul_Xsig_Xsig(&accumulator, &argSqrd);
254 negate_Xsig(&accumulator);
255
256 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_h,
257 N_COEFF_PH-1);
258 negate_Xsig(&accumulator);
259
260 mul64_Xsig(&accumulator, &significand(st0_ptr));
261 mul64_Xsig(&accumulator, &significand(st0_ptr));
262 shr_Xsig(&accumulator, -2*(1+exponent));
263
264 shr_Xsig(&accumulator, 3);
265 negate_Xsig(&accumulator);
266
267 add_Xsig_Xsig(&accumulator, &argSqrd);
268
269 shr_Xsig(&accumulator, 1);
270
271 /* It doesn't matter if accumulator is all zero here, the
272 following code will work ok */
273 negate_Xsig(&accumulator);
274
275 if ( accumulator.lsw & 0x80000000 )
276 XSIG_LL(accumulator) ++;
277 if ( accumulator.msw == 0 )
278 {
279 /* The result is 1.0 */
280 FPU_copy_to_reg0(&CONST_1, TAG_Valid);
281 return;
282 }
283 else
284 {
285 significand(&result) = XSIG_LL(accumulator);
286
287 /* will be a valid positive nr with expon = -1 */
288 setexponentpos(&result, -1);
289 }
290 }
291 else
292 {
293 fixed_arg = significand(st0_ptr);
294
295 if ( exponent == 0 )
296 {
297 /* The argument is >= 1.0 */
298
299 /* Put the binary point at the left. */
300 fixed_arg <<= 1;
301 }
302 /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */
303 fixed_arg = 0x921fb54442d18469LL - fixed_arg;
304 /* There is a special case which arises due to rounding, to fix here. */
305 if ( fixed_arg == 0xffffffffffffffffLL )
306 fixed_arg = 0;
307
308 exponent = -1;
309 exp2 = -1;
310
311 /* A shift is needed here only for a narrow range of arguments,
312 i.e. for fixed_arg approx 2^-32, but we pick up more... */
313 if ( !(LL_MSW(fixed_arg) & 0xffff0000) )
314 {
315 fixed_arg <<= 16;
316 exponent -= 16;
317 exp2 -= 16;
318 } 214 }
215#endif /* PARANOID */
319 216
320 XSIG_LL(argSqrd) = fixed_arg; argSqrd.lsw = 0; 217 exponent = exponent(st0_ptr);
321 mul64_Xsig(&argSqrd, &fixed_arg); 218
322 219 accumulator.lsw = accumulator.midw = accumulator.msw = 0;
323 if ( exponent < -1 ) 220
324 { 221 if ((exponent < -1)
325 /* shift the argument right by the required places */ 222 || ((exponent == -1) && (st0_ptr->sigh <= 0xb00d6f54))) {
326 shr_Xsig(&argSqrd, 2*(-1-exponent)); 223 /* arg is < 0.687705 */
327 } 224
328 225 argSqrd.msw = st0_ptr->sigh;
329 argTo4.msw = argSqrd.msw; argTo4.midw = argSqrd.midw; 226 argSqrd.midw = st0_ptr->sigl;
330 argTo4.lsw = argSqrd.lsw; 227 argSqrd.lsw = 0;
331 mul_Xsig_Xsig(&argTo4, &argTo4); 228 mul64_Xsig(&argSqrd, &significand(st0_ptr));
332 229
333 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_l, 230 if (exponent < -1) {
334 N_COEFF_N-1); 231 /* shift the argument right by the required places */
335 mul_Xsig_Xsig(&accumulator, &argSqrd); 232 shr_Xsig(&argSqrd, 2 * (-1 - exponent));
336 negate_Xsig(&accumulator); 233 }
337 234
338 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_l, 235 argTo4.msw = argSqrd.msw;
339 N_COEFF_P-1); 236 argTo4.midw = argSqrd.midw;
340 237 argTo4.lsw = argSqrd.lsw;
341 shr_Xsig(&accumulator, 2); /* Divide by four */ 238 mul_Xsig_Xsig(&argTo4, &argTo4);
342 accumulator.msw |= 0x80000000; /* Add 1.0 */ 239
343 240 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_h,
344 mul64_Xsig(&accumulator, &fixed_arg); 241 N_COEFF_NH - 1);
345 mul64_Xsig(&accumulator, &fixed_arg); 242 mul_Xsig_Xsig(&accumulator, &argSqrd);
346 mul64_Xsig(&accumulator, &fixed_arg); 243 negate_Xsig(&accumulator);
347 244
348 /* Divide by four, FPU_REG compatible, etc */ 245 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_h,
349 exponent = 3*exponent; 246 N_COEFF_PH - 1);
350 247 negate_Xsig(&accumulator);
351 /* The minimum exponent difference is 3 */ 248
352 shr_Xsig(&accumulator, exp2 - exponent); 249 mul64_Xsig(&accumulator, &significand(st0_ptr));
353 250 mul64_Xsig(&accumulator, &significand(st0_ptr));
354 negate_Xsig(&accumulator); 251 shr_Xsig(&accumulator, -2 * (1 + exponent));
355 XSIG_LL(accumulator) += fixed_arg; 252
356 253 shr_Xsig(&accumulator, 3);
357 /* The basic computation is complete. Now fix the answer to 254 negate_Xsig(&accumulator);
358 compensate for the error due to the approximation used for 255
359 pi/2 256 add_Xsig_Xsig(&accumulator, &argSqrd);
360 */ 257
361 258 shr_Xsig(&accumulator, 1);
362 /* This has an exponent of -65 */ 259
363 XSIG_LL(fix_up) = 0x898cc51701b839a2ll; 260 /* It doesn't matter if accumulator is all zero here, the
364 fix_up.lsw = 0; 261 following code will work ok */
365 262 negate_Xsig(&accumulator);
366 /* The fix-up needs to be improved for larger args */ 263
367 if ( argSqrd.msw & 0xffc00000 ) 264 if (accumulator.lsw & 0x80000000)
368 { 265 XSIG_LL(accumulator)++;
369 /* Get about 32 bit precision in these: */ 266 if (accumulator.msw == 0) {
370 fix_up.msw -= mul_32_32(0x898cc517, argSqrd.msw) / 2; 267 /* The result is 1.0 */
371 fix_up.msw += mul_32_32(0x898cc517, argTo4.msw) / 24; 268 FPU_copy_to_reg0(&CONST_1, TAG_Valid);
269 return;
270 } else {
271 significand(&result) = XSIG_LL(accumulator);
272
273 /* will be a valid positive nr with expon = -1 */
274 setexponentpos(&result, -1);
275 }
276 } else {
277 fixed_arg = significand(st0_ptr);
278
279 if (exponent == 0) {
280 /* The argument is >= 1.0 */
281
282 /* Put the binary point at the left. */
283 fixed_arg <<= 1;
284 }
285 /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */
286 fixed_arg = 0x921fb54442d18469LL - fixed_arg;
287 /* There is a special case which arises due to rounding, to fix here. */
288 if (fixed_arg == 0xffffffffffffffffLL)
289 fixed_arg = 0;
290
291 exponent = -1;
292 exp2 = -1;
293
294 /* A shift is needed here only for a narrow range of arguments,
295 i.e. for fixed_arg approx 2^-32, but we pick up more... */
296 if (!(LL_MSW(fixed_arg) & 0xffff0000)) {
297 fixed_arg <<= 16;
298 exponent -= 16;
299 exp2 -= 16;
300 }
301
302 XSIG_LL(argSqrd) = fixed_arg;
303 argSqrd.lsw = 0;
304 mul64_Xsig(&argSqrd, &fixed_arg);
305
306 if (exponent < -1) {
307 /* shift the argument right by the required places */
308 shr_Xsig(&argSqrd, 2 * (-1 - exponent));
309 }
310
311 argTo4.msw = argSqrd.msw;
312 argTo4.midw = argSqrd.midw;
313 argTo4.lsw = argSqrd.lsw;
314 mul_Xsig_Xsig(&argTo4, &argTo4);
315
316 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_l,
317 N_COEFF_N - 1);
318 mul_Xsig_Xsig(&accumulator, &argSqrd);
319 negate_Xsig(&accumulator);
320
321 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_l,
322 N_COEFF_P - 1);
323
324 shr_Xsig(&accumulator, 2); /* Divide by four */
325 accumulator.msw |= 0x80000000; /* Add 1.0 */
326
327 mul64_Xsig(&accumulator, &fixed_arg);
328 mul64_Xsig(&accumulator, &fixed_arg);
329 mul64_Xsig(&accumulator, &fixed_arg);
330
331 /* Divide by four, FPU_REG compatible, etc */
332 exponent = 3 * exponent;
333
334 /* The minimum exponent difference is 3 */
335 shr_Xsig(&accumulator, exp2 - exponent);
336
337 negate_Xsig(&accumulator);
338 XSIG_LL(accumulator) += fixed_arg;
339
340 /* The basic computation is complete. Now fix the answer to
341 compensate for the error due to the approximation used for
342 pi/2
343 */
344
345 /* This has an exponent of -65 */
346 XSIG_LL(fix_up) = 0x898cc51701b839a2ll;
347 fix_up.lsw = 0;
348
349 /* The fix-up needs to be improved for larger args */
350 if (argSqrd.msw & 0xffc00000) {
351 /* Get about 32 bit precision in these: */
352 fix_up.msw -= mul_32_32(0x898cc517, argSqrd.msw) / 2;
353 fix_up.msw += mul_32_32(0x898cc517, argTo4.msw) / 24;
354 }
355
356 exp2 += norm_Xsig(&accumulator);
357 shr_Xsig(&accumulator, 1); /* Prevent overflow */
358 exp2++;
359 shr_Xsig(&fix_up, 65 + exp2);
360
361 add_Xsig_Xsig(&accumulator, &fix_up);
362
363 echange = round_Xsig(&accumulator);
364
365 setexponentpos(&result, exp2 + echange);
366 significand(&result) = XSIG_LL(accumulator);
372 } 367 }
373 368
374 exp2 += norm_Xsig(&accumulator); 369 FPU_copy_to_reg0(&result, TAG_Valid);
375 shr_Xsig(&accumulator, 1); /* Prevent overflow */
376 exp2++;
377 shr_Xsig(&fix_up, 65 + exp2);
378
379 add_Xsig_Xsig(&accumulator, &fix_up);
380
381 echange = round_Xsig(&accumulator);
382
383 setexponentpos(&result, exp2 + echange);
384 significand(&result) = XSIG_LL(accumulator);
385 }
386
387 FPU_copy_to_reg0(&result, TAG_Valid);
388 370
389#ifdef PARANOID 371#ifdef PARANOID
390 if ( (exponent(&result) >= 0) 372 if ((exponent(&result) >= 0)
391 && (significand(&result) > 0x8000000000000000LL) ) 373 && (significand(&result) > 0x8000000000000000LL)) {
392 { 374 EXCEPTION(EX_INTERNAL | 0x151);
393 EXCEPTION(EX_INTERNAL|0x151); 375 }
394 }
395#endif /* PARANOID */ 376#endif /* PARANOID */
396 377
397} 378}
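The poly_sin.c hunks above split the argument at roughly 0.88309101259: below it a direct series in the argument is summed (the pos_terms_l/neg_terms_l tables keep positive and negative coefficients separate so the unsigned accumulator never goes negative), above it the code uses sin(st(0)) = cos(pi/2 - st(0)) and adds back the tail of pi/2 (the 0x898cc517... fix-up with exponent -65) that does not fit in the 64-bit fixed_arg. A plain-double sketch of the same split, with a hypothetical helper name and assuming the argument is already reduced to [0, pi/2]:

/* Sketch only, not part of the patch. */
static double sin_sketch(double x)
{
	if (x <= 0.88309101259) {
		double x2 = x * x;
		/* x - x^3/3! + x^5/5! - x^7/7! */
		return x * (1.0 - x2 / 6.0 * (1.0 - x2 / 20.0 * (1.0 - x2 / 42.0)));
	} else {
		double y = 1.5707963267948966 - x;	/* pi/2 - x */
		double y2 = y * y;
		/* cos(y) = 1 - y^2/2! + y^4/4! - y^6/6! */
		return 1.0 - y2 / 2.0 * (1.0 - y2 / 12.0 * (1.0 - y2 / 30.0));
	}
}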
diff --git a/arch/x86/math-emu/poly_tan.c b/arch/x86/math-emu/poly_tan.c
index 8df3e03b6e6f..1875763e0c02 100644
--- a/arch/x86/math-emu/poly_tan.c
+++ b/arch/x86/math-emu/poly_tan.c
@@ -17,206 +17,196 @@
17#include "control_w.h" 17#include "control_w.h"
18#include "poly.h" 18#include "poly.h"
19 19
20
21#define HiPOWERop 3 /* odd poly, positive terms */ 20#define HiPOWERop 3 /* odd poly, positive terms */
22static const unsigned long long oddplterm[HiPOWERop] = 21static const unsigned long long oddplterm[HiPOWERop] = {
23{ 22 0x0000000000000000LL,
24 0x0000000000000000LL, 23 0x0051a1cf08fca228LL,
25 0x0051a1cf08fca228LL, 24 0x0000000071284ff7LL
26 0x0000000071284ff7LL
27}; 25};
28 26
29#define HiPOWERon 2 /* odd poly, negative terms */ 27#define HiPOWERon 2 /* odd poly, negative terms */
30static const unsigned long long oddnegterm[HiPOWERon] = 28static const unsigned long long oddnegterm[HiPOWERon] = {
31{ 29 0x1291a9a184244e80LL,
32 0x1291a9a184244e80LL, 30 0x0000583245819c21LL
33 0x0000583245819c21LL
34}; 31};
35 32
36#define HiPOWERep 2 /* even poly, positive terms */ 33#define HiPOWERep 2 /* even poly, positive terms */
37static const unsigned long long evenplterm[HiPOWERep] = 34static const unsigned long long evenplterm[HiPOWERep] = {
38{ 35 0x0e848884b539e888LL,
39 0x0e848884b539e888LL, 36 0x00003c7f18b887daLL
40 0x00003c7f18b887daLL
41}; 37};
42 38
43#define HiPOWERen 2 /* even poly, negative terms */ 39#define HiPOWERen 2 /* even poly, negative terms */
44static const unsigned long long evennegterm[HiPOWERen] = 40static const unsigned long long evennegterm[HiPOWERen] = {
45{ 41 0xf1f0200fd51569ccLL,
46 0xf1f0200fd51569ccLL, 42 0x003afb46105c4432LL
47 0x003afb46105c4432LL
48}; 43};
49 44
50static const unsigned long long twothirds = 0xaaaaaaaaaaaaaaabLL; 45static const unsigned long long twothirds = 0xaaaaaaaaaaaaaaabLL;
51 46
52
53/*--- poly_tan() ------------------------------------------------------------+ 47/*--- poly_tan() ------------------------------------------------------------+
54 | | 48 | |
55 +---------------------------------------------------------------------------*/ 49 +---------------------------------------------------------------------------*/
56void poly_tan(FPU_REG *st0_ptr) 50void poly_tan(FPU_REG *st0_ptr)
57{ 51{
58 long int exponent; 52 long int exponent;
59 int invert; 53 int invert;
60 Xsig argSq, argSqSq, accumulatoro, accumulatore, accum, 54 Xsig argSq, argSqSq, accumulatoro, accumulatore, accum,
61 argSignif, fix_up; 55 argSignif, fix_up;
62 unsigned long adj; 56 unsigned long adj;
63 57
64 exponent = exponent(st0_ptr); 58 exponent = exponent(st0_ptr);
65 59
66#ifdef PARANOID 60#ifdef PARANOID
67 if ( signnegative(st0_ptr) ) /* Can't hack a number < 0.0 */ 61 if (signnegative(st0_ptr)) { /* Can't hack a number < 0.0 */
68 { arith_invalid(0); return; } /* Need a positive number */ 62 arith_invalid(0);
63 return;
64 } /* Need a positive number */
69#endif /* PARANOID */ 65#endif /* PARANOID */
70 66
71 /* Split the problem into two domains, smaller and larger than pi/4 */ 67 /* Split the problem into two domains, smaller and larger than pi/4 */
72 if ( (exponent == 0) || ((exponent == -1) && (st0_ptr->sigh > 0xc90fdaa2)) ) 68 if ((exponent == 0)
73 { 69 || ((exponent == -1) && (st0_ptr->sigh > 0xc90fdaa2))) {
74 /* The argument is greater than (approx) pi/4 */ 70 /* The argument is greater than (approx) pi/4 */
75 invert = 1; 71 invert = 1;
76 accum.lsw = 0; 72 accum.lsw = 0;
77 XSIG_LL(accum) = significand(st0_ptr); 73 XSIG_LL(accum) = significand(st0_ptr);
78 74
79 if ( exponent == 0 ) 75 if (exponent == 0) {
80 { 76 /* The argument is >= 1.0 */
81 /* The argument is >= 1.0 */ 77 /* Put the binary point at the left. */
82 /* Put the binary point at the left. */ 78 XSIG_LL(accum) <<= 1;
83 XSIG_LL(accum) <<= 1; 79 }
84 } 80 /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */
85 /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */ 81 XSIG_LL(accum) = 0x921fb54442d18469LL - XSIG_LL(accum);
86 XSIG_LL(accum) = 0x921fb54442d18469LL - XSIG_LL(accum); 82 /* This is a special case which arises due to rounding. */
87 /* This is a special case which arises due to rounding. */ 83 if (XSIG_LL(accum) == 0xffffffffffffffffLL) {
88 if ( XSIG_LL(accum) == 0xffffffffffffffffLL ) 84 FPU_settag0(TAG_Valid);
89 { 85 significand(st0_ptr) = 0x8a51e04daabda360LL;
90 FPU_settag0(TAG_Valid); 86 setexponent16(st0_ptr,
91 significand(st0_ptr) = 0x8a51e04daabda360LL; 87 (0x41 + EXTENDED_Ebias) | SIGN_Negative);
92 setexponent16(st0_ptr, (0x41 + EXTENDED_Ebias) | SIGN_Negative); 88 return;
93 return; 89 }
90
91 argSignif.lsw = accum.lsw;
92 XSIG_LL(argSignif) = XSIG_LL(accum);
93 exponent = -1 + norm_Xsig(&argSignif);
94 } else {
95 invert = 0;
96 argSignif.lsw = 0;
97 XSIG_LL(accum) = XSIG_LL(argSignif) = significand(st0_ptr);
98
99 if (exponent < -1) {
100 /* shift the argument right by the required places */
101 if (FPU_shrx(&XSIG_LL(accum), -1 - exponent) >=
102 0x80000000U)
103 XSIG_LL(accum)++; /* round up */
104 }
94 } 105 }
95 106
96 argSignif.lsw = accum.lsw; 107 XSIG_LL(argSq) = XSIG_LL(accum);
97 XSIG_LL(argSignif) = XSIG_LL(accum); 108 argSq.lsw = accum.lsw;
98 exponent = -1 + norm_Xsig(&argSignif); 109 mul_Xsig_Xsig(&argSq, &argSq);
99 } 110 XSIG_LL(argSqSq) = XSIG_LL(argSq);
100 else 111 argSqSq.lsw = argSq.lsw;
101 { 112 mul_Xsig_Xsig(&argSqSq, &argSqSq);
102 invert = 0; 113
103 argSignif.lsw = 0; 114 /* Compute the negative terms for the numerator polynomial */
104 XSIG_LL(accum) = XSIG_LL(argSignif) = significand(st0_ptr); 115 accumulatoro.msw = accumulatoro.midw = accumulatoro.lsw = 0;
105 116 polynomial_Xsig(&accumulatoro, &XSIG_LL(argSqSq), oddnegterm,
106 if ( exponent < -1 ) 117 HiPOWERon - 1);
107 { 118 mul_Xsig_Xsig(&accumulatoro, &argSq);
108 /* shift the argument right by the required places */ 119 negate_Xsig(&accumulatoro);
109 if ( FPU_shrx(&XSIG_LL(accum), -1-exponent) >= 0x80000000U ) 120 /* Add the positive terms */
110 XSIG_LL(accum) ++; /* round up */ 121 polynomial_Xsig(&accumulatoro, &XSIG_LL(argSqSq), oddplterm,
111 } 122 HiPOWERop - 1);
112 } 123
113 124 /* Compute the positive terms for the denominator polynomial */
114 XSIG_LL(argSq) = XSIG_LL(accum); argSq.lsw = accum.lsw; 125 accumulatore.msw = accumulatore.midw = accumulatore.lsw = 0;
115 mul_Xsig_Xsig(&argSq, &argSq); 126 polynomial_Xsig(&accumulatore, &XSIG_LL(argSqSq), evenplterm,
116 XSIG_LL(argSqSq) = XSIG_LL(argSq); argSqSq.lsw = argSq.lsw; 127 HiPOWERep - 1);
117 mul_Xsig_Xsig(&argSqSq, &argSqSq); 128 mul_Xsig_Xsig(&accumulatore, &argSq);
118 129 negate_Xsig(&accumulatore);
119 /* Compute the negative terms for the numerator polynomial */ 130 /* Add the negative terms */
120 accumulatoro.msw = accumulatoro.midw = accumulatoro.lsw = 0; 131 polynomial_Xsig(&accumulatore, &XSIG_LL(argSqSq), evennegterm,
121 polynomial_Xsig(&accumulatoro, &XSIG_LL(argSqSq), oddnegterm, HiPOWERon-1); 132 HiPOWERen - 1);
122 mul_Xsig_Xsig(&accumulatoro, &argSq); 133 /* Multiply by arg^2 */
123 negate_Xsig(&accumulatoro); 134 mul64_Xsig(&accumulatore, &XSIG_LL(argSignif));
124 /* Add the positive terms */ 135 mul64_Xsig(&accumulatore, &XSIG_LL(argSignif));
125 polynomial_Xsig(&accumulatoro, &XSIG_LL(argSqSq), oddplterm, HiPOWERop-1); 136 /* de-normalize and divide by 2 */
126 137 shr_Xsig(&accumulatore, -2 * (1 + exponent) + 1);
127 138 negate_Xsig(&accumulatore); /* This does 1 - accumulator */
128 /* Compute the positive terms for the denominator polynomial */ 139
129 accumulatore.msw = accumulatore.midw = accumulatore.lsw = 0; 140 /* Now find the ratio. */
130 polynomial_Xsig(&accumulatore, &XSIG_LL(argSqSq), evenplterm, HiPOWERep-1); 141 if (accumulatore.msw == 0) {
131 mul_Xsig_Xsig(&accumulatore, &argSq); 142 /* accumulatoro must contain 1.0 here, (actually, 0) but it
132 negate_Xsig(&accumulatore); 143 really doesn't matter what value we use because it will
133 /* Add the negative terms */ 144 have negligible effect in later calculations
134 polynomial_Xsig(&accumulatore, &XSIG_LL(argSqSq), evennegterm, HiPOWERen-1); 145 */
135 /* Multiply by arg^2 */ 146 XSIG_LL(accum) = 0x8000000000000000LL;
136 mul64_Xsig(&accumulatore, &XSIG_LL(argSignif)); 147 accum.lsw = 0;
137 mul64_Xsig(&accumulatore, &XSIG_LL(argSignif)); 148 } else {
138 /* de-normalize and divide by 2 */ 149 div_Xsig(&accumulatoro, &accumulatore, &accum);
139 shr_Xsig(&accumulatore, -2*(1+exponent) + 1);
140 negate_Xsig(&accumulatore); /* This does 1 - accumulator */
141
142 /* Now find the ratio. */
143 if ( accumulatore.msw == 0 )
144 {
145 /* accumulatoro must contain 1.0 here, (actually, 0) but it
146 really doesn't matter what value we use because it will
147 have negligible effect in later calculations
148 */
149 XSIG_LL(accum) = 0x8000000000000000LL;
150 accum.lsw = 0;
151 }
152 else
153 {
154 div_Xsig(&accumulatoro, &accumulatore, &accum);
155 }
156
157 /* Multiply by 1/3 * arg^3 */
158 mul64_Xsig(&accum, &XSIG_LL(argSignif));
159 mul64_Xsig(&accum, &XSIG_LL(argSignif));
160 mul64_Xsig(&accum, &XSIG_LL(argSignif));
161 mul64_Xsig(&accum, &twothirds);
162 shr_Xsig(&accum, -2*(exponent+1));
163
164 /* tan(arg) = arg + accum */
165 add_two_Xsig(&accum, &argSignif, &exponent);
166
167 if ( invert )
168 {
169 /* We now have the value of tan(pi_2 - arg) where pi_2 is an
170 approximation for pi/2
171 */
172 /* The next step is to fix the answer to compensate for the
173 error due to the approximation used for pi/2
174 */
175
176 /* This is (approx) delta, the error in our approx for pi/2
177 (see above). It has an exponent of -65
178 */
179 XSIG_LL(fix_up) = 0x898cc51701b839a2LL;
180 fix_up.lsw = 0;
181
182 if ( exponent == 0 )
183 adj = 0xffffffff; /* We want approx 1.0 here, but
184 this is close enough. */
185 else if ( exponent > -30 )
186 {
187 adj = accum.msw >> -(exponent+1); /* tan */
188 adj = mul_32_32(adj, adj); /* tan^2 */
189 } 150 }
190 else 151
191 adj = 0; 152 /* Multiply by 1/3 * arg^3 */
192 adj = mul_32_32(0x898cc517, adj); /* delta * tan^2 */ 153 mul64_Xsig(&accum, &XSIG_LL(argSignif));
193 154 mul64_Xsig(&accum, &XSIG_LL(argSignif));
194 fix_up.msw += adj; 155 mul64_Xsig(&accum, &XSIG_LL(argSignif));
195 if ( !(fix_up.msw & 0x80000000) ) /* did fix_up overflow ? */ 156 mul64_Xsig(&accum, &twothirds);
196 { 157 shr_Xsig(&accum, -2 * (exponent + 1));
197 /* Yes, we need to add an msb */ 158
198 shr_Xsig(&fix_up, 1); 159 /* tan(arg) = arg + accum */
199 fix_up.msw |= 0x80000000; 160 add_two_Xsig(&accum, &argSignif, &exponent);
200 shr_Xsig(&fix_up, 64 + exponent); 161
162 if (invert) {
163 /* We now have the value of tan(pi_2 - arg) where pi_2 is an
164 approximation for pi/2
165 */
166 /* The next step is to fix the answer to compensate for the
167 error due to the approximation used for pi/2
168 */
169
170 /* This is (approx) delta, the error in our approx for pi/2
171 (see above). It has an exponent of -65
172 */
173 XSIG_LL(fix_up) = 0x898cc51701b839a2LL;
174 fix_up.lsw = 0;
175
176 if (exponent == 0)
177 adj = 0xffffffff; /* We want approx 1.0 here, but
178 this is close enough. */
179 else if (exponent > -30) {
180 adj = accum.msw >> -(exponent + 1); /* tan */
181 adj = mul_32_32(adj, adj); /* tan^2 */
182 } else
183 adj = 0;
184 adj = mul_32_32(0x898cc517, adj); /* delta * tan^2 */
185
186 fix_up.msw += adj;
187 if (!(fix_up.msw & 0x80000000)) { /* did fix_up overflow ? */
188 /* Yes, we need to add an msb */
189 shr_Xsig(&fix_up, 1);
190 fix_up.msw |= 0x80000000;
191 shr_Xsig(&fix_up, 64 + exponent);
192 } else
193 shr_Xsig(&fix_up, 65 + exponent);
194
195 add_two_Xsig(&accum, &fix_up, &exponent);
196
197 /* accum now contains tan(pi/2 - arg).
198 Use tan(arg) = 1.0 / tan(pi/2 - arg)
199 */
200 accumulatoro.lsw = accumulatoro.midw = 0;
201 accumulatoro.msw = 0x80000000;
202 div_Xsig(&accumulatoro, &accum, &accum);
203 exponent = -exponent - 1;
201 } 204 }
202 else 205
203 shr_Xsig(&fix_up, 65 + exponent); 206 /* Transfer the result */
204 207 round_Xsig(&accum);
205 add_two_Xsig(&accum, &fix_up, &exponent); 208 FPU_settag0(TAG_Valid);
206 209 significand(st0_ptr) = XSIG_LL(accum);
207 /* accum now contains tan(pi/2 - arg). 210 setexponent16(st0_ptr, exponent + EXTENDED_Ebias); /* Result is positive. */
208 Use tan(arg) = 1.0 / tan(pi/2 - arg)
209 */
210 accumulatoro.lsw = accumulatoro.midw = 0;
211 accumulatoro.msw = 0x80000000;
212 div_Xsig(&accumulatoro, &accum, &accum);
213 exponent = - exponent - 1;
214 }
215
216 /* Transfer the result */
217 round_Xsig(&accum);
218 FPU_settag0(TAG_Valid);
219 significand(st0_ptr) = XSIG_LL(accum);
220 setexponent16(st0_ptr, exponent + EXTENDED_Ebias); /* Result is positive. */
221 211
222} 212}
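The poly_tan.c hunks above handle arguments above (approximately) pi/4 by computing tan(pi/2 - arg) and taking the reciprocal, so the odd/even polynomial ratio only has to cover [0, pi/4], where the result is arg plus a small arg^3-scaled correction ("tan(arg) = arg + accum"). A plain-double sketch of that structure, with a hypothetical helper name and assuming the argument is already reduced to [0, pi/2):

/* Sketch only, not part of the patch. */
static double tan_sketch(double x)
{
	const double pi_2 = 1.5707963267948966;
	double x2;

	if (x > pi_2 / 2.0)				/* above (approx) pi/4 */
		return 1.0 / tan_sketch(pi_2 - x);	/* tan(x) = 1/tan(pi/2 - x) */

	/* small-argument series: tan(x) = x + x^3/3 + 2*x^5/15 + ... */
	x2 = x * x;
	return x * (1.0 + x2 * (1.0 / 3 + x2 * (2.0 / 15)));
}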
diff --git a/arch/x86/math-emu/reg_add_sub.c b/arch/x86/math-emu/reg_add_sub.c
index 7cd3b37ac084..deea48b9f13a 100644
--- a/arch/x86/math-emu/reg_add_sub.c
+++ b/arch/x86/math-emu/reg_add_sub.c
@@ -27,7 +27,7 @@
27static 27static
28int add_sub_specials(FPU_REG const *a, u_char taga, u_char signa, 28int add_sub_specials(FPU_REG const *a, u_char taga, u_char signa,
29 FPU_REG const *b, u_char tagb, u_char signb, 29 FPU_REG const *b, u_char tagb, u_char signb,
30 FPU_REG *dest, int deststnr, int control_w); 30 FPU_REG * dest, int deststnr, int control_w);
31 31
32/* 32/*
33 Operates on st(0) and st(n), or on st(0) and temporary data. 33 Operates on st(0) and st(n), or on st(0) and temporary data.
@@ -35,340 +35,299 @@ int add_sub_specials(FPU_REG const *a, u_char taga, u_char signa,
35 */ 35 */
36int FPU_add(FPU_REG const *b, u_char tagb, int deststnr, int control_w) 36int FPU_add(FPU_REG const *b, u_char tagb, int deststnr, int control_w)
37{ 37{
38 FPU_REG *a = &st(0); 38 FPU_REG *a = &st(0);
39 FPU_REG *dest = &st(deststnr); 39 FPU_REG *dest = &st(deststnr);
40 u_char signb = getsign(b); 40 u_char signb = getsign(b);
41 u_char taga = FPU_gettag0(); 41 u_char taga = FPU_gettag0();
42 u_char signa = getsign(a); 42 u_char signa = getsign(a);
43 u_char saved_sign = getsign(dest); 43 u_char saved_sign = getsign(dest);
44 int diff, tag, expa, expb; 44 int diff, tag, expa, expb;
45 45
46 if ( !(taga | tagb) ) 46 if (!(taga | tagb)) {
47 { 47 expa = exponent(a);
48 expa = exponent(a); 48 expb = exponent(b);
49 expb = exponent(b); 49
50 50 valid_add:
51 valid_add: 51 /* Both registers are valid */
52 /* Both registers are valid */ 52 if (!(signa ^ signb)) {
53 if (!(signa ^ signb)) 53 /* signs are the same */
54 { 54 tag =
55 /* signs are the same */ 55 FPU_u_add(a, b, dest, control_w, signa, expa, expb);
56 tag = FPU_u_add(a, b, dest, control_w, signa, expa, expb); 56 } else {
57 } 57 /* The signs are different, so do a subtraction */
58 else 58 diff = expa - expb;
59 { 59 if (!diff) {
60 /* The signs are different, so do a subtraction */ 60 diff = a->sigh - b->sigh; /* This works only if the ms bits
61 diff = expa - expb; 61 are identical. */
62 if (!diff) 62 if (!diff) {
63 { 63 diff = a->sigl > b->sigl;
64 diff = a->sigh - b->sigh; /* This works only if the ms bits 64 if (!diff)
65 are identical. */ 65 diff = -(a->sigl < b->sigl);
66 if (!diff) 66 }
67 { 67 }
68 diff = a->sigl > b->sigl; 68
69 if (!diff) 69 if (diff > 0) {
70 diff = -(a->sigl < b->sigl); 70 tag =
71 FPU_u_sub(a, b, dest, control_w, signa,
72 expa, expb);
73 } else if (diff < 0) {
74 tag =
75 FPU_u_sub(b, a, dest, control_w, signb,
76 expb, expa);
77 } else {
78 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
79 /* sign depends upon rounding mode */
80 setsign(dest, ((control_w & CW_RC) != RC_DOWN)
81 ? SIGN_POS : SIGN_NEG);
82 return TAG_Zero;
83 }
71 } 84 }
72 }
73
74 if (diff > 0)
75 {
76 tag = FPU_u_sub(a, b, dest, control_w, signa, expa, expb);
77 }
78 else if ( diff < 0 )
79 {
80 tag = FPU_u_sub(b, a, dest, control_w, signb, expb, expa);
81 }
82 else
83 {
84 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
85 /* sign depends upon rounding mode */
86 setsign(dest, ((control_w & CW_RC) != RC_DOWN)
87 ? SIGN_POS : SIGN_NEG);
88 return TAG_Zero;
89 }
90 }
91 85
92 if ( tag < 0 ) 86 if (tag < 0) {
93 { 87 setsign(dest, saved_sign);
94 setsign(dest, saved_sign); 88 return tag;
95 return tag; 89 }
90 FPU_settagi(deststnr, tag);
91 return tag;
96 } 92 }
97 FPU_settagi(deststnr, tag);
98 return tag;
99 }
100 93
101 if ( taga == TAG_Special ) 94 if (taga == TAG_Special)
102 taga = FPU_Special(a); 95 taga = FPU_Special(a);
103 if ( tagb == TAG_Special ) 96 if (tagb == TAG_Special)
104 tagb = FPU_Special(b); 97 tagb = FPU_Special(b);
105 98
106 if ( ((taga == TAG_Valid) && (tagb == TW_Denormal)) 99 if (((taga == TAG_Valid) && (tagb == TW_Denormal))
107 || ((taga == TW_Denormal) && (tagb == TAG_Valid)) 100 || ((taga == TW_Denormal) && (tagb == TAG_Valid))
108 || ((taga == TW_Denormal) && (tagb == TW_Denormal)) ) 101 || ((taga == TW_Denormal) && (tagb == TW_Denormal))) {
109 { 102 FPU_REG x, y;
110 FPU_REG x, y; 103
104 if (denormal_operand() < 0)
105 return FPU_Exception;
106
107 FPU_to_exp16(a, &x);
108 FPU_to_exp16(b, &y);
109 a = &x;
110 b = &y;
111 expa = exponent16(a);
112 expb = exponent16(b);
113 goto valid_add;
114 }
111 115
112 if ( denormal_operand() < 0 ) 116 if ((taga == TW_NaN) || (tagb == TW_NaN)) {
113 return FPU_Exception; 117 if (deststnr == 0)
118 return real_2op_NaN(b, tagb, deststnr, a);
119 else
120 return real_2op_NaN(a, taga, deststnr, a);
121 }
114 122
115 FPU_to_exp16(a, &x); 123 return add_sub_specials(a, taga, signa, b, tagb, signb,
116 FPU_to_exp16(b, &y); 124 dest, deststnr, control_w);
117 a = &x;
118 b = &y;
119 expa = exponent16(a);
120 expb = exponent16(b);
121 goto valid_add;
122 }
123
124 if ( (taga == TW_NaN) || (tagb == TW_NaN) )
125 {
126 if ( deststnr == 0 )
127 return real_2op_NaN(b, tagb, deststnr, a);
128 else
129 return real_2op_NaN(a, taga, deststnr, a);
130 }
131
132 return add_sub_specials(a, taga, signa, b, tagb, signb,
133 dest, deststnr, control_w);
134} 125}
135 126
136
137/* Subtract b from a. (a-b) -> dest */ 127/* Subtract b from a. (a-b) -> dest */
138int FPU_sub(int flags, int rm, int control_w) 128int FPU_sub(int flags, int rm, int control_w)
139{ 129{
140 FPU_REG const *a, *b; 130 FPU_REG const *a, *b;
141 FPU_REG *dest; 131 FPU_REG *dest;
142 u_char taga, tagb, signa, signb, saved_sign, sign; 132 u_char taga, tagb, signa, signb, saved_sign, sign;
143 int diff, tag = 0, expa, expb, deststnr; 133 int diff, tag = 0, expa, expb, deststnr;
144 134
145 a = &st(0); 135 a = &st(0);
146 taga = FPU_gettag0(); 136 taga = FPU_gettag0();
147 137
148 deststnr = 0; 138 deststnr = 0;
149 if ( flags & LOADED ) 139 if (flags & LOADED) {
150 { 140 b = (FPU_REG *) rm;
151 b = (FPU_REG *)rm; 141 tagb = flags & 0x0f;
152 tagb = flags & 0x0f; 142 } else {
153 } 143 b = &st(rm);
154 else 144 tagb = FPU_gettagi(rm);
155 { 145
156 b = &st(rm); 146 if (flags & DEST_RM)
157 tagb = FPU_gettagi(rm); 147 deststnr = rm;
158
159 if ( flags & DEST_RM )
160 deststnr = rm;
161 }
162
163 signa = getsign(a);
164 signb = getsign(b);
165
166 if ( flags & REV )
167 {
168 signa ^= SIGN_NEG;
169 signb ^= SIGN_NEG;
170 }
171
172 dest = &st(deststnr);
173 saved_sign = getsign(dest);
174
175 if ( !(taga | tagb) )
176 {
177 expa = exponent(a);
178 expb = exponent(b);
179
180 valid_subtract:
181 /* Both registers are valid */
182
183 diff = expa - expb;
184
185 if (!diff)
186 {
187 diff = a->sigh - b->sigh; /* Works only if ms bits are identical */
188 if (!diff)
189 {
190 diff = a->sigl > b->sigl;
191 if (!diff)
192 diff = -(a->sigl < b->sigl);
193 }
194 } 148 }
195 149
196 switch ( (((int)signa)*2 + signb) / SIGN_NEG ) 150 signa = getsign(a);
197 { 151 signb = getsign(b);
198 case 0: /* P - P */ 152
199 case 3: /* N - N */ 153 if (flags & REV) {
200 if (diff > 0) 154 signa ^= SIGN_NEG;
201 { 155 signb ^= SIGN_NEG;
202 /* |a| > |b| */ 156 }
203 tag = FPU_u_sub(a, b, dest, control_w, signa, expa, expb); 157
204 } 158 dest = &st(deststnr);
205 else if ( diff == 0 ) 159 saved_sign = getsign(dest);
206 { 160
207 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); 161 if (!(taga | tagb)) {
208 162 expa = exponent(a);
209 /* sign depends upon rounding mode */ 163 expb = exponent(b);
210 setsign(dest, ((control_w & CW_RC) != RC_DOWN) 164
211 ? SIGN_POS : SIGN_NEG); 165 valid_subtract:
212 return TAG_Zero; 166 /* Both registers are valid */
213 } 167
214 else 168 diff = expa - expb;
215 { 169
216 sign = signa ^ SIGN_NEG; 170 if (!diff) {
217 tag = FPU_u_sub(b, a, dest, control_w, sign, expb, expa); 171 diff = a->sigh - b->sigh; /* Works only if ms bits are identical */
218 } 172 if (!diff) {
219 break; 173 diff = a->sigl > b->sigl;
220 case 1: /* P - N */ 174 if (!diff)
221 tag = FPU_u_add(a, b, dest, control_w, SIGN_POS, expa, expb); 175 diff = -(a->sigl < b->sigl);
222 break; 176 }
223 case 2: /* N - P */ 177 }
224 tag = FPU_u_add(a, b, dest, control_w, SIGN_NEG, expa, expb); 178
225 break; 179 switch ((((int)signa) * 2 + signb) / SIGN_NEG) {
180 case 0: /* P - P */
181 case 3: /* N - N */
182 if (diff > 0) {
183 /* |a| > |b| */
184 tag =
185 FPU_u_sub(a, b, dest, control_w, signa,
186 expa, expb);
187 } else if (diff == 0) {
188 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
189
190 /* sign depends upon rounding mode */
191 setsign(dest, ((control_w & CW_RC) != RC_DOWN)
192 ? SIGN_POS : SIGN_NEG);
193 return TAG_Zero;
194 } else {
195 sign = signa ^ SIGN_NEG;
196 tag =
197 FPU_u_sub(b, a, dest, control_w, sign, expb,
198 expa);
199 }
200 break;
201 case 1: /* P - N */
202 tag =
203 FPU_u_add(a, b, dest, control_w, SIGN_POS, expa,
204 expb);
205 break;
206 case 2: /* N - P */
207 tag =
208 FPU_u_add(a, b, dest, control_w, SIGN_NEG, expa,
209 expb);
210 break;
226#ifdef PARANOID 211#ifdef PARANOID
227 default: 212 default:
228 EXCEPTION(EX_INTERNAL|0x111); 213 EXCEPTION(EX_INTERNAL | 0x111);
229 return -1; 214 return -1;
230#endif 215#endif
216 }
217 if (tag < 0) {
218 setsign(dest, saved_sign);
219 return tag;
220 }
221 FPU_settagi(deststnr, tag);
222 return tag;
231 } 223 }
232 if ( tag < 0 )
233 {
234 setsign(dest, saved_sign);
235 return tag;
236 }
237 FPU_settagi(deststnr, tag);
238 return tag;
239 }
240 224
241 if ( taga == TAG_Special ) 225 if (taga == TAG_Special)
242 taga = FPU_Special(a); 226 taga = FPU_Special(a);
243 if ( tagb == TAG_Special ) 227 if (tagb == TAG_Special)
244 tagb = FPU_Special(b); 228 tagb = FPU_Special(b);
245 229
246 if ( ((taga == TAG_Valid) && (tagb == TW_Denormal)) 230 if (((taga == TAG_Valid) && (tagb == TW_Denormal))
247 || ((taga == TW_Denormal) && (tagb == TAG_Valid)) 231 || ((taga == TW_Denormal) && (tagb == TAG_Valid))
248 || ((taga == TW_Denormal) && (tagb == TW_Denormal)) ) 232 || ((taga == TW_Denormal) && (tagb == TW_Denormal))) {
249 { 233 FPU_REG x, y;
250 FPU_REG x, y;
251 234
252 if ( denormal_operand() < 0 ) 235 if (denormal_operand() < 0)
253 return FPU_Exception; 236 return FPU_Exception;
237
238 FPU_to_exp16(a, &x);
239 FPU_to_exp16(b, &y);
240 a = &x;
241 b = &y;
242 expa = exponent16(a);
243 expb = exponent16(b);
254 244
255 FPU_to_exp16(a, &x); 245 goto valid_subtract;
256 FPU_to_exp16(b, &y);
257 a = &x;
258 b = &y;
259 expa = exponent16(a);
260 expb = exponent16(b);
261
262 goto valid_subtract;
263 }
264
265 if ( (taga == TW_NaN) || (tagb == TW_NaN) )
266 {
267 FPU_REG const *d1, *d2;
268 if ( flags & REV )
269 {
270 d1 = b;
271 d2 = a;
272 } 246 }
273 else 247
274 { 248 if ((taga == TW_NaN) || (tagb == TW_NaN)) {
275 d1 = a; 249 FPU_REG const *d1, *d2;
276 d2 = b; 250 if (flags & REV) {
251 d1 = b;
252 d2 = a;
253 } else {
254 d1 = a;
255 d2 = b;
256 }
257 if (flags & LOADED)
258 return real_2op_NaN(b, tagb, deststnr, d1);
259 if (flags & DEST_RM)
260 return real_2op_NaN(a, taga, deststnr, d2);
261 else
262 return real_2op_NaN(b, tagb, deststnr, d2);
277 } 263 }
278 if ( flags & LOADED )
279 return real_2op_NaN(b, tagb, deststnr, d1);
280 if ( flags & DEST_RM )
281 return real_2op_NaN(a, taga, deststnr, d2);
282 else
283 return real_2op_NaN(b, tagb, deststnr, d2);
284 }
285
286 return add_sub_specials(a, taga, signa, b, tagb, signb ^ SIGN_NEG,
287 dest, deststnr, control_w);
288}
289 264
265 return add_sub_specials(a, taga, signa, b, tagb, signb ^ SIGN_NEG,
266 dest, deststnr, control_w);
267}
290 268
291static 269static
292int add_sub_specials(FPU_REG const *a, u_char taga, u_char signa, 270int add_sub_specials(FPU_REG const *a, u_char taga, u_char signa,
293 FPU_REG const *b, u_char tagb, u_char signb, 271 FPU_REG const *b, u_char tagb, u_char signb,
294 FPU_REG *dest, int deststnr, int control_w) 272 FPU_REG * dest, int deststnr, int control_w)
295{ 273{
296 if ( ((taga == TW_Denormal) || (tagb == TW_Denormal)) 274 if (((taga == TW_Denormal) || (tagb == TW_Denormal))
297 && (denormal_operand() < 0) ) 275 && (denormal_operand() < 0))
298 return FPU_Exception; 276 return FPU_Exception;
299 277
300 if (taga == TAG_Zero) 278 if (taga == TAG_Zero) {
301 { 279 if (tagb == TAG_Zero) {
302 if (tagb == TAG_Zero) 280 /* Both are zero, result will be zero. */
303 { 281 u_char different_signs = signa ^ signb;
304 /* Both are zero, result will be zero. */ 282
305 u_char different_signs = signa ^ signb; 283 FPU_copy_to_regi(a, TAG_Zero, deststnr);
306 284 if (different_signs) {
307 FPU_copy_to_regi(a, TAG_Zero, deststnr); 285 /* Signs are different. */
308 if ( different_signs ) 286 /* Sign of answer depends upon rounding mode. */
309 { 287 setsign(dest, ((control_w & CW_RC) != RC_DOWN)
310 /* Signs are different. */ 288 ? SIGN_POS : SIGN_NEG);
311 /* Sign of answer depends upon rounding mode. */ 289 } else
312 setsign(dest, ((control_w & CW_RC) != RC_DOWN) 290 setsign(dest, signa); /* signa may differ from the sign of a. */
313 ? SIGN_POS : SIGN_NEG); 291 return TAG_Zero;
314 } 292 } else {
315 else 293 reg_copy(b, dest);
316 setsign(dest, signa); /* signa may differ from the sign of a. */ 294 if ((tagb == TW_Denormal) && (b->sigh & 0x80000000)) {
317 return TAG_Zero; 295 /* A pseudoDenormal, convert it. */
318 } 296 addexponent(dest, 1);
319 else 297 tagb = TAG_Valid;
320 { 298 } else if (tagb > TAG_Empty)
321 reg_copy(b, dest); 299 tagb = TAG_Special;
322 if ( (tagb == TW_Denormal) && (b->sigh & 0x80000000) ) 300 setsign(dest, signb); /* signb may differ from the sign of b. */
323 { 301 FPU_settagi(deststnr, tagb);
324 /* A pseudoDenormal, convert it. */ 302 return tagb;
325 addexponent(dest, 1); 303 }
326 tagb = TAG_Valid; 304 } else if (tagb == TAG_Zero) {
327 } 305 reg_copy(a, dest);
328 else if ( tagb > TAG_Empty ) 306 if ((taga == TW_Denormal) && (a->sigh & 0x80000000)) {
329 tagb = TAG_Special; 307 /* A pseudoDenormal */
330 setsign(dest, signb); /* signb may differ from the sign of b. */ 308 addexponent(dest, 1);
331 FPU_settagi(deststnr, tagb); 309 taga = TAG_Valid;
332 return tagb; 310 } else if (taga > TAG_Empty)
333 } 311 taga = TAG_Special;
334 } 312 setsign(dest, signa); /* signa may differ from the sign of a. */
335 else if (tagb == TAG_Zero) 313 FPU_settagi(deststnr, taga);
336 { 314 return taga;
337 reg_copy(a, dest); 315 } else if (taga == TW_Infinity) {
338 if ( (taga == TW_Denormal) && (a->sigh & 0x80000000) ) 316 if ((tagb != TW_Infinity) || (signa == signb)) {
339 { 317 FPU_copy_to_regi(a, TAG_Special, deststnr);
340 /* A pseudoDenormal */ 318 setsign(dest, signa); /* signa may differ from the sign of a. */
341 addexponent(dest, 1); 319 return taga;
342 taga = TAG_Valid; 320 }
343 } 321 /* Infinity-Infinity is undefined. */
344 else if ( taga > TAG_Empty ) 322 return arith_invalid(deststnr);
345 taga = TAG_Special; 323 } else if (tagb == TW_Infinity) {
346 setsign(dest, signa); /* signa may differ from the sign of a. */ 324 FPU_copy_to_regi(b, TAG_Special, deststnr);
347 FPU_settagi(deststnr, taga); 325 setsign(dest, signb); /* signb may differ from the sign of b. */
348 return taga; 326 return tagb;
349 }
350 else if (taga == TW_Infinity)
351 {
352 if ( (tagb != TW_Infinity) || (signa == signb) )
353 {
354 FPU_copy_to_regi(a, TAG_Special, deststnr);
355 setsign(dest, signa); /* signa may differ from the sign of a. */
356 return taga;
357 } 327 }
358 /* Infinity-Infinity is undefined. */
359 return arith_invalid(deststnr);
360 }
361 else if (tagb == TW_Infinity)
362 {
363 FPU_copy_to_regi(b, TAG_Special, deststnr);
364 setsign(dest, signb); /* signb may differ from the sign of b. */
365 return tagb;
366 }
367
368#ifdef PARANOID 328#ifdef PARANOID
369 EXCEPTION(EX_INTERNAL|0x101); 329 EXCEPTION(EX_INTERNAL | 0x101);
370#endif 330#endif
371 331
372 return FPU_Exception; 332 return FPU_Exception;
373} 333}
374
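The zero-result branches above, setsign(dest, ((control_w & CW_RC) != RC_DOWN) ? SIGN_POS : SIGN_NEG), implement the IEEE-754 rule that an exactly-zero difference is +0 in every rounding mode except round-toward-negative-infinity, where it is -0. A minimal host-side sketch of the same rule using standard C99 <fenv.h> (illustrative only, not emulator code; a strict build may also want #pragma STDC FENV_ACCESS ON):

#include <fenv.h>
#include <math.h>
#include <stdio.h>

int main(void)
{
	volatile double x = 1.5;	/* volatile keeps x - x from being folded away */

	fesetround(FE_TONEAREST);
	printf("nearest : %c0\n", signbit(x - x) ? '-' : '+');	/* +0 */

	fesetround(FE_DOWNWARD);
	printf("downward: %c0\n", signbit(x - x) ? '-' : '+');	/* -0 */

	fesetround(FE_TONEAREST);
	return 0;
}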
diff --git a/arch/x86/math-emu/reg_compare.c b/arch/x86/math-emu/reg_compare.c
index f37c5b5a35ad..ecce55fc2e2e 100644
--- a/arch/x86/math-emu/reg_compare.c
+++ b/arch/x86/math-emu/reg_compare.c
@@ -20,362 +20,331 @@
20#include "control_w.h" 20#include "control_w.h"
21#include "status_w.h" 21#include "status_w.h"
22 22
23
24static int compare(FPU_REG const *b, int tagb) 23static int compare(FPU_REG const *b, int tagb)
25{ 24{
26 int diff, exp0, expb; 25 int diff, exp0, expb;
27 u_char st0_tag; 26 u_char st0_tag;
28 FPU_REG *st0_ptr; 27 FPU_REG *st0_ptr;
29 FPU_REG x, y; 28 FPU_REG x, y;
30 u_char st0_sign, signb = getsign(b); 29 u_char st0_sign, signb = getsign(b);
31 30
32 st0_ptr = &st(0); 31 st0_ptr = &st(0);
33 st0_tag = FPU_gettag0(); 32 st0_tag = FPU_gettag0();
34 st0_sign = getsign(st0_ptr); 33 st0_sign = getsign(st0_ptr);
35 34
36 if ( tagb == TAG_Special ) 35 if (tagb == TAG_Special)
37 tagb = FPU_Special(b); 36 tagb = FPU_Special(b);
38 if ( st0_tag == TAG_Special ) 37 if (st0_tag == TAG_Special)
39 st0_tag = FPU_Special(st0_ptr); 38 st0_tag = FPU_Special(st0_ptr);
40 39
41 if ( ((st0_tag != TAG_Valid) && (st0_tag != TW_Denormal)) 40 if (((st0_tag != TAG_Valid) && (st0_tag != TW_Denormal))
42 || ((tagb != TAG_Valid) && (tagb != TW_Denormal)) ) 41 || ((tagb != TAG_Valid) && (tagb != TW_Denormal))) {
43 { 42 if (st0_tag == TAG_Zero) {
44 if ( st0_tag == TAG_Zero ) 43 if (tagb == TAG_Zero)
45 { 44 return COMP_A_eq_B;
46 if ( tagb == TAG_Zero ) return COMP_A_eq_B; 45 if (tagb == TAG_Valid)
47 if ( tagb == TAG_Valid ) 46 return ((signb ==
48 return ((signb == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B); 47 SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B);
49 if ( tagb == TW_Denormal ) 48 if (tagb == TW_Denormal)
50 return ((signb == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B) 49 return ((signb ==
51 | COMP_Denormal; 50 SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B)
52 } 51 | COMP_Denormal;
53 else if ( tagb == TAG_Zero ) 52 } else if (tagb == TAG_Zero) {
54 { 53 if (st0_tag == TAG_Valid)
55 if ( st0_tag == TAG_Valid ) 54 return ((st0_sign ==
56 return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B); 55 SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B);
57 if ( st0_tag == TW_Denormal ) 56 if (st0_tag == TW_Denormal)
58 return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B) 57 return ((st0_sign ==
59 | COMP_Denormal; 58 SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
59 | COMP_Denormal;
60 }
61
62 if (st0_tag == TW_Infinity) {
63 if ((tagb == TAG_Valid) || (tagb == TAG_Zero))
64 return ((st0_sign ==
65 SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B);
66 else if (tagb == TW_Denormal)
67 return ((st0_sign ==
68 SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
69 | COMP_Denormal;
70 else if (tagb == TW_Infinity) {
71 /* The 80486 book says that infinities can be equal! */
72 return (st0_sign == signb) ? COMP_A_eq_B :
73 ((st0_sign ==
74 SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B);
75 }
76 /* Fall through to the NaN code */
77 } else if (tagb == TW_Infinity) {
78 if ((st0_tag == TAG_Valid) || (st0_tag == TAG_Zero))
79 return ((signb ==
80 SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B);
81 if (st0_tag == TW_Denormal)
82 return ((signb ==
83 SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B)
84 | COMP_Denormal;
85 /* Fall through to the NaN code */
86 }
87
88 /* The only possibility now should be that one of the arguments
89 is a NaN */
90 if ((st0_tag == TW_NaN) || (tagb == TW_NaN)) {
91 int signalling = 0, unsupported = 0;
92 if (st0_tag == TW_NaN) {
93 signalling =
94 (st0_ptr->sigh & 0xc0000000) == 0x80000000;
95 unsupported = !((exponent(st0_ptr) == EXP_OVER)
96 && (st0_ptr->
97 sigh & 0x80000000));
98 }
99 if (tagb == TW_NaN) {
100 signalling |=
101 (b->sigh & 0xc0000000) == 0x80000000;
102 unsupported |= !((exponent(b) == EXP_OVER)
103 && (b->sigh & 0x80000000));
104 }
105 if (signalling || unsupported)
106 return COMP_No_Comp | COMP_SNaN | COMP_NaN;
107 else
108 /* Neither is a signaling NaN */
109 return COMP_No_Comp | COMP_NaN;
110 }
111
112 EXCEPTION(EX_Invalid);
60 } 113 }
61 114
62 if ( st0_tag == TW_Infinity ) 115 if (st0_sign != signb) {
63 { 116 return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
64 if ( (tagb == TAG_Valid) || (tagb == TAG_Zero) ) 117 | (((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
65 return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B); 118 COMP_Denormal : 0);
66 else if ( tagb == TW_Denormal )
67 return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
68 | COMP_Denormal;
69 else if ( tagb == TW_Infinity )
70 {
71 /* The 80486 book says that infinities can be equal! */
72 return (st0_sign == signb) ? COMP_A_eq_B :
73 ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B);
74 }
75 /* Fall through to the NaN code */
76 }
77 else if ( tagb == TW_Infinity )
78 {
79 if ( (st0_tag == TAG_Valid) || (st0_tag == TAG_Zero) )
80 return ((signb == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B);
81 if ( st0_tag == TW_Denormal )
82 return ((signb == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B)
83 | COMP_Denormal;
84 /* Fall through to the NaN code */
85 } 119 }
86 120
87 /* The only possibility now should be that one of the arguments 121 if ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) {
88 is a NaN */ 122 FPU_to_exp16(st0_ptr, &x);
89 if ( (st0_tag == TW_NaN) || (tagb == TW_NaN) ) 123 FPU_to_exp16(b, &y);
90 { 124 st0_ptr = &x;
91 int signalling = 0, unsupported = 0; 125 b = &y;
92 if ( st0_tag == TW_NaN ) 126 exp0 = exponent16(st0_ptr);
93 { 127 expb = exponent16(b);
94 signalling = (st0_ptr->sigh & 0xc0000000) == 0x80000000; 128 } else {
95 unsupported = !((exponent(st0_ptr) == EXP_OVER) 129 exp0 = exponent(st0_ptr);
96 && (st0_ptr->sigh & 0x80000000)); 130 expb = exponent(b);
97 }
98 if ( tagb == TW_NaN )
99 {
100 signalling |= (b->sigh & 0xc0000000) == 0x80000000;
101 unsupported |= !((exponent(b) == EXP_OVER)
102 && (b->sigh & 0x80000000));
103 }
104 if ( signalling || unsupported )
105 return COMP_No_Comp | COMP_SNaN | COMP_NaN;
106 else
107 /* Neither is a signaling NaN */
108 return COMP_No_Comp | COMP_NaN;
109 } 131 }
110
111 EXCEPTION(EX_Invalid);
112 }
113
114 if (st0_sign != signb)
115 {
116 return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
117 | ( ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
118 COMP_Denormal : 0);
119 }
120
121 if ( (st0_tag == TW_Denormal) || (tagb == TW_Denormal) )
122 {
123 FPU_to_exp16(st0_ptr, &x);
124 FPU_to_exp16(b, &y);
125 st0_ptr = &x;
126 b = &y;
127 exp0 = exponent16(st0_ptr);
128 expb = exponent16(b);
129 }
130 else
131 {
132 exp0 = exponent(st0_ptr);
133 expb = exponent(b);
134 }
135 132
136#ifdef PARANOID 133#ifdef PARANOID
137 if (!(st0_ptr->sigh & 0x80000000)) EXCEPTION(EX_Invalid); 134 if (!(st0_ptr->sigh & 0x80000000))
138 if (!(b->sigh & 0x80000000)) EXCEPTION(EX_Invalid); 135 EXCEPTION(EX_Invalid);
136 if (!(b->sigh & 0x80000000))
137 EXCEPTION(EX_Invalid);
139#endif /* PARANOID */ 138#endif /* PARANOID */
140 139
141 diff = exp0 - expb; 140 diff = exp0 - expb;
142 if ( diff == 0 ) 141 if (diff == 0) {
143 { 142 diff = st0_ptr->sigh - b->sigh; /* Works only if ms bits are
144 diff = st0_ptr->sigh - b->sigh; /* Works only if ms bits are 143 identical */
145 identical */ 144 if (diff == 0) {
146 if ( diff == 0 ) 145 diff = st0_ptr->sigl > b->sigl;
147 { 146 if (diff == 0)
148 diff = st0_ptr->sigl > b->sigl; 147 diff = -(st0_ptr->sigl < b->sigl);
149 if ( diff == 0 ) 148 }
150 diff = -(st0_ptr->sigl < b->sigl);
151 } 149 }
152 }
153
154 if ( diff > 0 )
155 {
156 return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
157 | ( ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
158 COMP_Denormal : 0);
159 }
160 if ( diff < 0 )
161 {
162 return ((st0_sign == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B)
163 | ( ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
164 COMP_Denormal : 0);
165 }
166
167 return COMP_A_eq_B
168 | ( ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
169 COMP_Denormal : 0);
170 150
171} 151 if (diff > 0) {
152 return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
153 | (((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
154 COMP_Denormal : 0);
155 }
156 if (diff < 0) {
157 return ((st0_sign == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B)
158 | (((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
159 COMP_Denormal : 0);
160 }
172 161
162 return COMP_A_eq_B
163 | (((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
164 COMP_Denormal : 0);
165
166}
173 167
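In compare() above, a NaN's quiet/signaling nature is read from the top bits of the explicit-integer-bit significand: top bits 11 mean quiet, 10 mean signaling, and a clear integer bit marks an unsupported encoding. A standalone restatement of that test (the helper name and enum are illustrative, not kernel identifiers; it assumes the exponent field was already found to be all ones, as the exponent() == EXP_OVER check does):

#include <stdint.h>

/* Classify the high 32 bits of an 80-bit extended-precision NaN
 * significand.  Bit 31 is the explicit integer bit, bit 30 is the
 * quiet/signaling bit. */
enum nan_kind { NOT_SUPPORTED, SIGNALING_NAN, QUIET_NAN };

static enum nan_kind classify_ext_nan(uint32_t sigh)
{
	if (!(sigh & 0x80000000u))
		return NOT_SUPPORTED;		/* integer bit clear: pseudo-NaN */
	if ((sigh & 0xc0000000u) == 0x80000000u)
		return SIGNALING_NAN;		/* 10xx... */
	return QUIET_NAN;			/* 11xx... */
}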
174/* This function requires that st(0) is not empty */ 168/* This function requires that st(0) is not empty */
175int FPU_compare_st_data(FPU_REG const *loaded_data, u_char loaded_tag) 169int FPU_compare_st_data(FPU_REG const *loaded_data, u_char loaded_tag)
176{ 170{
177 int f = 0, c; 171 int f = 0, c;
178 172
179 c = compare(loaded_data, loaded_tag); 173 c = compare(loaded_data, loaded_tag);
180 174
181 if (c & COMP_NaN) 175 if (c & COMP_NaN) {
182 { 176 EXCEPTION(EX_Invalid);
183 EXCEPTION(EX_Invalid); 177 f = SW_C3 | SW_C2 | SW_C0;
184 f = SW_C3 | SW_C2 | SW_C0; 178 } else
185 } 179 switch (c & 7) {
186 else 180 case COMP_A_lt_B:
187 switch (c & 7) 181 f = SW_C0;
188 { 182 break;
189 case COMP_A_lt_B: 183 case COMP_A_eq_B:
190 f = SW_C0; 184 f = SW_C3;
191 break; 185 break;
192 case COMP_A_eq_B: 186 case COMP_A_gt_B:
193 f = SW_C3; 187 f = 0;
194 break; 188 break;
195 case COMP_A_gt_B: 189 case COMP_No_Comp:
196 f = 0; 190 f = SW_C3 | SW_C2 | SW_C0;
197 break; 191 break;
198 case COMP_No_Comp:
199 f = SW_C3 | SW_C2 | SW_C0;
200 break;
201#ifdef PARANOID 192#ifdef PARANOID
202 default: 193 default:
203 EXCEPTION(EX_INTERNAL|0x121); 194 EXCEPTION(EX_INTERNAL | 0x121);
204 f = SW_C3 | SW_C2 | SW_C0; 195 f = SW_C3 | SW_C2 | SW_C0;
205 break; 196 break;
206#endif /* PARANOID */ 197#endif /* PARANOID */
207 } 198 }
208 setcc(f); 199 setcc(f);
209 if (c & COMP_Denormal) 200 if (c & COMP_Denormal) {
210 { 201 return denormal_operand() < 0;
211 return denormal_operand() < 0; 202 }
212 } 203 return 0;
213 return 0;
214} 204}
215 205
216
217static int compare_st_st(int nr) 206static int compare_st_st(int nr)
218{ 207{
219 int f = 0, c; 208 int f = 0, c;
220 FPU_REG *st_ptr; 209 FPU_REG *st_ptr;
221 210
222 if ( !NOT_EMPTY(0) || !NOT_EMPTY(nr) ) 211 if (!NOT_EMPTY(0) || !NOT_EMPTY(nr)) {
223 { 212 setcc(SW_C3 | SW_C2 | SW_C0);
224 setcc(SW_C3 | SW_C2 | SW_C0); 213 /* Stack fault */
225 /* Stack fault */ 214 EXCEPTION(EX_StackUnder);
226 EXCEPTION(EX_StackUnder); 215 return !(control_word & CW_Invalid);
227 return !(control_word & CW_Invalid); 216 }
228 } 217
229 218 st_ptr = &st(nr);
230 st_ptr = &st(nr); 219 c = compare(st_ptr, FPU_gettagi(nr));
231 c = compare(st_ptr, FPU_gettagi(nr)); 220 if (c & COMP_NaN) {
232 if (c & COMP_NaN) 221 setcc(SW_C3 | SW_C2 | SW_C0);
233 { 222 EXCEPTION(EX_Invalid);
234 setcc(SW_C3 | SW_C2 | SW_C0); 223 return !(control_word & CW_Invalid);
235 EXCEPTION(EX_Invalid); 224 } else
236 return !(control_word & CW_Invalid); 225 switch (c & 7) {
237 } 226 case COMP_A_lt_B:
238 else 227 f = SW_C0;
239 switch (c & 7) 228 break;
240 { 229 case COMP_A_eq_B:
241 case COMP_A_lt_B: 230 f = SW_C3;
242 f = SW_C0; 231 break;
243 break; 232 case COMP_A_gt_B:
244 case COMP_A_eq_B: 233 f = 0;
245 f = SW_C3; 234 break;
246 break; 235 case COMP_No_Comp:
247 case COMP_A_gt_B: 236 f = SW_C3 | SW_C2 | SW_C0;
248 f = 0; 237 break;
249 break;
250 case COMP_No_Comp:
251 f = SW_C3 | SW_C2 | SW_C0;
252 break;
253#ifdef PARANOID 238#ifdef PARANOID
254 default: 239 default:
255 EXCEPTION(EX_INTERNAL|0x122); 240 EXCEPTION(EX_INTERNAL | 0x122);
256 f = SW_C3 | SW_C2 | SW_C0; 241 f = SW_C3 | SW_C2 | SW_C0;
257 break; 242 break;
258#endif /* PARANOID */ 243#endif /* PARANOID */
259 } 244 }
260 setcc(f); 245 setcc(f);
261 if (c & COMP_Denormal) 246 if (c & COMP_Denormal) {
262 { 247 return denormal_operand() < 0;
263 return denormal_operand() < 0; 248 }
264 } 249 return 0;
265 return 0;
266} 250}
267 251
268
269static int compare_u_st_st(int nr) 252static int compare_u_st_st(int nr)
270{ 253{
271 int f = 0, c; 254 int f = 0, c;
272 FPU_REG *st_ptr; 255 FPU_REG *st_ptr;
273 256
274 if ( !NOT_EMPTY(0) || !NOT_EMPTY(nr) ) 257 if (!NOT_EMPTY(0) || !NOT_EMPTY(nr)) {
275 { 258 setcc(SW_C3 | SW_C2 | SW_C0);
276 setcc(SW_C3 | SW_C2 | SW_C0); 259 /* Stack fault */
277 /* Stack fault */ 260 EXCEPTION(EX_StackUnder);
278 EXCEPTION(EX_StackUnder); 261 return !(control_word & CW_Invalid);
279 return !(control_word & CW_Invalid);
280 }
281
282 st_ptr = &st(nr);
283 c = compare(st_ptr, FPU_gettagi(nr));
284 if (c & COMP_NaN)
285 {
286 setcc(SW_C3 | SW_C2 | SW_C0);
287 if (c & COMP_SNaN) /* This is the only difference between
288 un-ordered and ordinary comparisons */
289 {
290 EXCEPTION(EX_Invalid);
291 return !(control_word & CW_Invalid);
292 } 262 }
293 return 0; 263
294 } 264 st_ptr = &st(nr);
295 else 265 c = compare(st_ptr, FPU_gettagi(nr));
296 switch (c & 7) 266 if (c & COMP_NaN) {
297 { 267 setcc(SW_C3 | SW_C2 | SW_C0);
298 case COMP_A_lt_B: 268 if (c & COMP_SNaN) { /* This is the only difference between
299 f = SW_C0; 269 un-ordered and ordinary comparisons */
300 break; 270 EXCEPTION(EX_Invalid);
301 case COMP_A_eq_B: 271 return !(control_word & CW_Invalid);
302 f = SW_C3; 272 }
303 break; 273 return 0;
304 case COMP_A_gt_B: 274 } else
305 f = 0; 275 switch (c & 7) {
306 break; 276 case COMP_A_lt_B:
307 case COMP_No_Comp: 277 f = SW_C0;
308 f = SW_C3 | SW_C2 | SW_C0; 278 break;
309 break; 279 case COMP_A_eq_B:
280 f = SW_C3;
281 break;
282 case COMP_A_gt_B:
283 f = 0;
284 break;
285 case COMP_No_Comp:
286 f = SW_C3 | SW_C2 | SW_C0;
287 break;
310#ifdef PARANOID 288#ifdef PARANOID
311 default: 289 default:
312 EXCEPTION(EX_INTERNAL|0x123); 290 EXCEPTION(EX_INTERNAL | 0x123);
313 f = SW_C3 | SW_C2 | SW_C0; 291 f = SW_C3 | SW_C2 | SW_C0;
314 break; 292 break;
315#endif /* PARANOID */ 293#endif /* PARANOID */
316 } 294 }
317 setcc(f); 295 setcc(f);
318 if (c & COMP_Denormal) 296 if (c & COMP_Denormal) {
319 { 297 return denormal_operand() < 0;
320 return denormal_operand() < 0; 298 }
321 } 299 return 0;
322 return 0;
323} 300}
324 301
325/*---------------------------------------------------------------------------*/ 302/*---------------------------------------------------------------------------*/
326 303
327void fcom_st(void) 304void fcom_st(void)
328{ 305{
329 /* fcom st(i) */ 306 /* fcom st(i) */
330 compare_st_st(FPU_rm); 307 compare_st_st(FPU_rm);
331} 308}
332 309
333
334void fcompst(void) 310void fcompst(void)
335{ 311{
336 /* fcomp st(i) */ 312 /* fcomp st(i) */
337 if ( !compare_st_st(FPU_rm) ) 313 if (!compare_st_st(FPU_rm))
338 FPU_pop(); 314 FPU_pop();
339} 315}
340 316
341
342void fcompp(void) 317void fcompp(void)
343{ 318{
344 /* fcompp */ 319 /* fcompp */
345 if (FPU_rm != 1) 320 if (FPU_rm != 1) {
346 { 321 FPU_illegal();
347 FPU_illegal(); 322 return;
348 return; 323 }
349 } 324 if (!compare_st_st(1))
350 if ( !compare_st_st(1) ) 325 poppop();
351 poppop();
352} 326}
353 327
354
355void fucom_(void) 328void fucom_(void)
356{ 329{
357 /* fucom st(i) */ 330 /* fucom st(i) */
358 compare_u_st_st(FPU_rm); 331 compare_u_st_st(FPU_rm);
359 332
360} 333}
361 334
362
363void fucomp(void) 335void fucomp(void)
364{ 336{
365 /* fucomp st(i) */ 337 /* fucomp st(i) */
366 if ( !compare_u_st_st(FPU_rm) ) 338 if (!compare_u_st_st(FPU_rm))
367 FPU_pop(); 339 FPU_pop();
368} 340}
369 341
370
371void fucompp(void) 342void fucompp(void)
372{ 343{
373 /* fucompp */ 344 /* fucompp */
374 if (FPU_rm == 1) 345 if (FPU_rm == 1) {
375 { 346 if (!compare_u_st_st(1))
376 if ( !compare_u_st_st(1) ) 347 poppop();
377 poppop(); 348 } else
378 } 349 FPU_illegal();
379 else
380 FPU_illegal();
381} 350}
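The switch statements in FPU_compare_st_data() and compare_st_st() encode the comparison result into the architectural FCOM condition codes: C0 for "less than", C3 for "equal", none for "greater than", and C3|C2|C0 for unordered. A condensed sketch of that mapping (the macro values are the standard x87 status-word bit positions; the function name is illustrative, not from the kernel):

/* x87 status-word condition-code bits */
#define SW_C0 0x0100	/* bit 8  */
#define SW_C2 0x0400	/* bit 10 */
#define SW_C3 0x4000	/* bit 14 */

enum cmp { CMP_LT, CMP_EQ, CMP_GT, CMP_UNORDERED };

static int fcom_flags(enum cmp r)
{
	switch (r) {
	case CMP_LT:	return SW_C0;			/* ST(0) <  source */
	case CMP_EQ:	return SW_C3;			/* ST(0) == source */
	case CMP_GT:	return 0;			/* ST(0) >  source */
	default:	return SW_C3 | SW_C2 | SW_C0;	/* unordered / NaN */
	}
}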
diff --git a/arch/x86/math-emu/reg_constant.c b/arch/x86/math-emu/reg_constant.c
index a85015801969..04869e64b18e 100644
--- a/arch/x86/math-emu/reg_constant.c
+++ b/arch/x86/math-emu/reg_constant.c
@@ -16,29 +16,28 @@
16#include "reg_constant.h" 16#include "reg_constant.h"
17#include "control_w.h" 17#include "control_w.h"
18 18
19
20#define MAKE_REG(s,e,l,h) { l, h, \ 19#define MAKE_REG(s,e,l,h) { l, h, \
21 ((EXTENDED_Ebias+(e)) | ((SIGN_##s != 0)*0x8000)) } 20 ((EXTENDED_Ebias+(e)) | ((SIGN_##s != 0)*0x8000)) }
22 21
23FPU_REG const CONST_1 = MAKE_REG(POS, 0, 0x00000000, 0x80000000); 22FPU_REG const CONST_1 = MAKE_REG(POS, 0, 0x00000000, 0x80000000);
24#if 0 23#if 0
25FPU_REG const CONST_2 = MAKE_REG(POS, 1, 0x00000000, 0x80000000); 24FPU_REG const CONST_2 = MAKE_REG(POS, 1, 0x00000000, 0x80000000);
26FPU_REG const CONST_HALF = MAKE_REG(POS, -1, 0x00000000, 0x80000000); 25FPU_REG const CONST_HALF = MAKE_REG(POS, -1, 0x00000000, 0x80000000);
27#endif /* 0 */ 26#endif /* 0 */
28static FPU_REG const CONST_L2T = MAKE_REG(POS, 1, 0xcd1b8afe, 0xd49a784b); 27static FPU_REG const CONST_L2T = MAKE_REG(POS, 1, 0xcd1b8afe, 0xd49a784b);
29static FPU_REG const CONST_L2E = MAKE_REG(POS, 0, 0x5c17f0bc, 0xb8aa3b29); 28static FPU_REG const CONST_L2E = MAKE_REG(POS, 0, 0x5c17f0bc, 0xb8aa3b29);
30FPU_REG const CONST_PI = MAKE_REG(POS, 1, 0x2168c235, 0xc90fdaa2); 29FPU_REG const CONST_PI = MAKE_REG(POS, 1, 0x2168c235, 0xc90fdaa2);
31FPU_REG const CONST_PI2 = MAKE_REG(POS, 0, 0x2168c235, 0xc90fdaa2); 30FPU_REG const CONST_PI2 = MAKE_REG(POS, 0, 0x2168c235, 0xc90fdaa2);
32FPU_REG const CONST_PI4 = MAKE_REG(POS, -1, 0x2168c235, 0xc90fdaa2); 31FPU_REG const CONST_PI4 = MAKE_REG(POS, -1, 0x2168c235, 0xc90fdaa2);
33static FPU_REG const CONST_LG2 = MAKE_REG(POS, -2, 0xfbcff799, 0x9a209a84); 32static FPU_REG const CONST_LG2 = MAKE_REG(POS, -2, 0xfbcff799, 0x9a209a84);
34static FPU_REG const CONST_LN2 = MAKE_REG(POS, -1, 0xd1cf79ac, 0xb17217f7); 33static FPU_REG const CONST_LN2 = MAKE_REG(POS, -1, 0xd1cf79ac, 0xb17217f7);
35 34
36/* Extra bits to take pi/2 to more than 128 bits precision. */ 35/* Extra bits to take pi/2 to more than 128 bits precision. */
37FPU_REG const CONST_PI2extra = MAKE_REG(NEG, -66, 36FPU_REG const CONST_PI2extra = MAKE_REG(NEG, -66,
38 0xfc8f8cbb, 0xece675d1); 37 0xfc8f8cbb, 0xece675d1);
39 38
40/* Only the sign (and tag) is used in internal zeroes */ 39/* Only the sign (and tag) is used in internal zeroes */
41FPU_REG const CONST_Z = MAKE_REG(POS, EXP_UNDER, 0x0, 0x0); 40FPU_REG const CONST_Z = MAKE_REG(POS, EXP_UNDER, 0x0, 0x0);
42 41
43/* Only the sign and significand (and tag) are used in internal NaNs */ 42/* Only the sign and significand (and tag) are used in internal NaNs */
44/* The 80486 never generates one of these 43/* The 80486 never generates one of these
@@ -48,24 +47,22 @@ FPU_REG const CONST_SNAN = MAKE_REG(POS, EXP_OVER, 0x00000001, 0x80000000);
48FPU_REG const CONST_QNaN = MAKE_REG(NEG, EXP_OVER, 0x00000000, 0xC0000000); 47FPU_REG const CONST_QNaN = MAKE_REG(NEG, EXP_OVER, 0x00000000, 0xC0000000);
49 48
50/* Only the sign (and tag) is used in internal infinities */ 49/* Only the sign (and tag) is used in internal infinities */
51FPU_REG const CONST_INF = MAKE_REG(POS, EXP_OVER, 0x00000000, 0x80000000); 50FPU_REG const CONST_INF = MAKE_REG(POS, EXP_OVER, 0x00000000, 0x80000000);
52
53 51
54static void fld_const(FPU_REG const *c, int adj, u_char tag) 52static void fld_const(FPU_REG const *c, int adj, u_char tag)
55{ 53{
56 FPU_REG *st_new_ptr; 54 FPU_REG *st_new_ptr;
57 55
58 if ( STACK_OVERFLOW ) 56 if (STACK_OVERFLOW) {
59 { 57 FPU_stack_overflow();
60 FPU_stack_overflow(); 58 return;
61 return; 59 }
62 } 60 push();
63 push(); 61 reg_copy(c, st_new_ptr);
64 reg_copy(c, st_new_ptr); 62 st_new_ptr->sigl += adj; /* For all our fldxxx constants, we don't need to
65 st_new_ptr->sigl += adj; /* For all our fldxxx constants, we don't need to 63 borrow or carry. */
66 borrow or carry. */ 64 FPU_settag0(tag);
67 FPU_settag0(tag); 65 clear_C1();
68 clear_C1();
69} 66}
70 67
71/* A fast way to find out whether x is one of RC_DOWN or RC_CHOP 68/* A fast way to find out whether x is one of RC_DOWN or RC_CHOP
@@ -75,46 +72,46 @@ static void fld_const(FPU_REG const *c, int adj, u_char tag)
75 72
76static void fld1(int rc) 73static void fld1(int rc)
77{ 74{
78 fld_const(&CONST_1, 0, TAG_Valid); 75 fld_const(&CONST_1, 0, TAG_Valid);
79} 76}
80 77
81static void fldl2t(int rc) 78static void fldl2t(int rc)
82{ 79{
83 fld_const(&CONST_L2T, (rc == RC_UP) ? 1 : 0, TAG_Valid); 80 fld_const(&CONST_L2T, (rc == RC_UP) ? 1 : 0, TAG_Valid);
84} 81}
85 82
86static void fldl2e(int rc) 83static void fldl2e(int rc)
87{ 84{
88 fld_const(&CONST_L2E, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid); 85 fld_const(&CONST_L2E, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid);
89} 86}
90 87
91static void fldpi(int rc) 88static void fldpi(int rc)
92{ 89{
93 fld_const(&CONST_PI, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid); 90 fld_const(&CONST_PI, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid);
94} 91}
95 92
96static void fldlg2(int rc) 93static void fldlg2(int rc)
97{ 94{
98 fld_const(&CONST_LG2, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid); 95 fld_const(&CONST_LG2, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid);
99} 96}
100 97
101static void fldln2(int rc) 98static void fldln2(int rc)
102{ 99{
103 fld_const(&CONST_LN2, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid); 100 fld_const(&CONST_LN2, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid);
104} 101}
105 102
106static void fldz(int rc) 103static void fldz(int rc)
107{ 104{
108 fld_const(&CONST_Z, 0, TAG_Zero); 105 fld_const(&CONST_Z, 0, TAG_Zero);
109} 106}
110 107
111typedef void (*FUNC_RC)(int); 108typedef void (*FUNC_RC) (int);
112 109
113static FUNC_RC constants_table[] = { 110static FUNC_RC constants_table[] = {
114 fld1, fldl2t, fldl2e, fldpi, fldlg2, fldln2, fldz, (FUNC_RC)FPU_illegal 111 fld1, fldl2t, fldl2e, fldpi, fldlg2, fldln2, fldz, (FUNC_RC) FPU_illegal
115}; 112};
116 113
117void fconst(void) 114void fconst(void)
118{ 115{
119 (constants_table[FPU_rm])(control_word & CW_RC); 116 (constants_table[FPU_rm]) (control_word & CW_RC);
120} 117}
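Each MAKE_REG constant above stores a 64-bit significand with an explicit integer bit plus a 15-bit exponent biased by EXTENDED_Ebias (16383), so its value is significand * 2^(exp - 63). A small host-side sketch that recovers CONST_PI's value this way (assumes the host's long double carries 64 significand bits, as on x86; not emulator code):

#include <math.h>
#include <stdio.h>

int main(void)
{
	/* CONST_PI: sigh = 0xc90fdaa2, sigl = 0x2168c235, unbiased exp = 1 */
	unsigned long long sig = (0xc90fdaa2ULL << 32) | 0x2168c235ULL;
	int exp = 1;

	long double val = ldexpl((long double)sig, exp - 63);
	printf("%.18Lf\n", val);	/* ~3.141592653589793238 */
	return 0;
}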
diff --git a/arch/x86/math-emu/reg_convert.c b/arch/x86/math-emu/reg_convert.c
index 45a258752703..108060779977 100644
--- a/arch/x86/math-emu/reg_convert.c
+++ b/arch/x86/math-emu/reg_convert.c
@@ -13,41 +13,34 @@
13#include "exception.h" 13#include "exception.h"
14#include "fpu_emu.h" 14#include "fpu_emu.h"
15 15
16
17int FPU_to_exp16(FPU_REG const *a, FPU_REG *x) 16int FPU_to_exp16(FPU_REG const *a, FPU_REG *x)
18{ 17{
19 int sign = getsign(a); 18 int sign = getsign(a);
20 19
21 *(long long *)&(x->sigl) = *(const long long *)&(a->sigl); 20 *(long long *)&(x->sigl) = *(const long long *)&(a->sigl);
22 21
23 /* Set up the exponent as a 16 bit quantity. */ 22 /* Set up the exponent as a 16 bit quantity. */
24 setexponent16(x, exponent(a)); 23 setexponent16(x, exponent(a));
25 24
26 if ( exponent16(x) == EXP_UNDER ) 25 if (exponent16(x) == EXP_UNDER) {
27 { 26 /* The number is a de-normal or pseudodenormal. */
28 /* The number is a de-normal or pseudodenormal. */ 27 /* We only deal with the significand and exponent. */
29 /* We only deal with the significand and exponent. */ 28
30 29 if (x->sigh & 0x80000000) {
31 if (x->sigh & 0x80000000) 30 /* Is a pseudodenormal. */
32 { 31 /* This is non-80486 behaviour because the number
33 /* Is a pseudodenormal. */ 32 loses its 'denormal' identity. */
34 /* This is non-80486 behaviour because the number 33 addexponent(x, 1);
35 loses its 'denormal' identity. */ 34 } else {
36 addexponent(x, 1); 35 /* Is a denormal. */
37 } 36 addexponent(x, 1);
38 else 37 FPU_normalize_nuo(x);
39 { 38 }
40 /* Is a denormal. */
41 addexponent(x, 1);
42 FPU_normalize_nuo(x);
43 } 39 }
44 }
45 40
46 if ( !(x->sigh & 0x80000000) ) 41 if (!(x->sigh & 0x80000000)) {
47 { 42 EXCEPTION(EX_INTERNAL | 0x180);
48 EXCEPTION(EX_INTERNAL | 0x180); 43 }
49 }
50 44
51 return sign; 45 return sign;
52} 46}
53
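For a true denormal, FPU_to_exp16() relies on FPU_normalize_nuo() to shift the significand up until its most-significant bit is set, decrementing the widened exponent as it goes; that is the invariant the PARANOID check enforces. A minimal standalone sketch of that step (a plain uint64_t/int pair, not the kernel's FPU_REG, and no underflow/overflow handling, as in the "nuo" variant):

#include <stdint.h>

static void normalize64(uint64_t *sig, int *exp)
{
	if (*sig == 0)
		return;			/* zero cannot be normalized */
	while (!(*sig & 0x8000000000000000ull)) {
		*sig <<= 1;		/* shift significand up ...           */
		(*exp)--;		/* ... and compensate in the exponent */
	}
}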
diff --git a/arch/x86/math-emu/reg_divide.c b/arch/x86/math-emu/reg_divide.c
index 5cee7ff920d9..6827012db341 100644
--- a/arch/x86/math-emu/reg_divide.c
+++ b/arch/x86/math-emu/reg_divide.c
@@ -26,182 +26,157 @@
26 */ 26 */
27int FPU_div(int flags, int rm, int control_w) 27int FPU_div(int flags, int rm, int control_w)
28{ 28{
29 FPU_REG x, y; 29 FPU_REG x, y;
30 FPU_REG const *a, *b, *st0_ptr, *st_ptr; 30 FPU_REG const *a, *b, *st0_ptr, *st_ptr;
31 FPU_REG *dest; 31 FPU_REG *dest;
32 u_char taga, tagb, signa, signb, sign, saved_sign; 32 u_char taga, tagb, signa, signb, sign, saved_sign;
33 int tag, deststnr; 33 int tag, deststnr;
34 34
35 if ( flags & DEST_RM ) 35 if (flags & DEST_RM)
36 deststnr = rm; 36 deststnr = rm;
37 else 37 else
38 deststnr = 0; 38 deststnr = 0;
39 39
40 if ( flags & REV ) 40 if (flags & REV) {
41 { 41 b = &st(0);
42 b = &st(0); 42 st0_ptr = b;
43 st0_ptr = b; 43 tagb = FPU_gettag0();
44 tagb = FPU_gettag0(); 44 if (flags & LOADED) {
45 if ( flags & LOADED ) 45 a = (FPU_REG *) rm;
46 { 46 taga = flags & 0x0f;
47 a = (FPU_REG *)rm; 47 } else {
48 taga = flags & 0x0f; 48 a = &st(rm);
49 st_ptr = a;
50 taga = FPU_gettagi(rm);
51 }
52 } else {
53 a = &st(0);
54 st0_ptr = a;
55 taga = FPU_gettag0();
56 if (flags & LOADED) {
57 b = (FPU_REG *) rm;
58 tagb = flags & 0x0f;
59 } else {
60 b = &st(rm);
61 st_ptr = b;
62 tagb = FPU_gettagi(rm);
63 }
49 } 64 }
50 else
51 {
52 a = &st(rm);
53 st_ptr = a;
54 taga = FPU_gettagi(rm);
55 }
56 }
57 else
58 {
59 a = &st(0);
60 st0_ptr = a;
61 taga = FPU_gettag0();
62 if ( flags & LOADED )
63 {
64 b = (FPU_REG *)rm;
65 tagb = flags & 0x0f;
66 }
67 else
68 {
69 b = &st(rm);
70 st_ptr = b;
71 tagb = FPU_gettagi(rm);
72 }
73 }
74 65
75 signa = getsign(a); 66 signa = getsign(a);
76 signb = getsign(b); 67 signb = getsign(b);
77 68
78 sign = signa ^ signb; 69 sign = signa ^ signb;
79 70
80 dest = &st(deststnr); 71 dest = &st(deststnr);
81 saved_sign = getsign(dest); 72 saved_sign = getsign(dest);
82 73
83 if ( !(taga | tagb) ) 74 if (!(taga | tagb)) {
84 { 75 /* Both regs Valid, this should be the most common case. */
85 /* Both regs Valid, this should be the most common case. */ 76 reg_copy(a, &x);
86 reg_copy(a, &x); 77 reg_copy(b, &y);
87 reg_copy(b, &y); 78 setpositive(&x);
88 setpositive(&x); 79 setpositive(&y);
89 setpositive(&y); 80 tag = FPU_u_div(&x, &y, dest, control_w, sign);
90 tag = FPU_u_div(&x, &y, dest, control_w, sign);
91 81
92 if ( tag < 0 ) 82 if (tag < 0)
93 return tag; 83 return tag;
94 84
95 FPU_settagi(deststnr, tag); 85 FPU_settagi(deststnr, tag);
96 return tag; 86 return tag;
97 } 87 }
98 88
99 if ( taga == TAG_Special ) 89 if (taga == TAG_Special)
100 taga = FPU_Special(a); 90 taga = FPU_Special(a);
101 if ( tagb == TAG_Special ) 91 if (tagb == TAG_Special)
102 tagb = FPU_Special(b); 92 tagb = FPU_Special(b);
103 93
104 if ( ((taga == TAG_Valid) && (tagb == TW_Denormal)) 94 if (((taga == TAG_Valid) && (tagb == TW_Denormal))
105 || ((taga == TW_Denormal) && (tagb == TAG_Valid)) 95 || ((taga == TW_Denormal) && (tagb == TAG_Valid))
106 || ((taga == TW_Denormal) && (tagb == TW_Denormal)) ) 96 || ((taga == TW_Denormal) && (tagb == TW_Denormal))) {
107 { 97 if (denormal_operand() < 0)
108 if ( denormal_operand() < 0 ) 98 return FPU_Exception;
109 return FPU_Exception; 99
110 100 FPU_to_exp16(a, &x);
111 FPU_to_exp16(a, &x); 101 FPU_to_exp16(b, &y);
112 FPU_to_exp16(b, &y); 102 tag = FPU_u_div(&x, &y, dest, control_w, sign);
113 tag = FPU_u_div(&x, &y, dest, control_w, sign); 103 if (tag < 0)
114 if ( tag < 0 ) 104 return tag;
115 return tag; 105
116 106 FPU_settagi(deststnr, tag);
117 FPU_settagi(deststnr, tag); 107 return tag;
118 return tag; 108 } else if ((taga <= TW_Denormal) && (tagb <= TW_Denormal)) {
119 } 109 if (tagb != TAG_Zero) {
120 else if ( (taga <= TW_Denormal) && (tagb <= TW_Denormal) ) 110 /* Want to find Zero/Valid */
121 { 111 if (tagb == TW_Denormal) {
122 if ( tagb != TAG_Zero ) 112 if (denormal_operand() < 0)
123 { 113 return FPU_Exception;
124 /* Want to find Zero/Valid */ 114 }
125 if ( tagb == TW_Denormal ) 115
126 { 116 /* The result is zero. */
127 if ( denormal_operand() < 0 ) 117 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
128 return FPU_Exception; 118 setsign(dest, sign);
129 } 119 return TAG_Zero;
130 120 }
131 /* The result is zero. */ 121 /* We have an exception condition, either 0/0 or Valid/Zero. */
132 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); 122 if (taga == TAG_Zero) {
133 setsign(dest, sign); 123 /* 0/0 */
134 return TAG_Zero; 124 return arith_invalid(deststnr);
125 }
126 /* Valid/Zero */
127 return FPU_divide_by_zero(deststnr, sign);
135 } 128 }
136 /* We have an exception condition, either 0/0 or Valid/Zero. */ 129 /* Must have infinities, NaNs, etc */
137 if ( taga == TAG_Zero ) 130 else if ((taga == TW_NaN) || (tagb == TW_NaN)) {
138 { 131 if (flags & LOADED)
139 /* 0/0 */ 132 return real_2op_NaN((FPU_REG *) rm, flags & 0x0f, 0,
140 return arith_invalid(deststnr); 133 st0_ptr);
134
135 if (flags & DEST_RM) {
136 int tag;
137 tag = FPU_gettag0();
138 if (tag == TAG_Special)
139 tag = FPU_Special(st0_ptr);
140 return real_2op_NaN(st0_ptr, tag, rm,
141 (flags & REV) ? st0_ptr : &st(rm));
142 } else {
143 int tag;
144 tag = FPU_gettagi(rm);
145 if (tag == TAG_Special)
146 tag = FPU_Special(&st(rm));
147 return real_2op_NaN(&st(rm), tag, 0,
148 (flags & REV) ? st0_ptr : &st(rm));
149 }
150 } else if (taga == TW_Infinity) {
151 if (tagb == TW_Infinity) {
152 /* infinity/infinity */
153 return arith_invalid(deststnr);
154 } else {
155 /* tagb must be Valid or Zero */
156 if ((tagb == TW_Denormal) && (denormal_operand() < 0))
157 return FPU_Exception;
158
159 /* Infinity divided by Zero or Valid does
 160 not raise an exception, but returns Infinity */
161 FPU_copy_to_regi(a, TAG_Special, deststnr);
162 setsign(dest, sign);
163 return taga;
164 }
165 } else if (tagb == TW_Infinity) {
166 if ((taga == TW_Denormal) && (denormal_operand() < 0))
167 return FPU_Exception;
168
169 /* The result is zero. */
170 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
171 setsign(dest, sign);
172 return TAG_Zero;
141 } 173 }
142 /* Valid/Zero */
143 return FPU_divide_by_zero(deststnr, sign);
144 }
145 /* Must have infinities, NaNs, etc */
146 else if ( (taga == TW_NaN) || (tagb == TW_NaN) )
147 {
148 if ( flags & LOADED )
149 return real_2op_NaN((FPU_REG *)rm, flags & 0x0f, 0, st0_ptr);
150
151 if ( flags & DEST_RM )
152 {
153 int tag;
154 tag = FPU_gettag0();
155 if ( tag == TAG_Special )
156 tag = FPU_Special(st0_ptr);
157 return real_2op_NaN(st0_ptr, tag, rm, (flags & REV) ? st0_ptr : &st(rm));
158 }
159 else
160 {
161 int tag;
162 tag = FPU_gettagi(rm);
163 if ( tag == TAG_Special )
164 tag = FPU_Special(&st(rm));
165 return real_2op_NaN(&st(rm), tag, 0, (flags & REV) ? st0_ptr : &st(rm));
166 }
167 }
168 else if (taga == TW_Infinity)
169 {
170 if (tagb == TW_Infinity)
171 {
172 /* infinity/infinity */
173 return arith_invalid(deststnr);
174 }
175 else
176 {
177 /* tagb must be Valid or Zero */
178 if ( (tagb == TW_Denormal) && (denormal_operand() < 0) )
179 return FPU_Exception;
180
181 /* Infinity divided by Zero or Valid does
 182 not raise an exception, but returns Infinity */
183 FPU_copy_to_regi(a, TAG_Special, deststnr);
184 setsign(dest, sign);
185 return taga;
186 }
187 }
188 else if (tagb == TW_Infinity)
189 {
190 if ( (taga == TW_Denormal) && (denormal_operand() < 0) )
191 return FPU_Exception;
192
193 /* The result is zero. */
194 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
195 setsign(dest, sign);
196 return TAG_Zero;
197 }
198#ifdef PARANOID 174#ifdef PARANOID
199 else 175 else {
200 { 176 EXCEPTION(EX_INTERNAL | 0x102);
201 EXCEPTION(EX_INTERNAL|0x102); 177 return FPU_Exception;
202 return FPU_Exception; 178 }
203 } 179#endif /* PARANOID */
204#endif /* PARANOID */
205 180
206 return 0; 181 return 0;
207} 182}
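The special-case ladder in FPU_div() mirrors the usual IEEE-754 division rules: 0/0 and infinity/infinity are invalid (QNaN), finite/0 signals divide-by-zero and yields a signed infinity, infinity/finite yields infinity, and finite/infinity yields a signed zero. The same behaviour can be observed with ordinary host doubles (plain C, nothing emulator-specific):

#include <math.h>
#include <stdio.h>

int main(void)
{
	volatile double zero = 0.0, one = 1.0, inf = INFINITY;

	printf("0/0     -> %f\n", zero / zero);	/* nan  (invalid)        */
	printf("1/0     -> %f\n", one / zero);	/* inf  (divide-by-zero) */
	printf("inf/inf -> %f\n", inf / inf);	/* nan  (invalid)        */
	printf("inf/1   -> %f\n", inf / one);	/* inf                   */
	printf("-1/inf  -> %f\n", -one / inf);	/* -0                    */
	return 0;
}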
diff --git a/arch/x86/math-emu/reg_ld_str.c b/arch/x86/math-emu/reg_ld_str.c
index e976caef6498..799d4af5be66 100644
--- a/arch/x86/math-emu/reg_ld_str.c
+++ b/arch/x86/math-emu/reg_ld_str.c
@@ -27,1084 +27,938 @@
27#include "control_w.h" 27#include "control_w.h"
28#include "status_w.h" 28#include "status_w.h"
29 29
30 30#define DOUBLE_Emax 1023 /* largest valid exponent */
31#define DOUBLE_Emax 1023 /* largest valid exponent */
32#define DOUBLE_Ebias 1023 31#define DOUBLE_Ebias 1023
33#define DOUBLE_Emin (-1022) /* smallest valid exponent */ 32#define DOUBLE_Emin (-1022) /* smallest valid exponent */
34 33
35#define SINGLE_Emax 127 /* largest valid exponent */ 34#define SINGLE_Emax 127 /* largest valid exponent */
36#define SINGLE_Ebias 127 35#define SINGLE_Ebias 127
37#define SINGLE_Emin (-126) /* smallest valid exponent */ 36#define SINGLE_Emin (-126) /* smallest valid exponent */
38
39 37
40static u_char normalize_no_excep(FPU_REG *r, int exp, int sign) 38static u_char normalize_no_excep(FPU_REG *r, int exp, int sign)
41{ 39{
42 u_char tag; 40 u_char tag;
43 41
44 setexponent16(r, exp); 42 setexponent16(r, exp);
45 43
46 tag = FPU_normalize_nuo(r); 44 tag = FPU_normalize_nuo(r);
47 stdexp(r); 45 stdexp(r);
48 if ( sign ) 46 if (sign)
49 setnegative(r); 47 setnegative(r);
50 48
51 return tag; 49 return tag;
52} 50}
53 51
54
55int FPU_tagof(FPU_REG *ptr) 52int FPU_tagof(FPU_REG *ptr)
56{ 53{
57 int exp; 54 int exp;
58 55
59 exp = exponent16(ptr) & 0x7fff; 56 exp = exponent16(ptr) & 0x7fff;
60 if ( exp == 0 ) 57 if (exp == 0) {
61 { 58 if (!(ptr->sigh | ptr->sigl)) {
62 if ( !(ptr->sigh | ptr->sigl) ) 59 return TAG_Zero;
63 { 60 }
64 return TAG_Zero; 61 /* The number is a de-normal or pseudodenormal. */
62 return TAG_Special;
63 }
64
65 if (exp == 0x7fff) {
66 /* Is an Infinity, a NaN, or an unsupported data type. */
67 return TAG_Special;
65 } 68 }
66 /* The number is a de-normal or pseudodenormal. */
67 return TAG_Special;
68 }
69
70 if ( exp == 0x7fff )
71 {
72 /* Is an Infinity, a NaN, or an unsupported data type. */
73 return TAG_Special;
74 }
75
76 if ( !(ptr->sigh & 0x80000000) )
77 {
78 /* Unsupported data type. */
79 /* Valid numbers have the ms bit set to 1. */
80 /* Unnormal. */
81 return TAG_Special;
82 }
83
84 return TAG_Valid;
85}
86 69
70 if (!(ptr->sigh & 0x80000000)) {
71 /* Unsupported data type. */
72 /* Valid numbers have the ms bit set to 1. */
73 /* Unnormal. */
74 return TAG_Special;
75 }
76
77 return TAG_Valid;
78}
87 79
88/* Get a long double from user memory */ 80/* Get a long double from user memory */
89int FPU_load_extended(long double __user *s, int stnr) 81int FPU_load_extended(long double __user *s, int stnr)
90{ 82{
91 FPU_REG *sti_ptr = &st(stnr); 83 FPU_REG *sti_ptr = &st(stnr);
92 84
93 RE_ENTRANT_CHECK_OFF; 85 RE_ENTRANT_CHECK_OFF;
94 FPU_access_ok(VERIFY_READ, s, 10); 86 FPU_access_ok(VERIFY_READ, s, 10);
95 __copy_from_user(sti_ptr, s, 10); 87 __copy_from_user(sti_ptr, s, 10);
96 RE_ENTRANT_CHECK_ON; 88 RE_ENTRANT_CHECK_ON;
97 89
98 return FPU_tagof(sti_ptr); 90 return FPU_tagof(sti_ptr);
99} 91}
100 92
101
102/* Get a double from user memory */ 93/* Get a double from user memory */
103int FPU_load_double(double __user *dfloat, FPU_REG *loaded_data) 94int FPU_load_double(double __user *dfloat, FPU_REG *loaded_data)
104{ 95{
105 int exp, tag, negative; 96 int exp, tag, negative;
106 unsigned m64, l64; 97 unsigned m64, l64;
107 98
108 RE_ENTRANT_CHECK_OFF; 99 RE_ENTRANT_CHECK_OFF;
109 FPU_access_ok(VERIFY_READ, dfloat, 8); 100 FPU_access_ok(VERIFY_READ, dfloat, 8);
110 FPU_get_user(m64, 1 + (unsigned long __user *) dfloat); 101 FPU_get_user(m64, 1 + (unsigned long __user *)dfloat);
111 FPU_get_user(l64, (unsigned long __user *) dfloat); 102 FPU_get_user(l64, (unsigned long __user *)dfloat);
112 RE_ENTRANT_CHECK_ON; 103 RE_ENTRANT_CHECK_ON;
113 104
114 negative = (m64 & 0x80000000) ? SIGN_Negative : SIGN_Positive; 105 negative = (m64 & 0x80000000) ? SIGN_Negative : SIGN_Positive;
115 exp = ((m64 & 0x7ff00000) >> 20) - DOUBLE_Ebias + EXTENDED_Ebias; 106 exp = ((m64 & 0x7ff00000) >> 20) - DOUBLE_Ebias + EXTENDED_Ebias;
116 m64 &= 0xfffff; 107 m64 &= 0xfffff;
117 if ( exp > DOUBLE_Emax + EXTENDED_Ebias ) 108 if (exp > DOUBLE_Emax + EXTENDED_Ebias) {
118 { 109 /* Infinity or NaN */
119 /* Infinity or NaN */ 110 if ((m64 == 0) && (l64 == 0)) {
120 if ((m64 == 0) && (l64 == 0)) 111 /* +- infinity */
121 { 112 loaded_data->sigh = 0x80000000;
122 /* +- infinity */ 113 loaded_data->sigl = 0x00000000;
123 loaded_data->sigh = 0x80000000; 114 exp = EXP_Infinity + EXTENDED_Ebias;
124 loaded_data->sigl = 0x00000000; 115 tag = TAG_Special;
125 exp = EXP_Infinity + EXTENDED_Ebias; 116 } else {
126 tag = TAG_Special; 117 /* Must be a signaling or quiet NaN */
127 } 118 exp = EXP_NaN + EXTENDED_Ebias;
128 else 119 loaded_data->sigh = (m64 << 11) | 0x80000000;
129 { 120 loaded_data->sigh |= l64 >> 21;
130 /* Must be a signaling or quiet NaN */ 121 loaded_data->sigl = l64 << 11;
131 exp = EXP_NaN + EXTENDED_Ebias; 122 tag = TAG_Special; /* The calling function must look for NaNs */
132 loaded_data->sigh = (m64 << 11) | 0x80000000; 123 }
133 loaded_data->sigh |= l64 >> 21; 124 } else if (exp < DOUBLE_Emin + EXTENDED_Ebias) {
134 loaded_data->sigl = l64 << 11; 125 /* Zero or de-normal */
135 tag = TAG_Special; /* The calling function must look for NaNs */ 126 if ((m64 == 0) && (l64 == 0)) {
136 } 127 /* Zero */
137 } 128 reg_copy(&CONST_Z, loaded_data);
138 else if ( exp < DOUBLE_Emin + EXTENDED_Ebias ) 129 exp = 0;
139 { 130 tag = TAG_Zero;
140 /* Zero or de-normal */ 131 } else {
141 if ((m64 == 0) && (l64 == 0)) 132 /* De-normal */
142 { 133 loaded_data->sigh = m64 << 11;
143 /* Zero */ 134 loaded_data->sigh |= l64 >> 21;
144 reg_copy(&CONST_Z, loaded_data); 135 loaded_data->sigl = l64 << 11;
145 exp = 0; 136
146 tag = TAG_Zero; 137 return normalize_no_excep(loaded_data, DOUBLE_Emin,
147 } 138 negative)
148 else 139 | (denormal_operand() < 0 ? FPU_Exception : 0);
149 { 140 }
150 /* De-normal */ 141 } else {
151 loaded_data->sigh = m64 << 11; 142 loaded_data->sigh = (m64 << 11) | 0x80000000;
152 loaded_data->sigh |= l64 >> 21; 143 loaded_data->sigh |= l64 >> 21;
153 loaded_data->sigl = l64 << 11; 144 loaded_data->sigl = l64 << 11;
154
155 return normalize_no_excep(loaded_data, DOUBLE_Emin, negative)
156 | (denormal_operand() < 0 ? FPU_Exception : 0);
157 }
158 }
159 else
160 {
161 loaded_data->sigh = (m64 << 11) | 0x80000000;
162 loaded_data->sigh |= l64 >> 21;
163 loaded_data->sigl = l64 << 11;
164 145
165 tag = TAG_Valid; 146 tag = TAG_Valid;
166 } 147 }
167 148
168 setexponent16(loaded_data, exp | negative); 149 setexponent16(loaded_data, exp | negative);
169 150
170 return tag; 151 return tag;
171} 152}
172 153
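FPU_load_double() re-biases the 11-bit binary64 exponent (bias 1023) to the 15-bit extended bias (16383) and shifts the 52-bit fraction up by 11 so the implicit integer bit becomes explicit at the top of sigh. A sketch of the same field extraction for a normal host double (assumes IEEE-754 binary64 layout; the names are illustrative, not kernel identifiers):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define DOUBLE_EBIAS	1023
#define EXTENDED_EBIAS	16383

int main(void)
{
	double d = 1.5;
	uint64_t bits;
	memcpy(&bits, &d, sizeof(bits));	/* reinterpret the bit pattern */

	unsigned sign     = (unsigned)(bits >> 63);
	int      exp      = (int)((bits >> 52) & 0x7ff);
	uint64_t fraction = bits & 0x000fffffffffffffull;

	/* Normal numbers only: make the integer bit explicit and re-bias. */
	uint64_t sig64 = (fraction << 11) | 0x8000000000000000ull;
	int      exp16 = exp - DOUBLE_EBIAS + EXTENDED_EBIAS;

	printf("sign=%u exp16=%d sig=%#llx\n",
	       sign, exp16, (unsigned long long)sig64);
	return 0;
}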
173
174/* Get a float from user memory */ 154/* Get a float from user memory */
175int FPU_load_single(float __user *single, FPU_REG *loaded_data) 155int FPU_load_single(float __user *single, FPU_REG *loaded_data)
176{ 156{
177 unsigned m32; 157 unsigned m32;
178 int exp, tag, negative; 158 int exp, tag, negative;
179 159
180 RE_ENTRANT_CHECK_OFF; 160 RE_ENTRANT_CHECK_OFF;
181 FPU_access_ok(VERIFY_READ, single, 4); 161 FPU_access_ok(VERIFY_READ, single, 4);
182 FPU_get_user(m32, (unsigned long __user *) single); 162 FPU_get_user(m32, (unsigned long __user *)single);
183 RE_ENTRANT_CHECK_ON; 163 RE_ENTRANT_CHECK_ON;
184 164
185 negative = (m32 & 0x80000000) ? SIGN_Negative : SIGN_Positive; 165 negative = (m32 & 0x80000000) ? SIGN_Negative : SIGN_Positive;
186 166
187 if (!(m32 & 0x7fffffff)) 167 if (!(m32 & 0x7fffffff)) {
188 { 168 /* Zero */
189 /* Zero */ 169 reg_copy(&CONST_Z, loaded_data);
190 reg_copy(&CONST_Z, loaded_data); 170 addexponent(loaded_data, negative);
191 addexponent(loaded_data, negative); 171 return TAG_Zero;
192 return TAG_Zero;
193 }
194 exp = ((m32 & 0x7f800000) >> 23) - SINGLE_Ebias + EXTENDED_Ebias;
195 m32 = (m32 & 0x7fffff) << 8;
196 if ( exp < SINGLE_Emin + EXTENDED_Ebias )
197 {
198 /* De-normals */
199 loaded_data->sigh = m32;
200 loaded_data->sigl = 0;
201
202 return normalize_no_excep(loaded_data, SINGLE_Emin, negative)
203 | (denormal_operand() < 0 ? FPU_Exception : 0);
204 }
205 else if ( exp > SINGLE_Emax + EXTENDED_Ebias )
206 {
207 /* Infinity or NaN */
208 if ( m32 == 0 )
209 {
210 /* +- infinity */
211 loaded_data->sigh = 0x80000000;
212 loaded_data->sigl = 0x00000000;
213 exp = EXP_Infinity + EXTENDED_Ebias;
214 tag = TAG_Special;
215 } 172 }
216 else 173 exp = ((m32 & 0x7f800000) >> 23) - SINGLE_Ebias + EXTENDED_Ebias;
217 { 174 m32 = (m32 & 0x7fffff) << 8;
218 /* Must be a signaling or quiet NaN */ 175 if (exp < SINGLE_Emin + EXTENDED_Ebias) {
219 exp = EXP_NaN + EXTENDED_Ebias; 176 /* De-normals */
220 loaded_data->sigh = m32 | 0x80000000; 177 loaded_data->sigh = m32;
221 loaded_data->sigl = 0; 178 loaded_data->sigl = 0;
222 tag = TAG_Special; /* The calling function must look for NaNs */ 179
180 return normalize_no_excep(loaded_data, SINGLE_Emin, negative)
181 | (denormal_operand() < 0 ? FPU_Exception : 0);
182 } else if (exp > SINGLE_Emax + EXTENDED_Ebias) {
183 /* Infinity or NaN */
184 if (m32 == 0) {
185 /* +- infinity */
186 loaded_data->sigh = 0x80000000;
187 loaded_data->sigl = 0x00000000;
188 exp = EXP_Infinity + EXTENDED_Ebias;
189 tag = TAG_Special;
190 } else {
191 /* Must be a signaling or quiet NaN */
192 exp = EXP_NaN + EXTENDED_Ebias;
193 loaded_data->sigh = m32 | 0x80000000;
194 loaded_data->sigl = 0;
195 tag = TAG_Special; /* The calling function must look for NaNs */
196 }
197 } else {
198 loaded_data->sigh = m32 | 0x80000000;
199 loaded_data->sigl = 0;
200 tag = TAG_Valid;
223 } 201 }
224 }
225 else
226 {
227 loaded_data->sigh = m32 | 0x80000000;
228 loaded_data->sigl = 0;
229 tag = TAG_Valid;
230 }
231 202
232 setexponent16(loaded_data, exp | negative); /* Set the sign. */ 203 setexponent16(loaded_data, exp | negative); /* Set the sign. */
233 204
234 return tag; 205 return tag;
235} 206}
236 207
237
238/* Get a long long from user memory */ 208/* Get a long long from user memory */
239int FPU_load_int64(long long __user *_s) 209int FPU_load_int64(long long __user *_s)
240{ 210{
241 long long s; 211 long long s;
242 int sign; 212 int sign;
243 FPU_REG *st0_ptr = &st(0); 213 FPU_REG *st0_ptr = &st(0);
244 214
245 RE_ENTRANT_CHECK_OFF; 215 RE_ENTRANT_CHECK_OFF;
246 FPU_access_ok(VERIFY_READ, _s, 8); 216 FPU_access_ok(VERIFY_READ, _s, 8);
247 if (copy_from_user(&s,_s,8)) 217 if (copy_from_user(&s, _s, 8))
248 FPU_abort; 218 FPU_abort;
249 RE_ENTRANT_CHECK_ON; 219 RE_ENTRANT_CHECK_ON;
250 220
251 if (s == 0) 221 if (s == 0) {
252 { 222 reg_copy(&CONST_Z, st0_ptr);
253 reg_copy(&CONST_Z, st0_ptr); 223 return TAG_Zero;
254 return TAG_Zero; 224 }
255 } 225
256 226 if (s > 0)
257 if (s > 0) 227 sign = SIGN_Positive;
258 sign = SIGN_Positive; 228 else {
259 else 229 s = -s;
260 { 230 sign = SIGN_Negative;
261 s = -s; 231 }
262 sign = SIGN_Negative;
263 }
264
265 significand(st0_ptr) = s;
266
267 return normalize_no_excep(st0_ptr, 63, sign);
268}
269 232
233 significand(st0_ptr) = s;
234
235 return normalize_no_excep(st0_ptr, 63, sign);
236}
270 237
271/* Get a long from user memory */ 238/* Get a long from user memory */
272int FPU_load_int32(long __user *_s, FPU_REG *loaded_data) 239int FPU_load_int32(long __user *_s, FPU_REG *loaded_data)
273{ 240{
274 long s; 241 long s;
275 int negative; 242 int negative;
276 243
277 RE_ENTRANT_CHECK_OFF; 244 RE_ENTRANT_CHECK_OFF;
278 FPU_access_ok(VERIFY_READ, _s, 4); 245 FPU_access_ok(VERIFY_READ, _s, 4);
279 FPU_get_user(s, _s); 246 FPU_get_user(s, _s);
280 RE_ENTRANT_CHECK_ON; 247 RE_ENTRANT_CHECK_ON;
281 248
282 if (s == 0) 249 if (s == 0) {
283 { reg_copy(&CONST_Z, loaded_data); return TAG_Zero; } 250 reg_copy(&CONST_Z, loaded_data);
251 return TAG_Zero;
252 }
284 253
285 if (s > 0) 254 if (s > 0)
286 negative = SIGN_Positive; 255 negative = SIGN_Positive;
287 else 256 else {
288 { 257 s = -s;
289 s = -s; 258 negative = SIGN_Negative;
290 negative = SIGN_Negative; 259 }
291 }
292 260
293 loaded_data->sigh = s; 261 loaded_data->sigh = s;
294 loaded_data->sigl = 0; 262 loaded_data->sigl = 0;
295 263
296 return normalize_no_excep(loaded_data, 31, negative); 264 return normalize_no_excep(loaded_data, 31, negative);
297} 265}
298 266
299
300/* Get a short from user memory */ 267/* Get a short from user memory */
301int FPU_load_int16(short __user *_s, FPU_REG *loaded_data) 268int FPU_load_int16(short __user *_s, FPU_REG *loaded_data)
302{ 269{
303 int s, negative; 270 int s, negative;
304 271
305 RE_ENTRANT_CHECK_OFF; 272 RE_ENTRANT_CHECK_OFF;
306 FPU_access_ok(VERIFY_READ, _s, 2); 273 FPU_access_ok(VERIFY_READ, _s, 2);
307 /* Cast as short to get the sign extended. */ 274 /* Cast as short to get the sign extended. */
308 FPU_get_user(s, _s); 275 FPU_get_user(s, _s);
309 RE_ENTRANT_CHECK_ON; 276 RE_ENTRANT_CHECK_ON;
310 277
311 if (s == 0) 278 if (s == 0) {
312 { reg_copy(&CONST_Z, loaded_data); return TAG_Zero; } 279 reg_copy(&CONST_Z, loaded_data);
280 return TAG_Zero;
281 }
313 282
314 if (s > 0) 283 if (s > 0)
315 negative = SIGN_Positive; 284 negative = SIGN_Positive;
316 else 285 else {
317 { 286 s = -s;
318 s = -s; 287 negative = SIGN_Negative;
319 negative = SIGN_Negative; 288 }
320 }
321 289
322 loaded_data->sigh = s << 16; 290 loaded_data->sigh = s << 16;
323 loaded_data->sigl = 0; 291 loaded_data->sigl = 0;
324 292
325 return normalize_no_excep(loaded_data, 15, negative); 293 return normalize_no_excep(loaded_data, 15, negative);
326} 294}
327 295
328
329/* Get a packed bcd array from user memory */ 296/* Get a packed bcd array from user memory */
330int FPU_load_bcd(u_char __user *s) 297int FPU_load_bcd(u_char __user *s)
331{ 298{
332 FPU_REG *st0_ptr = &st(0); 299 FPU_REG *st0_ptr = &st(0);
333 int pos; 300 int pos;
334 u_char bcd; 301 u_char bcd;
335 long long l=0; 302 long long l = 0;
336 int sign; 303 int sign;
337 304
338 RE_ENTRANT_CHECK_OFF; 305 RE_ENTRANT_CHECK_OFF;
339 FPU_access_ok(VERIFY_READ, s, 10); 306 FPU_access_ok(VERIFY_READ, s, 10);
340 RE_ENTRANT_CHECK_ON; 307 RE_ENTRANT_CHECK_ON;
341 for ( pos = 8; pos >= 0; pos--) 308 for (pos = 8; pos >= 0; pos--) {
342 { 309 l *= 10;
343 l *= 10; 310 RE_ENTRANT_CHECK_OFF;
344 RE_ENTRANT_CHECK_OFF; 311 FPU_get_user(bcd, s + pos);
345 FPU_get_user(bcd, s+pos); 312 RE_ENTRANT_CHECK_ON;
346 RE_ENTRANT_CHECK_ON; 313 l += bcd >> 4;
347 l += bcd >> 4; 314 l *= 10;
348 l *= 10; 315 l += bcd & 0x0f;
349 l += bcd & 0x0f; 316 }
350 } 317
351 318 RE_ENTRANT_CHECK_OFF;
352 RE_ENTRANT_CHECK_OFF; 319 FPU_get_user(sign, s + 9);
353 FPU_get_user(sign, s+9); 320 sign = sign & 0x80 ? SIGN_Negative : SIGN_Positive;
354 sign = sign & 0x80 ? SIGN_Negative : SIGN_Positive; 321 RE_ENTRANT_CHECK_ON;
355 RE_ENTRANT_CHECK_ON; 322
356 323 if (l == 0) {
357 if ( l == 0 ) 324 reg_copy(&CONST_Z, st0_ptr);
358 { 325 addexponent(st0_ptr, sign); /* Set the sign. */
359 reg_copy(&CONST_Z, st0_ptr); 326 return TAG_Zero;
360 addexponent(st0_ptr, sign); /* Set the sign. */ 327 } else {
361 return TAG_Zero; 328 significand(st0_ptr) = l;
362 } 329 return normalize_no_excep(st0_ptr, 63, sign);
363 else 330 }
364 {
365 significand(st0_ptr) = l;
366 return normalize_no_excep(st0_ptr, 63, sign);
367 }
368} 331}
369 332
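FPU_load_bcd() accumulates two decimal digits per byte, walking from the most-significant byte down, and then takes the sign from bit 7 of byte 9. A standalone sketch of the same decode (hypothetical helper, ordinary user-space C, no validation of digits above 9):

#include <stdint.h>

/* Decode an x87 packed-BCD (10-byte) operand into a signed 64-bit value.
 * Bytes 0..8 hold 18 BCD digits, least-significant byte first; bit 7 of
 * byte 9 is the sign.  18 decimal digits always fit in a long long. */
static long long bcd_to_ll(const uint8_t bcd[10])
{
	long long val = 0;

	for (int pos = 8; pos >= 0; pos--) {
		val = val * 10 + (bcd[pos] >> 4);	/* high digit */
		val = val * 10 + (bcd[pos] & 0x0f);	/* low digit  */
	}
	return (bcd[9] & 0x80) ? -val : val;
}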
370/*===========================================================================*/ 333/*===========================================================================*/
371 334
372/* Put a long double into user memory */ 335/* Put a long double into user memory */
373int FPU_store_extended(FPU_REG *st0_ptr, u_char st0_tag, long double __user *d) 336int FPU_store_extended(FPU_REG *st0_ptr, u_char st0_tag,
337 long double __user * d)
374{ 338{
375 /* 339 /*
376 The only exception raised by an attempt to store to an 340 The only exception raised by an attempt to store to an
377 extended format is the Invalid Stack exception, i.e. 341 extended format is the Invalid Stack exception, i.e.
378 attempting to store from an empty register. 342 attempting to store from an empty register.
379 */ 343 */
380 344
381 if ( st0_tag != TAG_Empty ) 345 if (st0_tag != TAG_Empty) {
382 { 346 RE_ENTRANT_CHECK_OFF;
383 RE_ENTRANT_CHECK_OFF; 347 FPU_access_ok(VERIFY_WRITE, d, 10);
384 FPU_access_ok(VERIFY_WRITE, d, 10); 348
385 349 FPU_put_user(st0_ptr->sigl, (unsigned long __user *)d);
386 FPU_put_user(st0_ptr->sigl, (unsigned long __user *) d); 350 FPU_put_user(st0_ptr->sigh,
387 FPU_put_user(st0_ptr->sigh, (unsigned long __user *) ((u_char __user *)d + 4)); 351 (unsigned long __user *)((u_char __user *) d + 4));
388 FPU_put_user(exponent16(st0_ptr), (unsigned short __user *) ((u_char __user *)d + 8)); 352 FPU_put_user(exponent16(st0_ptr),
389 RE_ENTRANT_CHECK_ON; 353 (unsigned short __user *)((u_char __user *) d +
390 354 8));
391 return 1; 355 RE_ENTRANT_CHECK_ON;
392 } 356
393 357 return 1;
394 /* Empty register (stack underflow) */ 358 }
395 EXCEPTION(EX_StackUnder);
396 if ( control_word & CW_Invalid )
397 {
398 /* The masked response */
399 /* Put out the QNaN indefinite */
400 RE_ENTRANT_CHECK_OFF;
401 FPU_access_ok(VERIFY_WRITE,d,10);
402 FPU_put_user(0, (unsigned long __user *) d);
403 FPU_put_user(0xc0000000, 1 + (unsigned long __user *) d);
404 FPU_put_user(0xffff, 4 + (short __user *) d);
405 RE_ENTRANT_CHECK_ON;
406 return 1;
407 }
408 else
409 return 0;
410 359
411} 360 /* Empty register (stack underflow) */
361 EXCEPTION(EX_StackUnder);
362 if (control_word & CW_Invalid) {
363 /* The masked response */
364 /* Put out the QNaN indefinite */
365 RE_ENTRANT_CHECK_OFF;
366 FPU_access_ok(VERIFY_WRITE, d, 10);
367 FPU_put_user(0, (unsigned long __user *)d);
368 FPU_put_user(0xc0000000, 1 + (unsigned long __user *)d);
369 FPU_put_user(0xffff, 4 + (short __user *)d);
370 RE_ENTRANT_CHECK_ON;
371 return 1;
372 } else
373 return 0;
412 374
375}
413 376
414/* Put a double into user memory */ 377/* Put a double into user memory */
415int FPU_store_double(FPU_REG *st0_ptr, u_char st0_tag, double __user *dfloat) 378int FPU_store_double(FPU_REG *st0_ptr, u_char st0_tag, double __user *dfloat)
416{ 379{
417 unsigned long l[2]; 380 unsigned long l[2];
418 unsigned long increment = 0; /* avoid gcc warnings */ 381 unsigned long increment = 0; /* avoid gcc warnings */
419 int precision_loss; 382 int precision_loss;
420 int exp; 383 int exp;
421 FPU_REG tmp; 384 FPU_REG tmp;
422 385
423 if ( st0_tag == TAG_Valid ) 386 if (st0_tag == TAG_Valid) {
424 { 387 reg_copy(st0_ptr, &tmp);
425 reg_copy(st0_ptr, &tmp); 388 exp = exponent(&tmp);
426 exp = exponent(&tmp);
427 389
428 if ( exp < DOUBLE_Emin ) /* It may be a denormal */ 390 if (exp < DOUBLE_Emin) { /* It may be a denormal */
429 { 391 addexponent(&tmp, -DOUBLE_Emin + 52); /* largest exp to be 51 */
430 addexponent(&tmp, -DOUBLE_Emin + 52); /* largest exp to be 51 */
431 392
432 denormal_arg: 393 denormal_arg:
433 394
434 if ( (precision_loss = FPU_round_to_int(&tmp, st0_tag)) ) 395 if ((precision_loss = FPU_round_to_int(&tmp, st0_tag))) {
435 {
436#ifdef PECULIAR_486 396#ifdef PECULIAR_486
437 /* Did it round to a non-denormal ? */ 397 /* Did it round to a non-denormal ? */
438 /* This behaviour might be regarded as peculiar, it appears 398 /* This behaviour might be regarded as peculiar, it appears
439 that the 80486 rounds to the dest precision, then 399 that the 80486 rounds to the dest precision, then
440 converts to decide underflow. */ 400 converts to decide underflow. */
441 if ( !((tmp.sigh == 0x00100000) && (tmp.sigl == 0) && 401 if (!
442 (st0_ptr->sigl & 0x000007ff)) ) 402 ((tmp.sigh == 0x00100000) && (tmp.sigl == 0)
403 && (st0_ptr->sigl & 0x000007ff)))
443#endif /* PECULIAR_486 */ 404#endif /* PECULIAR_486 */
444 { 405 {
445 EXCEPTION(EX_Underflow); 406 EXCEPTION(EX_Underflow);
446 /* This is a special case: see sec 16.2.5.1 of 407 /* This is a special case: see sec 16.2.5.1 of
447 the 80486 book */ 408 the 80486 book */
448 if ( !(control_word & CW_Underflow) ) 409 if (!(control_word & CW_Underflow))
449 return 0; 410 return 0;
450 } 411 }
451 EXCEPTION(precision_loss); 412 EXCEPTION(precision_loss);
452 if ( !(control_word & CW_Precision) ) 413 if (!(control_word & CW_Precision))
453 return 0; 414 return 0;
454 }
455 l[0] = tmp.sigl;
456 l[1] = tmp.sigh;
457 }
458 else
459 {
460 if ( tmp.sigl & 0x000007ff )
461 {
462 precision_loss = 1;
463 switch (control_word & CW_RC)
464 {
465 case RC_RND:
466 /* Rounding can get a little messy.. */
467 increment = ((tmp.sigl & 0x7ff) > 0x400) | /* nearest */
468 ((tmp.sigl & 0xc00) == 0xc00); /* odd -> even */
469 break;
470 case RC_DOWN: /* towards -infinity */
471 increment = signpositive(&tmp) ? 0 : tmp.sigl & 0x7ff;
472 break;
473 case RC_UP: /* towards +infinity */
474 increment = signpositive(&tmp) ? tmp.sigl & 0x7ff : 0;
475 break;
476 case RC_CHOP:
477 increment = 0;
478 break;
479 }
480
481 /* Truncate the mantissa */
482 tmp.sigl &= 0xfffff800;
483
484 if ( increment )
485 {
486 if ( tmp.sigl >= 0xfffff800 )
487 {
488 /* the sigl part overflows */
489 if ( tmp.sigh == 0xffffffff )
490 {
491 /* The sigh part overflows */
492 tmp.sigh = 0x80000000;
493 exp++;
494 if (exp >= EXP_OVER)
495 goto overflow;
496 } 415 }
497 else 416 l[0] = tmp.sigl;
498 { 417 l[1] = tmp.sigh;
499 tmp.sigh ++; 418 } else {
419 if (tmp.sigl & 0x000007ff) {
420 precision_loss = 1;
421 switch (control_word & CW_RC) {
422 case RC_RND:
423 /* Rounding can get a little messy.. */
424 increment = ((tmp.sigl & 0x7ff) > 0x400) | /* nearest */
425 ((tmp.sigl & 0xc00) == 0xc00); /* odd -> even */
426 break;
427 case RC_DOWN: /* towards -infinity */
 428 increment =
 429 signpositive(&tmp)
 430 ? 0 : tmp.sigl & 0x7ff;
431 break;
432 case RC_UP: /* towards +infinity */
 433 increment =
 434 signpositive(&tmp)
 435 ? tmp.sigl & 0x7ff : 0;
436 break;
437 case RC_CHOP:
438 increment = 0;
439 break;
440 }
441
442 /* Truncate the mantissa */
443 tmp.sigl &= 0xfffff800;
444
445 if (increment) {
446 if (tmp.sigl >= 0xfffff800) {
447 /* the sigl part overflows */
448 if (tmp.sigh == 0xffffffff) {
449 /* The sigh part overflows */
450 tmp.sigh = 0x80000000;
451 exp++;
452 if (exp >= EXP_OVER)
453 goto overflow;
454 } else {
455 tmp.sigh++;
456 }
457 tmp.sigl = 0x00000000;
458 } else {
459 /* We only need to increment sigl */
460 tmp.sigl += 0x00000800;
461 }
462 }
463 } else
464 precision_loss = 0;
465
466 l[0] = (tmp.sigl >> 11) | (tmp.sigh << 21);
467 l[1] = ((tmp.sigh >> 11) & 0xfffff);
468
469 if (exp > DOUBLE_Emax) {
470 overflow:
471 EXCEPTION(EX_Overflow);
472 if (!(control_word & CW_Overflow))
473 return 0;
474 set_precision_flag_up();
475 if (!(control_word & CW_Precision))
476 return 0;
477
478 /* This is a special case: see sec 16.2.5.1 of the 80486 book */
479 /* Overflow to infinity */
480 l[0] = 0x00000000; /* Set to */
481 l[1] = 0x7ff00000; /* + INF */
482 } else {
483 if (precision_loss) {
484 if (increment)
485 set_precision_flag_up();
486 else
487 set_precision_flag_down();
488 }
489 /* Add the exponent */
490 l[1] |= (((exp + DOUBLE_Ebias) & 0x7ff) << 20);
500 } 491 }
501 tmp.sigl = 0x00000000;
502 }
503 else
504 {
505 /* We only need to increment sigl */
506 tmp.sigl += 0x00000800;
507 }
508 }
509 }
510 else
511 precision_loss = 0;
512
513 l[0] = (tmp.sigl >> 11) | (tmp.sigh << 21);
514 l[1] = ((tmp.sigh >> 11) & 0xfffff);
515
516 if ( exp > DOUBLE_Emax )
517 {
518 overflow:
519 EXCEPTION(EX_Overflow);
520 if ( !(control_word & CW_Overflow) )
521 return 0;
522 set_precision_flag_up();
523 if ( !(control_word & CW_Precision) )
524 return 0;
525
526 /* This is a special case: see sec 16.2.5.1 of the 80486 book */
527 /* Overflow to infinity */
528 l[0] = 0x00000000; /* Set to */
529 l[1] = 0x7ff00000; /* + INF */
530 }
531 else
532 {
533 if ( precision_loss )
534 {
535 if ( increment )
536 set_precision_flag_up();
537 else
538 set_precision_flag_down();
539 } 492 }
540 /* Add the exponent */ 493 } else if (st0_tag == TAG_Zero) {
541 l[1] |= (((exp+DOUBLE_Ebias) & 0x7ff) << 20); 494 /* Number is zero */
542 } 495 l[0] = 0;
543 } 496 l[1] = 0;
544 } 497 } else if (st0_tag == TAG_Special) {
545 else if (st0_tag == TAG_Zero) 498 st0_tag = FPU_Special(st0_ptr);
546 { 499 if (st0_tag == TW_Denormal) {
547 /* Number is zero */ 500 /* A denormal will always underflow. */
548 l[0] = 0;
549 l[1] = 0;
550 }
551 else if ( st0_tag == TAG_Special )
552 {
553 st0_tag = FPU_Special(st0_ptr);
554 if ( st0_tag == TW_Denormal )
555 {
556 /* A denormal will always underflow. */
557#ifndef PECULIAR_486 501#ifndef PECULIAR_486
558 /* An 80486 is supposed to be able to generate 502 /* An 80486 is supposed to be able to generate
559 a denormal exception here, but... */ 503 a denormal exception here, but... */
560 /* Underflow has priority. */ 504 /* Underflow has priority. */
561 if ( control_word & CW_Underflow ) 505 if (control_word & CW_Underflow)
562 denormal_operand(); 506 denormal_operand();
563#endif /* PECULIAR_486 */ 507#endif /* PECULIAR_486 */
564 reg_copy(st0_ptr, &tmp); 508 reg_copy(st0_ptr, &tmp);
565 goto denormal_arg; 509 goto denormal_arg;
566 } 510 } else if (st0_tag == TW_Infinity) {
567 else if (st0_tag == TW_Infinity) 511 l[0] = 0;
568 { 512 l[1] = 0x7ff00000;
569 l[0] = 0; 513 } else if (st0_tag == TW_NaN) {
570 l[1] = 0x7ff00000; 514 /* Is it really a NaN ? */
571 } 515 if ((exponent(st0_ptr) == EXP_OVER)
572 else if (st0_tag == TW_NaN) 516 && (st0_ptr->sigh & 0x80000000)) {
573 { 517 /* See if we can get a valid NaN from the FPU_REG */
 574 /* Is it really a NaN ? */ 518 l[0] =
 575 if ( (exponent(st0_ptr) == EXP_OVER) 519 (st0_ptr->sigl >> 11)
 576 && (st0_ptr->sigh & 0x80000000) ) 520 | (st0_ptr->sigh << 21);
577 { 521 l[1] = ((st0_ptr->sigh >> 11) & 0xfffff);
578 /* See if we can get a valid NaN from the FPU_REG */ 522 if (!(st0_ptr->sigh & 0x40000000)) {
579 l[0] = (st0_ptr->sigl >> 11) | (st0_ptr->sigh << 21); 523 /* It is a signalling NaN */
580 l[1] = ((st0_ptr->sigh >> 11) & 0xfffff); 524 EXCEPTION(EX_Invalid);
581 if ( !(st0_ptr->sigh & 0x40000000) ) 525 if (!(control_word & CW_Invalid))
582 { 526 return 0;
583 /* It is a signalling NaN */ 527 l[1] |= (0x40000000 >> 11);
584 EXCEPTION(EX_Invalid); 528 }
585 if ( !(control_word & CW_Invalid) ) 529 l[1] |= 0x7ff00000;
586 return 0; 530 } else {
587 l[1] |= (0x40000000 >> 11); 531 /* It is an unsupported data type */
532 EXCEPTION(EX_Invalid);
533 if (!(control_word & CW_Invalid))
534 return 0;
535 l[0] = 0;
536 l[1] = 0xfff80000;
537 }
588 } 538 }
589 l[1] |= 0x7ff00000; 539 } else if (st0_tag == TAG_Empty) {
590 } 540 /* Empty register (stack underflow) */
591 else 541 EXCEPTION(EX_StackUnder);
592 { 542 if (control_word & CW_Invalid) {
593 /* It is an unsupported data type */ 543 /* The masked response */
594 EXCEPTION(EX_Invalid); 544 /* Put out the QNaN indefinite */
595 if ( !(control_word & CW_Invalid) ) 545 RE_ENTRANT_CHECK_OFF;
596 return 0; 546 FPU_access_ok(VERIFY_WRITE, dfloat, 8);
597 l[0] = 0; 547 FPU_put_user(0, (unsigned long __user *)dfloat);
598 l[1] = 0xfff80000; 548 FPU_put_user(0xfff80000,
599 } 549 1 + (unsigned long __user *)dfloat);
550 RE_ENTRANT_CHECK_ON;
551 return 1;
552 } else
553 return 0;
600 } 554 }
601 } 555 if (getsign(st0_ptr))
602 else if ( st0_tag == TAG_Empty ) 556 l[1] |= 0x80000000;
603 {
604 /* Empty register (stack underflow) */
605 EXCEPTION(EX_StackUnder);
606 if ( control_word & CW_Invalid )
607 {
608 /* The masked response */
609 /* Put out the QNaN indefinite */
610 RE_ENTRANT_CHECK_OFF;
611 FPU_access_ok(VERIFY_WRITE,dfloat,8);
612 FPU_put_user(0, (unsigned long __user *) dfloat);
613 FPU_put_user(0xfff80000, 1 + (unsigned long __user *) dfloat);
614 RE_ENTRANT_CHECK_ON;
615 return 1;
616 }
617 else
618 return 0;
619 }
620 if ( getsign(st0_ptr) )
621 l[1] |= 0x80000000;
622
623 RE_ENTRANT_CHECK_OFF;
624 FPU_access_ok(VERIFY_WRITE,dfloat,8);
625 FPU_put_user(l[0], (unsigned long __user *)dfloat);
626 FPU_put_user(l[1], 1 + (unsigned long __user *)dfloat);
627 RE_ENTRANT_CHECK_ON;
628
629 return 1;
630}
631 557
558 RE_ENTRANT_CHECK_OFF;
559 FPU_access_ok(VERIFY_WRITE, dfloat, 8);
560 FPU_put_user(l[0], (unsigned long __user *)dfloat);
561 FPU_put_user(l[1], 1 + (unsigned long __user *)dfloat);
562 RE_ENTRANT_CHECK_ON;
563
564 return 1;
565}
632 566
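FPU_store_double() above packs the emulator's 64-bit significand and a biased exponent into the two 32-bit words l[0]/l[1]. The stand-alone sketch below shows only that bit layout for the normal case; the helper name and test value are made up, a little-endian host is assumed for the memcpy check, and rounding, denormals and the special-tag paths of the real code are omitted.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

static void pack_double(int sign, int exp, uint32_t sigh, uint32_t sigl,
			uint32_t l[2])
{
	l[0] = (sigl >> 11) | (sigh << 21);		/* low 32 mantissa bits */
	l[1] = (sigh >> 11) & 0xfffff;			/* high 20 mantissa bits */
	l[1] |= ((uint32_t)(exp + 1023) & 0x7ff) << 20;	/* biased exponent */
	if (sign)
		l[1] |= 0x80000000;			/* sign bit */
}

int main(void)
{
	uint32_t l[2];
	double d;

	pack_double(0, 0, 0xc0000000, 0, l);	/* 1.5: significand 1.1b, exp 0 */
	memcpy(&d, l, sizeof(d));		/* little-endian host assumed */
	printf("%f\n", d);			/* prints 1.500000 */
	return 0;
}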
633/* Put a float into user memory */ 567/* Put a float into user memory */
634int FPU_store_single(FPU_REG *st0_ptr, u_char st0_tag, float __user *single) 568int FPU_store_single(FPU_REG *st0_ptr, u_char st0_tag, float __user *single)
635{ 569{
636 long templ = 0; 570 long templ = 0;
637 unsigned long increment = 0; /* avoid gcc warnings */ 571 unsigned long increment = 0; /* avoid gcc warnings */
638 int precision_loss; 572 int precision_loss;
639 int exp; 573 int exp;
640 FPU_REG tmp; 574 FPU_REG tmp;
641 575
642 if ( st0_tag == TAG_Valid ) 576 if (st0_tag == TAG_Valid) {
643 {
644 577
645 reg_copy(st0_ptr, &tmp); 578 reg_copy(st0_ptr, &tmp);
646 exp = exponent(&tmp); 579 exp = exponent(&tmp);
647 580
648 if ( exp < SINGLE_Emin ) 581 if (exp < SINGLE_Emin) {
649 { 582 addexponent(&tmp, -SINGLE_Emin + 23); /* largest exp to be 22 */
650 addexponent(&tmp, -SINGLE_Emin + 23); /* largest exp to be 22 */
651 583
652 denormal_arg: 584 denormal_arg:
653 585
654 if ( (precision_loss = FPU_round_to_int(&tmp, st0_tag)) ) 586 if ((precision_loss = FPU_round_to_int(&tmp, st0_tag))) {
655 {
656#ifdef PECULIAR_486 587#ifdef PECULIAR_486
657 /* Did it round to a non-denormal ? */ 588 /* Did it round to a non-denormal ? */
658 /* This behaviour might be regarded as peculiar, it appears 589 /* This behaviour might be regarded as peculiar, it appears
659 that the 80486 rounds to the dest precision, then 590 that the 80486 rounds to the dest precision, then
660 converts to decide underflow. */ 591 converts to decide underflow. */
661 if ( !((tmp.sigl == 0x00800000) && 592 if (!((tmp.sigl == 0x00800000) &&
662 ((st0_ptr->sigh & 0x000000ff) || st0_ptr->sigl)) ) 593 ((st0_ptr->sigh & 0x000000ff)
594 || st0_ptr->sigl)))
663#endif /* PECULIAR_486 */ 595#endif /* PECULIAR_486 */
664 { 596 {
665 EXCEPTION(EX_Underflow); 597 EXCEPTION(EX_Underflow);
666 /* This is a special case: see sec 16.2.5.1 of 598 /* This is a special case: see sec 16.2.5.1 of
667 the 80486 book */ 599 the 80486 book */
668 if ( !(control_word & CW_Underflow) ) 600 if (!(control_word & CW_Underflow))
669 return 0; 601 return 0;
670 } 602 }
671 EXCEPTION(precision_loss); 603 EXCEPTION(precision_loss);
672 if ( !(control_word & CW_Precision) ) 604 if (!(control_word & CW_Precision))
673 return 0; 605 return 0;
674 } 606 }
675 templ = tmp.sigl; 607 templ = tmp.sigl;
676 } 608 } else {
677 else 609 if (tmp.sigl | (tmp.sigh & 0x000000ff)) {
678 { 610 unsigned long sigh = tmp.sigh;
679 if ( tmp.sigl | (tmp.sigh & 0x000000ff) ) 611 unsigned long sigl = tmp.sigl;
680 { 612
681 unsigned long sigh = tmp.sigh; 613 precision_loss = 1;
682 unsigned long sigl = tmp.sigl; 614 switch (control_word & CW_RC) {
683 615 case RC_RND:
684 precision_loss = 1; 616 increment = ((sigh & 0xff) > 0x80) /* more than half */
685 switch (control_word & CW_RC) 617 ||(((sigh & 0xff) == 0x80) && sigl) /* more than half */
686 { 618 ||((sigh & 0x180) == 0x180); /* round to even */
687 case RC_RND: 619 break;
688 increment = ((sigh & 0xff) > 0x80) /* more than half */ 620 case RC_DOWN: /* towards -infinity */
689 || (((sigh & 0xff) == 0x80) && sigl) /* more than half */ 621 increment = signpositive(&tmp)
690 || ((sigh & 0x180) == 0x180); /* round to even */ 622 ? 0 : (sigl | (sigh & 0xff));
691 break; 623 break;
692 case RC_DOWN: /* towards -infinity */ 624 case RC_UP: /* towards +infinity */
693 increment = signpositive(&tmp) 625 increment = signpositive(&tmp)
694 ? 0 : (sigl | (sigh & 0xff)); 626 ? (sigl | (sigh & 0xff)) : 0;
695 break; 627 break;
696 case RC_UP: /* towards +infinity */ 628 case RC_CHOP:
697 increment = signpositive(&tmp) 629 increment = 0;
698 ? (sigl | (sigh & 0xff)) : 0; 630 break;
699 break; 631 }
700 case RC_CHOP: 632
701 increment = 0; 633 /* Truncate part of the mantissa */
702 break; 634 tmp.sigl = 0;
703 } 635
704 636 if (increment) {
705 /* Truncate part of the mantissa */ 637 if (sigh >= 0xffffff00) {
706 tmp.sigl = 0; 638 /* The sigh part overflows */
707 639 tmp.sigh = 0x80000000;
708 if (increment) 640 exp++;
709 { 641 if (exp >= EXP_OVER)
710 if ( sigh >= 0xffffff00 ) 642 goto overflow;
711 { 643 } else {
712 /* The sigh part overflows */ 644 tmp.sigh &= 0xffffff00;
713 tmp.sigh = 0x80000000; 645 tmp.sigh += 0x100;
714 exp++; 646 }
715 if ( exp >= EXP_OVER ) 647 } else {
716 goto overflow; 648 tmp.sigh &= 0xffffff00; /* Finish the truncation */
717 } 649 }
718 else 650 } else
719 { 651 precision_loss = 0;
720 tmp.sigh &= 0xffffff00; 652
721 tmp.sigh += 0x100; 653 templ = (tmp.sigh >> 8) & 0x007fffff;
722 } 654
723 } 655 if (exp > SINGLE_Emax) {
724 else 656 overflow:
725 { 657 EXCEPTION(EX_Overflow);
726 tmp.sigh &= 0xffffff00; /* Finish the truncation */ 658 if (!(control_word & CW_Overflow))
727 } 659 return 0;
728 } 660 set_precision_flag_up();
729 else 661 if (!(control_word & CW_Precision))
730 precision_loss = 0; 662 return 0;
731 663
732 templ = (tmp.sigh >> 8) & 0x007fffff; 664 /* This is a special case: see sec 16.2.5.1 of the 80486 book. */
733 665 /* Masked response is overflow to infinity. */
734 if ( exp > SINGLE_Emax ) 666 templ = 0x7f800000;
735 { 667 } else {
736 overflow: 668 if (precision_loss) {
737 EXCEPTION(EX_Overflow); 669 if (increment)
738 if ( !(control_word & CW_Overflow) ) 670 set_precision_flag_up();
739 return 0; 671 else
740 set_precision_flag_up(); 672 set_precision_flag_down();
741 if ( !(control_word & CW_Precision) ) 673 }
742 return 0; 674 /* Add the exponent */
743 675 templ |= ((exp + SINGLE_Ebias) & 0xff) << 23;
744 /* This is a special case: see sec 16.2.5.1 of the 80486 book. */ 676 }
745 /* Masked response is overflow to infinity. */
746 templ = 0x7f800000;
747 }
748 else
749 {
750 if ( precision_loss )
751 {
752 if ( increment )
753 set_precision_flag_up();
754 else
755 set_precision_flag_down();
756 } 677 }
757 /* Add the exponent */ 678 } else if (st0_tag == TAG_Zero) {
758 templ |= ((exp+SINGLE_Ebias) & 0xff) << 23; 679 templ = 0;
759 } 680 } else if (st0_tag == TAG_Special) {
760 } 681 st0_tag = FPU_Special(st0_ptr);
761 } 682 if (st0_tag == TW_Denormal) {
762 else if (st0_tag == TAG_Zero) 683 reg_copy(st0_ptr, &tmp);
763 { 684
764 templ = 0; 685 /* A denormal will always underflow. */
765 }
766 else if ( st0_tag == TAG_Special )
767 {
768 st0_tag = FPU_Special(st0_ptr);
769 if (st0_tag == TW_Denormal)
770 {
771 reg_copy(st0_ptr, &tmp);
772
773 /* A denormal will always underflow. */
774#ifndef PECULIAR_486 686#ifndef PECULIAR_486
775 /* An 80486 is supposed to be able to generate 687 /* An 80486 is supposed to be able to generate
776 a denormal exception here, but... */ 688 a denormal exception here, but... */
777 /* Underflow has priority. */ 689 /* Underflow has priority. */
778 if ( control_word & CW_Underflow ) 690 if (control_word & CW_Underflow)
779 denormal_operand(); 691 denormal_operand();
780#endif /* PECULIAR_486 */ 692#endif /* PECULIAR_486 */
781 goto denormal_arg; 693 goto denormal_arg;
782 } 694 } else if (st0_tag == TW_Infinity) {
783 else if (st0_tag == TW_Infinity) 695 templ = 0x7f800000;
784 { 696 } else if (st0_tag == TW_NaN) {
785 templ = 0x7f800000; 697 /* Is it really a NaN ? */
786 } 698 if ((exponent(st0_ptr) == EXP_OVER)
787 else if (st0_tag == TW_NaN) 699 && (st0_ptr->sigh & 0x80000000)) {
788 { 700 /* See if we can get a valid NaN from the FPU_REG */
789 /* Is it really a NaN ? */ 701 templ = st0_ptr->sigh >> 8;
790 if ( (exponent(st0_ptr) == EXP_OVER) && (st0_ptr->sigh & 0x80000000) ) 702 if (!(st0_ptr->sigh & 0x40000000)) {
791 { 703 /* It is a signalling NaN */
792 /* See if we can get a valid NaN from the FPU_REG */ 704 EXCEPTION(EX_Invalid);
793 templ = st0_ptr->sigh >> 8; 705 if (!(control_word & CW_Invalid))
794 if ( !(st0_ptr->sigh & 0x40000000) ) 706 return 0;
795 { 707 templ |= (0x40000000 >> 8);
796 /* It is a signalling NaN */ 708 }
797 EXCEPTION(EX_Invalid); 709 templ |= 0x7f800000;
798 if ( !(control_word & CW_Invalid) ) 710 } else {
799 return 0; 711 /* It is an unsupported data type */
800 templ |= (0x40000000 >> 8); 712 EXCEPTION(EX_Invalid);
713 if (!(control_word & CW_Invalid))
714 return 0;
715 templ = 0xffc00000;
716 }
801 } 717 }
802 templ |= 0x7f800000;
803 }
804 else
805 {
806 /* It is an unsupported data type */
807 EXCEPTION(EX_Invalid);
808 if ( !(control_word & CW_Invalid) )
809 return 0;
810 templ = 0xffc00000;
811 }
812 }
813#ifdef PARANOID 718#ifdef PARANOID
814 else 719 else {
815 { 720 EXCEPTION(EX_INTERNAL | 0x164);
816 EXCEPTION(EX_INTERNAL|0x164); 721 return 0;
817 return 0; 722 }
818 }
819#endif 723#endif
820 } 724 } else if (st0_tag == TAG_Empty) {
821 else if ( st0_tag == TAG_Empty ) 725 /* Empty register (stack underflow) */
822 { 726 EXCEPTION(EX_StackUnder);
823 /* Empty register (stack underflow) */ 727 if (control_word & EX_Invalid) {
824 EXCEPTION(EX_StackUnder); 728 /* The masked response */
825 if ( control_word & EX_Invalid ) 729 /* Put out the QNaN indefinite */
826 { 730 RE_ENTRANT_CHECK_OFF;
827 /* The masked response */ 731 FPU_access_ok(VERIFY_WRITE, single, 4);
828 /* Put out the QNaN indefinite */ 732 FPU_put_user(0xffc00000,
829 RE_ENTRANT_CHECK_OFF; 733 (unsigned long __user *)single);
830 FPU_access_ok(VERIFY_WRITE,single,4); 734 RE_ENTRANT_CHECK_ON;
831 FPU_put_user(0xffc00000, (unsigned long __user *) single); 735 return 1;
832 RE_ENTRANT_CHECK_ON; 736 } else
833 return 1; 737 return 0;
834 } 738 }
835 else
836 return 0;
837 }
838#ifdef PARANOID 739#ifdef PARANOID
839 else 740 else {
840 { 741 EXCEPTION(EX_INTERNAL | 0x163);
841 EXCEPTION(EX_INTERNAL|0x163); 742 return 0;
842 return 0; 743 }
843 }
844#endif 744#endif
845 if ( getsign(st0_ptr) ) 745 if (getsign(st0_ptr))
846 templ |= 0x80000000; 746 templ |= 0x80000000;
847 747
848 RE_ENTRANT_CHECK_OFF; 748 RE_ENTRANT_CHECK_OFF;
849 FPU_access_ok(VERIFY_WRITE,single,4); 749 FPU_access_ok(VERIFY_WRITE, single, 4);
850 FPU_put_user(templ,(unsigned long __user *) single); 750 FPU_put_user(templ, (unsigned long __user *)single);
851 RE_ENTRANT_CHECK_ON; 751 RE_ENTRANT_CHECK_ON;
852 752
853 return 1; 753 return 1;
854} 754}
855 755
856
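Both store paths above decide whether to bump the truncated mantissa from the bits they are about to discard, under the four rounding-control modes. Below is a compact restatement of that decision using local enum values rather than the emulator's RC_* constants; the helper is illustrative, not kernel code.

#include <stdio.h>

enum rc_mode { TO_NEAREST, TO_NEG_INF, TO_POS_INF, TO_ZERO };

/* kept_lsb: lowest surviving mantissa bit; half: first discarded bit;
 * sticky: non-zero if any lower discarded bit is set.  Returns 1 when
 * the truncated mantissa must be incremented. */
static int round_up(enum rc_mode rc, int negative,
		    int kept_lsb, int half, int sticky)
{
	switch (rc) {
	case TO_NEAREST:
		return half && (sticky || kept_lsb);	/* ties go to even */
	case TO_NEG_INF:
		return negative && (half || sticky);
	case TO_POS_INF:
		return !negative && (half || sticky);
	default:					/* TO_ZERO: chop */
		return 0;
	}
}

int main(void)
{
	/* exactly halfway, kept LSB set -> round up to even */
	printf("%d\n", round_up(TO_NEAREST, 0, 1, 1, 0));	/* 1 */
	/* below halfway -> truncate */
	printf("%d\n", round_up(TO_NEAREST, 0, 1, 0, 1));	/* 0 */
	return 0;
}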
857/* Put a long long into user memory */ 756/* Put a long long into user memory */
858int FPU_store_int64(FPU_REG *st0_ptr, u_char st0_tag, long long __user *d) 757int FPU_store_int64(FPU_REG *st0_ptr, u_char st0_tag, long long __user *d)
859{ 758{
860 FPU_REG t; 759 FPU_REG t;
861 long long tll; 760 long long tll;
862 int precision_loss; 761 int precision_loss;
863 762
864 if ( st0_tag == TAG_Empty ) 763 if (st0_tag == TAG_Empty) {
865 { 764 /* Empty register (stack underflow) */
866 /* Empty register (stack underflow) */ 765 EXCEPTION(EX_StackUnder);
867 EXCEPTION(EX_StackUnder); 766 goto invalid_operand;
868 goto invalid_operand; 767 } else if (st0_tag == TAG_Special) {
869 } 768 st0_tag = FPU_Special(st0_ptr);
870 else if ( st0_tag == TAG_Special ) 769 if ((st0_tag == TW_Infinity) || (st0_tag == TW_NaN)) {
871 { 770 EXCEPTION(EX_Invalid);
872 st0_tag = FPU_Special(st0_ptr); 771 goto invalid_operand;
873 if ( (st0_tag == TW_Infinity) || 772 }
874 (st0_tag == TW_NaN) )
875 {
876 EXCEPTION(EX_Invalid);
877 goto invalid_operand;
878 } 773 }
879 } 774
880 775 reg_copy(st0_ptr, &t);
881 reg_copy(st0_ptr, &t); 776 precision_loss = FPU_round_to_int(&t, st0_tag);
882 precision_loss = FPU_round_to_int(&t, st0_tag); 777 ((long *)&tll)[0] = t.sigl;
883 ((long *)&tll)[0] = t.sigl; 778 ((long *)&tll)[1] = t.sigh;
884 ((long *)&tll)[1] = t.sigh; 779 if ((precision_loss == 1) ||
885 if ( (precision_loss == 1) || 780 ((t.sigh & 0x80000000) &&
886 ((t.sigh & 0x80000000) && 781 !((t.sigh == 0x80000000) && (t.sigl == 0) && signnegative(&t)))) {
887 !((t.sigh == 0x80000000) && (t.sigl == 0) && 782 EXCEPTION(EX_Invalid);
888 signnegative(&t))) ) 783 /* This is a special case: see sec 16.2.5.1 of the 80486 book */
889 { 784 invalid_operand:
890 EXCEPTION(EX_Invalid); 785 if (control_word & EX_Invalid) {
891 /* This is a special case: see sec 16.2.5.1 of the 80486 book */ 786 /* Produce something like QNaN "indefinite" */
892 invalid_operand: 787 tll = 0x8000000000000000LL;
893 if ( control_word & EX_Invalid ) 788 } else
894 { 789 return 0;
895 /* Produce something like QNaN "indefinite" */ 790 } else {
896 tll = 0x8000000000000000LL; 791 if (precision_loss)
792 set_precision_flag(precision_loss);
793 if (signnegative(&t))
794 tll = -tll;
897 } 795 }
898 else
899 return 0;
900 }
901 else
902 {
903 if ( precision_loss )
904 set_precision_flag(precision_loss);
905 if ( signnegative(&t) )
906 tll = - tll;
907 }
908
909 RE_ENTRANT_CHECK_OFF;
910 FPU_access_ok(VERIFY_WRITE,d,8);
911 if (copy_to_user(d, &tll, 8))
912 FPU_abort;
913 RE_ENTRANT_CHECK_ON;
914
915 return 1;
916}
917 796
797 RE_ENTRANT_CHECK_OFF;
798 FPU_access_ok(VERIFY_WRITE, d, 8);
799 if (copy_to_user(d, &tll, 8))
800 FPU_abort;
801 RE_ENTRANT_CHECK_ON;
802
803 return 1;
804}
918 805
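The check FPU_store_int64() applies after rounding accepts any magnitude below 2^63 and, as the one exception, the exact value -2^63. A stand-alone restatement of that test (the helper name and plain C types are illustrative):

#include <stdint.h>
#include <stdio.h>

static int fits_int64(uint32_t sigh, uint32_t sigl, int negative)
{
	if (!(sigh & 0x80000000))
		return 1;		/* magnitude < 2^63 always fits */
	/* bit 63 set: only exactly -2^63 is representable */
	return (sigh == 0x80000000) && (sigl == 0) && negative;
}

int main(void)
{
	printf("%d\n", fits_int64(0x80000000, 0, 1));	/* 1: -2^63 is OK */
	printf("%d\n", fits_int64(0x80000000, 0, 0));	/* 0: +2^63 overflows */
	return 0;
}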
919/* Put a long into user memory */ 806/* Put a long into user memory */
920int FPU_store_int32(FPU_REG *st0_ptr, u_char st0_tag, long __user *d) 807int FPU_store_int32(FPU_REG *st0_ptr, u_char st0_tag, long __user *d)
921{ 808{
922 FPU_REG t; 809 FPU_REG t;
923 int precision_loss; 810 int precision_loss;
924 811
925 if ( st0_tag == TAG_Empty ) 812 if (st0_tag == TAG_Empty) {
926 { 813 /* Empty register (stack underflow) */
927 /* Empty register (stack underflow) */ 814 EXCEPTION(EX_StackUnder);
928 EXCEPTION(EX_StackUnder); 815 goto invalid_operand;
929 goto invalid_operand; 816 } else if (st0_tag == TAG_Special) {
930 } 817 st0_tag = FPU_Special(st0_ptr);
931 else if ( st0_tag == TAG_Special ) 818 if ((st0_tag == TW_Infinity) || (st0_tag == TW_NaN)) {
932 { 819 EXCEPTION(EX_Invalid);
933 st0_tag = FPU_Special(st0_ptr); 820 goto invalid_operand;
934 if ( (st0_tag == TW_Infinity) || 821 }
935 (st0_tag == TW_NaN) )
936 {
937 EXCEPTION(EX_Invalid);
938 goto invalid_operand;
939 } 822 }
940 } 823
941 824 reg_copy(st0_ptr, &t);
942 reg_copy(st0_ptr, &t); 825 precision_loss = FPU_round_to_int(&t, st0_tag);
943 precision_loss = FPU_round_to_int(&t, st0_tag); 826 if (t.sigh ||
944 if (t.sigh || 827 ((t.sigl & 0x80000000) &&
945 ((t.sigl & 0x80000000) && 828 !((t.sigl == 0x80000000) && signnegative(&t)))) {
946 !((t.sigl == 0x80000000) && signnegative(&t))) ) 829 EXCEPTION(EX_Invalid);
947 { 830 /* This is a special case: see sec 16.2.5.1 of the 80486 book */
948 EXCEPTION(EX_Invalid); 831 invalid_operand:
949 /* This is a special case: see sec 16.2.5.1 of the 80486 book */ 832 if (control_word & EX_Invalid) {
950 invalid_operand: 833 /* Produce something like QNaN "indefinite" */
951 if ( control_word & EX_Invalid ) 834 t.sigl = 0x80000000;
952 { 835 } else
953 /* Produce something like QNaN "indefinite" */ 836 return 0;
954 t.sigl = 0x80000000; 837 } else {
838 if (precision_loss)
839 set_precision_flag(precision_loss);
840 if (signnegative(&t))
841 t.sigl = -(long)t.sigl;
955 } 842 }
956 else
957 return 0;
958 }
959 else
960 {
961 if ( precision_loss )
962 set_precision_flag(precision_loss);
963 if ( signnegative(&t) )
964 t.sigl = -(long)t.sigl;
965 }
966
967 RE_ENTRANT_CHECK_OFF;
968 FPU_access_ok(VERIFY_WRITE,d,4);
969 FPU_put_user(t.sigl, (unsigned long __user *) d);
970 RE_ENTRANT_CHECK_ON;
971
972 return 1;
973}
974 843
844 RE_ENTRANT_CHECK_OFF;
845 FPU_access_ok(VERIFY_WRITE, d, 4);
846 FPU_put_user(t.sigl, (unsigned long __user *)d);
847 RE_ENTRANT_CHECK_ON;
848
849 return 1;
850}
975 851
976/* Put a short into user memory */ 852/* Put a short into user memory */
977int FPU_store_int16(FPU_REG *st0_ptr, u_char st0_tag, short __user *d) 853int FPU_store_int16(FPU_REG *st0_ptr, u_char st0_tag, short __user *d)
978{ 854{
979 FPU_REG t; 855 FPU_REG t;
980 int precision_loss; 856 int precision_loss;
981 857
982 if ( st0_tag == TAG_Empty ) 858 if (st0_tag == TAG_Empty) {
983 { 859 /* Empty register (stack underflow) */
984 /* Empty register (stack underflow) */ 860 EXCEPTION(EX_StackUnder);
985 EXCEPTION(EX_StackUnder); 861 goto invalid_operand;
986 goto invalid_operand; 862 } else if (st0_tag == TAG_Special) {
987 } 863 st0_tag = FPU_Special(st0_ptr);
988 else if ( st0_tag == TAG_Special ) 864 if ((st0_tag == TW_Infinity) || (st0_tag == TW_NaN)) {
989 { 865 EXCEPTION(EX_Invalid);
990 st0_tag = FPU_Special(st0_ptr); 866 goto invalid_operand;
991 if ( (st0_tag == TW_Infinity) || 867 }
992 (st0_tag == TW_NaN) )
993 {
994 EXCEPTION(EX_Invalid);
995 goto invalid_operand;
996 } 868 }
997 } 869
998 870 reg_copy(st0_ptr, &t);
999 reg_copy(st0_ptr, &t); 871 precision_loss = FPU_round_to_int(&t, st0_tag);
1000 precision_loss = FPU_round_to_int(&t, st0_tag); 872 if (t.sigh ||
1001 if (t.sigh || 873 ((t.sigl & 0xffff8000) &&
1002 ((t.sigl & 0xffff8000) && 874 !((t.sigl == 0x8000) && signnegative(&t)))) {
1003 !((t.sigl == 0x8000) && signnegative(&t))) ) 875 EXCEPTION(EX_Invalid);
1004 { 876 /* This is a special case: see sec 16.2.5.1 of the 80486 book */
1005 EXCEPTION(EX_Invalid); 877 invalid_operand:
1006 /* This is a special case: see sec 16.2.5.1 of the 80486 book */ 878 if (control_word & EX_Invalid) {
1007 invalid_operand: 879 /* Produce something like QNaN "indefinite" */
1008 if ( control_word & EX_Invalid ) 880 t.sigl = 0x8000;
1009 { 881 } else
1010 /* Produce something like QNaN "indefinite" */ 882 return 0;
1011 t.sigl = 0x8000; 883 } else {
884 if (precision_loss)
885 set_precision_flag(precision_loss);
886 if (signnegative(&t))
887 t.sigl = -t.sigl;
1012 } 888 }
1013 else
1014 return 0;
1015 }
1016 else
1017 {
1018 if ( precision_loss )
1019 set_precision_flag(precision_loss);
1020 if ( signnegative(&t) )
1021 t.sigl = -t.sigl;
1022 }
1023
1024 RE_ENTRANT_CHECK_OFF;
1025 FPU_access_ok(VERIFY_WRITE,d,2);
1026 FPU_put_user((short)t.sigl, d);
1027 RE_ENTRANT_CHECK_ON;
1028
1029 return 1;
1030}
1031 889
890 RE_ENTRANT_CHECK_OFF;
891 FPU_access_ok(VERIFY_WRITE, d, 2);
892 FPU_put_user((short)t.sigl, d);
893 RE_ENTRANT_CHECK_ON;
894
895 return 1;
896}
1032 897
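When an invalid or empty source is masked, the store routines above and below emit a fixed "indefinite" pattern instead of failing. The values here are collected from the FPU_put_user() calls in this file; the table structure itself is only for illustration (the extended and BCD paths write analogous byte patterns of their own).

#include <stdint.h>
#include <stdio.h>

static const struct {
	const char *fmt;
	uint64_t    pattern;
} indefinite[] = {
	{ "float  (32-bit)", 0xffc00000ULL },
	{ "double (64-bit)", 0xfff8000000000000ULL },
	{ "int16",           0x8000ULL },
	{ "int32",           0x80000000ULL },
	{ "int64",           0x8000000000000000ULL },
};

int main(void)
{
	unsigned i;

	for (i = 0; i < sizeof(indefinite) / sizeof(indefinite[0]); i++)
		printf("%-16s %#llx\n", indefinite[i].fmt,
		       (unsigned long long)indefinite[i].pattern);
	return 0;
}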
1033/* Put a packed bcd array into user memory */ 898/* Put a packed bcd array into user memory */
1034int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d) 899int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d)
1035{ 900{
1036 FPU_REG t; 901 FPU_REG t;
1037 unsigned long long ll; 902 unsigned long long ll;
1038 u_char b; 903 u_char b;
1039 int i, precision_loss; 904 int i, precision_loss;
1040 u_char sign = (getsign(st0_ptr) == SIGN_NEG) ? 0x80 : 0; 905 u_char sign = (getsign(st0_ptr) == SIGN_NEG) ? 0x80 : 0;
1041 906
1042 if ( st0_tag == TAG_Empty ) 907 if (st0_tag == TAG_Empty) {
1043 { 908 /* Empty register (stack underflow) */
1044 /* Empty register (stack underflow) */ 909 EXCEPTION(EX_StackUnder);
1045 EXCEPTION(EX_StackUnder); 910 goto invalid_operand;
1046 goto invalid_operand; 911 } else if (st0_tag == TAG_Special) {
1047 } 912 st0_tag = FPU_Special(st0_ptr);
1048 else if ( st0_tag == TAG_Special ) 913 if ((st0_tag == TW_Infinity) || (st0_tag == TW_NaN)) {
1049 { 914 EXCEPTION(EX_Invalid);
1050 st0_tag = FPU_Special(st0_ptr); 915 goto invalid_operand;
1051 if ( (st0_tag == TW_Infinity) || 916 }
1052 (st0_tag == TW_NaN) ) 917 }
1053 { 918
1054 EXCEPTION(EX_Invalid); 919 reg_copy(st0_ptr, &t);
1055 goto invalid_operand; 920 precision_loss = FPU_round_to_int(&t, st0_tag);
921 ll = significand(&t);
922
923 /* Check for overflow, by comparing with 999999999999999999 decimal. */
924 if ((t.sigh > 0x0de0b6b3) ||
925 ((t.sigh == 0x0de0b6b3) && (t.sigl > 0xa763ffff))) {
926 EXCEPTION(EX_Invalid);
927 /* This is a special case: see sec 16.2.5.1 of the 80486 book */
928 invalid_operand:
929 if (control_word & CW_Invalid) {
930 /* Produce the QNaN "indefinite" */
931 RE_ENTRANT_CHECK_OFF;
932 FPU_access_ok(VERIFY_WRITE, d, 10);
933 for (i = 0; i < 7; i++)
934 FPU_put_user(0, d + i); /* These bytes "undefined" */
935 FPU_put_user(0xc0, d + 7); /* This byte "undefined" */
936 FPU_put_user(0xff, d + 8);
937 FPU_put_user(0xff, d + 9);
938 RE_ENTRANT_CHECK_ON;
939 return 1;
940 } else
941 return 0;
942 } else if (precision_loss) {
943 /* Precision loss doesn't stop the data transfer */
944 set_precision_flag(precision_loss);
1056 } 945 }
1057 } 946
1058 947 RE_ENTRANT_CHECK_OFF;
1059 reg_copy(st0_ptr, &t); 948 FPU_access_ok(VERIFY_WRITE, d, 10);
1060 precision_loss = FPU_round_to_int(&t, st0_tag); 949 RE_ENTRANT_CHECK_ON;
1061 ll = significand(&t); 950 for (i = 0; i < 9; i++) {
1062 951 b = FPU_div_small(&ll, 10);
1063 /* Check for overflow, by comparing with 999999999999999999 decimal. */ 952 b |= (FPU_div_small(&ll, 10)) << 4;
1064 if ( (t.sigh > 0x0de0b6b3) || 953 RE_ENTRANT_CHECK_OFF;
1065 ((t.sigh == 0x0de0b6b3) && (t.sigl > 0xa763ffff)) ) 954 FPU_put_user(b, d + i);
1066 { 955 RE_ENTRANT_CHECK_ON;
1067 EXCEPTION(EX_Invalid);
1068 /* This is a special case: see sec 16.2.5.1 of the 80486 book */
1069 invalid_operand:
1070 if ( control_word & CW_Invalid )
1071 {
1072 /* Produce the QNaN "indefinite" */
1073 RE_ENTRANT_CHECK_OFF;
1074 FPU_access_ok(VERIFY_WRITE,d,10);
1075 for ( i = 0; i < 7; i++)
1076 FPU_put_user(0, d+i); /* These bytes "undefined" */
1077 FPU_put_user(0xc0, d+7); /* This byte "undefined" */
1078 FPU_put_user(0xff, d+8);
1079 FPU_put_user(0xff, d+9);
1080 RE_ENTRANT_CHECK_ON;
1081 return 1;
1082 } 956 }
1083 else 957 RE_ENTRANT_CHECK_OFF;
1084 return 0; 958 FPU_put_user(sign, d + 9);
1085 } 959 RE_ENTRANT_CHECK_ON;
1086 else if ( precision_loss ) 960
1087 { 961 return 1;
1088 /* Precision loss doesn't stop the data transfer */
1089 set_precision_flag(precision_loss);
1090 }
1091
1092 RE_ENTRANT_CHECK_OFF;
1093 FPU_access_ok(VERIFY_WRITE,d,10);
1094 RE_ENTRANT_CHECK_ON;
1095 for ( i = 0; i < 9; i++)
1096 {
1097 b = FPU_div_small(&ll, 10);
1098 b |= (FPU_div_small(&ll, 10)) << 4;
1099 RE_ENTRANT_CHECK_OFF;
1100 FPU_put_user(b, d+i);
1101 RE_ENTRANT_CHECK_ON;
1102 }
1103 RE_ENTRANT_CHECK_OFF;
1104 FPU_put_user(sign, d+9);
1105 RE_ENTRANT_CHECK_ON;
1106
1107 return 1;
1108} 962}
1109 963
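FPU_store_bcd() above emits eighteen decimal digits packed two per byte, least-significant byte first, with the sign in the tenth byte. A plain-C sketch of the same packing; the real code divides the significand with FPU_div_small(), while ordinary 64-bit division is used here.

#include <stdint.h>
#include <stdio.h>

static void pack_bcd(uint64_t v, int negative, unsigned char out[10])
{
	int i;

	for (i = 0; i < 9; i++) {
		unsigned char b = v % 10;	/* low digit in low nibble */
		v /= 10;
		b |= (v % 10) << 4;		/* next digit in high nibble */
		v /= 10;
		out[i] = b;
	}
	out[9] = negative ? 0x80 : 0;		/* sign byte */
}

int main(void)
{
	unsigned char out[10];
	int i;

	pack_bcd(123456789012345678ULL, 0, out);
	for (i = 9; i >= 0; i--)
		printf("%02x", out[i]);		/* 00123456789012345678 */
	printf("\n");
	return 0;
}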
1110/*===========================================================================*/ 964/*===========================================================================*/
@@ -1119,59 +973,56 @@ int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d)
1119 largest possible value */ 973 largest possible value */
1120int FPU_round_to_int(FPU_REG *r, u_char tag) 974int FPU_round_to_int(FPU_REG *r, u_char tag)
1121{ 975{
1122 u_char very_big; 976 u_char very_big;
1123 unsigned eax; 977 unsigned eax;
1124 978
1125 if (tag == TAG_Zero) 979 if (tag == TAG_Zero) {
1126 { 980 /* Make sure that zero is returned */
1127 /* Make sure that zero is returned */ 981 significand(r) = 0;
1128 significand(r) = 0; 982 return 0; /* o.k. */
1129 return 0; /* o.k. */ 983 }
1130 } 984
1131 985 if (exponent(r) > 63) {
1132 if (exponent(r) > 63) 986 r->sigl = r->sigh = ~0; /* The largest representable number */
1133 { 987 return 1; /* overflow */
1134 r->sigl = r->sigh = ~0; /* The largest representable number */ 988 }
1135 return 1; /* overflow */ 989
1136 } 990 eax = FPU_shrxs(&r->sigl, 63 - exponent(r));
1137 991 very_big = !(~(r->sigh) | ~(r->sigl)); /* test for 0xfff...fff */
1138 eax = FPU_shrxs(&r->sigl, 63 - exponent(r));
1139 very_big = !(~(r->sigh) | ~(r->sigl)); /* test for 0xfff...fff */
1140#define half_or_more (eax & 0x80000000) 992#define half_or_more (eax & 0x80000000)
1141#define frac_part (eax) 993#define frac_part (eax)
1142#define more_than_half ((eax & 0x80000001) == 0x80000001) 994#define more_than_half ((eax & 0x80000001) == 0x80000001)
1143 switch (control_word & CW_RC) 995 switch (control_word & CW_RC) {
1144 { 996 case RC_RND:
1145 case RC_RND: 997 if (more_than_half /* nearest */
1146 if ( more_than_half /* nearest */ 998 || (half_or_more && (r->sigl & 1))) { /* odd -> even */
1147 || (half_or_more && (r->sigl & 1)) ) /* odd -> even */ 999 if (very_big)
1148 { 1000 return 1; /* overflow */
1149 if ( very_big ) return 1; /* overflow */ 1001 significand(r)++;
1150 significand(r) ++; 1002 return PRECISION_LOST_UP;
1151 return PRECISION_LOST_UP; 1003 }
1152 } 1004 break;
1153 break; 1005 case RC_DOWN:
1154 case RC_DOWN: 1006 if (frac_part && getsign(r)) {
1155 if (frac_part && getsign(r)) 1007 if (very_big)
1156 { 1008 return 1; /* overflow */
1157 if ( very_big ) return 1; /* overflow */ 1009 significand(r)++;
1158 significand(r) ++; 1010 return PRECISION_LOST_UP;
1159 return PRECISION_LOST_UP; 1011 }
1160 } 1012 break;
1161 break; 1013 case RC_UP:
1162 case RC_UP: 1014 if (frac_part && !getsign(r)) {
1163 if (frac_part && !getsign(r)) 1015 if (very_big)
1164 { 1016 return 1; /* overflow */
1165 if ( very_big ) return 1; /* overflow */ 1017 significand(r)++;
1166 significand(r) ++; 1018 return PRECISION_LOST_UP;
1167 return PRECISION_LOST_UP; 1019 }
1020 break;
1021 case RC_CHOP:
1022 break;
1168 } 1023 }
1169 break;
1170 case RC_CHOP:
1171 break;
1172 }
1173 1024
1174 return eax ? PRECISION_LOST_DOWN : 0; 1025 return eax ? PRECISION_LOST_DOWN : 0;
1175 1026
1176} 1027}
1177 1028

@@ -1179,197 +1030,195 @@ int FPU_round_to_int(FPU_REG *r, u_char tag)
1179 1030
1180u_char __user *fldenv(fpu_addr_modes addr_modes, u_char __user *s) 1031u_char __user *fldenv(fpu_addr_modes addr_modes, u_char __user *s)
1181{ 1032{
1182 unsigned short tag_word = 0; 1033 unsigned short tag_word = 0;
1183 u_char tag; 1034 u_char tag;
1184 int i; 1035 int i;
1185 1036
1186 if ( (addr_modes.default_mode == VM86) || 1037 if ((addr_modes.default_mode == VM86) ||
1187 ((addr_modes.default_mode == PM16) 1038 ((addr_modes.default_mode == PM16)
1188 ^ (addr_modes.override.operand_size == OP_SIZE_PREFIX)) ) 1039 ^ (addr_modes.override.operand_size == OP_SIZE_PREFIX))) {
1189 { 1040 RE_ENTRANT_CHECK_OFF;
1190 RE_ENTRANT_CHECK_OFF; 1041 FPU_access_ok(VERIFY_READ, s, 0x0e);
1191 FPU_access_ok(VERIFY_READ, s, 0x0e); 1042 FPU_get_user(control_word, (unsigned short __user *)s);
1192 FPU_get_user(control_word, (unsigned short __user *) s); 1043 FPU_get_user(partial_status, (unsigned short __user *)(s + 2));
1193 FPU_get_user(partial_status, (unsigned short __user *) (s+2)); 1044 FPU_get_user(tag_word, (unsigned short __user *)(s + 4));
1194 FPU_get_user(tag_word, (unsigned short __user *) (s+4)); 1045 FPU_get_user(instruction_address.offset,
1195 FPU_get_user(instruction_address.offset, (unsigned short __user *) (s+6)); 1046 (unsigned short __user *)(s + 6));
1196 FPU_get_user(instruction_address.selector, (unsigned short __user *) (s+8)); 1047 FPU_get_user(instruction_address.selector,
1197 FPU_get_user(operand_address.offset, (unsigned short __user *) (s+0x0a)); 1048 (unsigned short __user *)(s + 8));
1198 FPU_get_user(operand_address.selector, (unsigned short __user *) (s+0x0c)); 1049 FPU_get_user(operand_address.offset,
1199 RE_ENTRANT_CHECK_ON; 1050 (unsigned short __user *)(s + 0x0a));
1200 s += 0x0e; 1051 FPU_get_user(operand_address.selector,
1201 if ( addr_modes.default_mode == VM86 ) 1052 (unsigned short __user *)(s + 0x0c));
1202 { 1053 RE_ENTRANT_CHECK_ON;
1203 instruction_address.offset 1054 s += 0x0e;
1204 += (instruction_address.selector & 0xf000) << 4; 1055 if (addr_modes.default_mode == VM86) {
1205 operand_address.offset += (operand_address.selector & 0xf000) << 4; 1056 instruction_address.offset
1057 += (instruction_address.selector & 0xf000) << 4;
1058 operand_address.offset +=
1059 (operand_address.selector & 0xf000) << 4;
1060 }
1061 } else {
1062 RE_ENTRANT_CHECK_OFF;
1063 FPU_access_ok(VERIFY_READ, s, 0x1c);
1064 FPU_get_user(control_word, (unsigned short __user *)s);
1065 FPU_get_user(partial_status, (unsigned short __user *)(s + 4));
1066 FPU_get_user(tag_word, (unsigned short __user *)(s + 8));
1067 FPU_get_user(instruction_address.offset,
1068 (unsigned long __user *)(s + 0x0c));
1069 FPU_get_user(instruction_address.selector,
1070 (unsigned short __user *)(s + 0x10));
1071 FPU_get_user(instruction_address.opcode,
1072 (unsigned short __user *)(s + 0x12));
1073 FPU_get_user(operand_address.offset,
1074 (unsigned long __user *)(s + 0x14));
1075 FPU_get_user(operand_address.selector,
1076 (unsigned long __user *)(s + 0x18));
1077 RE_ENTRANT_CHECK_ON;
1078 s += 0x1c;
1206 } 1079 }
1207 }
1208 else
1209 {
1210 RE_ENTRANT_CHECK_OFF;
1211 FPU_access_ok(VERIFY_READ, s, 0x1c);
1212 FPU_get_user(control_word, (unsigned short __user *) s);
1213 FPU_get_user(partial_status, (unsigned short __user *) (s+4));
1214 FPU_get_user(tag_word, (unsigned short __user *) (s+8));
1215 FPU_get_user(instruction_address.offset, (unsigned long __user *) (s+0x0c));
1216 FPU_get_user(instruction_address.selector, (unsigned short __user *) (s+0x10));
1217 FPU_get_user(instruction_address.opcode, (unsigned short __user *) (s+0x12));
1218 FPU_get_user(operand_address.offset, (unsigned long __user *) (s+0x14));
1219 FPU_get_user(operand_address.selector, (unsigned long __user *) (s+0x18));
1220 RE_ENTRANT_CHECK_ON;
1221 s += 0x1c;
1222 }
1223 1080
1224#ifdef PECULIAR_486 1081#ifdef PECULIAR_486
1225 control_word &= ~0xe080; 1082 control_word &= ~0xe080;
1226#endif /* PECULIAR_486 */ 1083#endif /* PECULIAR_486 */
1227 1084
1228 top = (partial_status >> SW_Top_Shift) & 7; 1085 top = (partial_status >> SW_Top_Shift) & 7;
1229 1086
1230 if ( partial_status & ~control_word & CW_Exceptions ) 1087 if (partial_status & ~control_word & CW_Exceptions)
1231 partial_status |= (SW_Summary | SW_Backward); 1088 partial_status |= (SW_Summary | SW_Backward);
1232 else 1089 else
1233 partial_status &= ~(SW_Summary | SW_Backward); 1090 partial_status &= ~(SW_Summary | SW_Backward);
1234 1091
1235 for ( i = 0; i < 8; i++ ) 1092 for (i = 0; i < 8; i++) {
1236 { 1093 tag = tag_word & 3;
1237 tag = tag_word & 3; 1094 tag_word >>= 2;
1238 tag_word >>= 2; 1095
1239 1096 if (tag == TAG_Empty)
1240 if ( tag == TAG_Empty ) 1097 /* New tag is empty. Accept it */
1241 /* New tag is empty. Accept it */ 1098 FPU_settag(i, TAG_Empty);
1242 FPU_settag(i, TAG_Empty); 1099 else if (FPU_gettag(i) == TAG_Empty) {
1243 else if ( FPU_gettag(i) == TAG_Empty ) 1100 /* Old tag is empty and new tag is not empty. New tag is determined
1244 { 1101 by old reg contents */
1245 /* Old tag is empty and new tag is not empty. New tag is determined 1102 if (exponent(&fpu_register(i)) == -EXTENDED_Ebias) {
1246 by old reg contents */ 1103 if (!
1247 if ( exponent(&fpu_register(i)) == - EXTENDED_Ebias ) 1104 (fpu_register(i).sigl | fpu_register(i).
 1248 { 1104 (fpu_register(i).sigl |
 1249 if ( !(fpu_register(i).sigl | fpu_register(i).sigh) ) 1105 fpu_register(i).sigh))
1250 FPU_settag(i, TAG_Zero); 1107 else
1251 else 1108 FPU_settag(i, TAG_Special);
1252 FPU_settag(i, TAG_Special); 1109 } else if (exponent(&fpu_register(i)) ==
1253 } 1110 0x7fff - EXTENDED_Ebias) {
1254 else if ( exponent(&fpu_register(i)) == 0x7fff - EXTENDED_Ebias ) 1111 FPU_settag(i, TAG_Special);
1255 { 1112 } else if (fpu_register(i).sigh & 0x80000000)
1256 FPU_settag(i, TAG_Special); 1113 FPU_settag(i, TAG_Valid);
1257 } 1114 else
1258 else if ( fpu_register(i).sigh & 0x80000000 ) 1115 FPU_settag(i, TAG_Special); /* An Un-normal */
1259 FPU_settag(i, TAG_Valid); 1116 }
1260 else 1117 /* Else old tag is not empty and new tag is not empty. Old tag
1261 FPU_settag(i, TAG_Special); /* An Un-normal */ 1118 remains correct */
1262 } 1119 }
1263 /* Else old tag is not empty and new tag is not empty. Old tag
1264 remains correct */
1265 }
1266
1267 return s;
1268}
1269 1120
1121 return s;
1122}
1270 1123
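For reference, the 28-byte protected-mode, 32-bit environment image that fldenv() above reads field by field can be pictured as the struct below. The offsets follow the FPU_get_user() calls in the code; the struct and field names are illustrative, and only the low 16 bits of the first three words are meaningful.

#include <stdint.h>
#include <stdio.h>

struct fpu_env32 {
	uint32_t cw;	/* 0x00: control word         */
	uint32_t sw;	/* 0x04: status word          */
	uint32_t tw;	/* 0x08: tag word             */
	uint32_t fip;	/* 0x0c: instruction offset   */
	uint16_t fcs;	/* 0x10: instruction selector */
	uint16_t fop;	/* 0x12: last opcode          */
	uint32_t foo;	/* 0x14: operand offset       */
	uint32_t fos;	/* 0x18: operand selector     */
};			/* 0x1c bytes in total        */

int main(void)
{
	printf("%u\n", (unsigned)sizeof(struct fpu_env32));	/* prints 28 */
	return 0;
}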
1271void frstor(fpu_addr_modes addr_modes, u_char __user *data_address) 1124void frstor(fpu_addr_modes addr_modes, u_char __user *data_address)
1272{ 1125{
1273 int i, regnr; 1126 int i, regnr;
1274 u_char __user *s = fldenv(addr_modes, data_address); 1127 u_char __user *s = fldenv(addr_modes, data_address);
1275 int offset = (top & 7) * 10, other = 80 - offset; 1128 int offset = (top & 7) * 10, other = 80 - offset;
1276 1129
1277 /* Copy all registers in stack order. */ 1130 /* Copy all registers in stack order. */
1278 RE_ENTRANT_CHECK_OFF; 1131 RE_ENTRANT_CHECK_OFF;
1279 FPU_access_ok(VERIFY_READ,s,80); 1132 FPU_access_ok(VERIFY_READ, s, 80);
1280 __copy_from_user(register_base+offset, s, other); 1133 __copy_from_user(register_base + offset, s, other);
1281 if ( offset ) 1134 if (offset)
1282 __copy_from_user(register_base, s+other, offset); 1135 __copy_from_user(register_base, s + other, offset);
1283 RE_ENTRANT_CHECK_ON; 1136 RE_ENTRANT_CHECK_ON;
1284 1137
1285 for ( i = 0; i < 8; i++ ) 1138 for (i = 0; i < 8; i++) {
1286 { 1139 regnr = (i + top) & 7;
1287 regnr = (i+top) & 7; 1140 if (FPU_gettag(regnr) != TAG_Empty)
1288 if ( FPU_gettag(regnr) != TAG_Empty ) 1141 /* The loaded data over-rides all other cases. */
1289 /* The loaded data over-rides all other cases. */ 1142 FPU_settag(regnr, FPU_tagof(&st(i)));
1290 FPU_settag(regnr, FPU_tagof(&st(i))); 1143 }
1291 }
1292 1144
1293} 1145}
1294 1146
1295
1296u_char __user *fstenv(fpu_addr_modes addr_modes, u_char __user *d) 1147u_char __user *fstenv(fpu_addr_modes addr_modes, u_char __user *d)
1297{ 1148{
1298 if ( (addr_modes.default_mode == VM86) || 1149 if ((addr_modes.default_mode == VM86) ||
1299 ((addr_modes.default_mode == PM16) 1150 ((addr_modes.default_mode == PM16)
1300 ^ (addr_modes.override.operand_size == OP_SIZE_PREFIX)) ) 1151 ^ (addr_modes.override.operand_size == OP_SIZE_PREFIX))) {
1301 { 1152 RE_ENTRANT_CHECK_OFF;
1302 RE_ENTRANT_CHECK_OFF; 1153 FPU_access_ok(VERIFY_WRITE, d, 14);
1303 FPU_access_ok(VERIFY_WRITE,d,14);
1304#ifdef PECULIAR_486 1154#ifdef PECULIAR_486
1305 FPU_put_user(control_word & ~0xe080, (unsigned long __user *) d); 1155 FPU_put_user(control_word & ~0xe080, (unsigned long __user *)d);
1306#else 1156#else
1307 FPU_put_user(control_word, (unsigned short __user *) d); 1157 FPU_put_user(control_word, (unsigned short __user *)d);
1308#endif /* PECULIAR_486 */ 1158#endif /* PECULIAR_486 */
1309 FPU_put_user(status_word(), (unsigned short __user *) (d+2)); 1159 FPU_put_user(status_word(), (unsigned short __user *)(d + 2));
1310 FPU_put_user(fpu_tag_word, (unsigned short __user *) (d+4)); 1160 FPU_put_user(fpu_tag_word, (unsigned short __user *)(d + 4));
1311 FPU_put_user(instruction_address.offset, (unsigned short __user *) (d+6)); 1161 FPU_put_user(instruction_address.offset,
1312 FPU_put_user(operand_address.offset, (unsigned short __user *) (d+0x0a)); 1162 (unsigned short __user *)(d + 6));
1313 if ( addr_modes.default_mode == VM86 ) 1163 FPU_put_user(operand_address.offset,
1314 { 1164 (unsigned short __user *)(d + 0x0a));
1315 FPU_put_user((instruction_address.offset & 0xf0000) >> 4, 1165 if (addr_modes.default_mode == VM86) {
 1316 (unsigned short __user *) (d+8)); 1166 FPU_put_user((instruction_address.offset
 1317 FPU_put_user((operand_address.offset & 0xf0000) >> 4, 1167 & 0xf0000) >> 4,
1318 (unsigned short __user *) (d+0x0c)); 1168 (unsigned short __user *)(d + 8));
1319 } 1169 FPU_put_user((operand_address.offset & 0xf0000) >> 4,
1320 else 1170 (unsigned short __user *)(d + 0x0c));
1321 { 1171 } else {
1322 FPU_put_user(instruction_address.selector, (unsigned short __user *) (d+8)); 1172 FPU_put_user(instruction_address.selector,
1323 FPU_put_user(operand_address.selector, (unsigned short __user *) (d+0x0c)); 1173 (unsigned short __user *)(d + 8));
1324 } 1174 FPU_put_user(operand_address.selector,
1325 RE_ENTRANT_CHECK_ON; 1175 (unsigned short __user *)(d + 0x0c));
1326 d += 0x0e; 1176 }
1327 } 1177 RE_ENTRANT_CHECK_ON;
1328 else 1178 d += 0x0e;
1329 { 1179 } else {
1330 RE_ENTRANT_CHECK_OFF; 1180 RE_ENTRANT_CHECK_OFF;
1331 FPU_access_ok(VERIFY_WRITE, d, 7*4); 1181 FPU_access_ok(VERIFY_WRITE, d, 7 * 4);
1332#ifdef PECULIAR_486 1182#ifdef PECULIAR_486
1333 control_word &= ~0xe080; 1183 control_word &= ~0xe080;
1334 /* An 80486 sets nearly all of the reserved bits to 1. */ 1184 /* An 80486 sets nearly all of the reserved bits to 1. */
1335 control_word |= 0xffff0040; 1185 control_word |= 0xffff0040;
1336 partial_status = status_word() | 0xffff0000; 1186 partial_status = status_word() | 0xffff0000;
1337 fpu_tag_word |= 0xffff0000; 1187 fpu_tag_word |= 0xffff0000;
1338 I387.soft.fcs &= ~0xf8000000; 1188 I387.soft.fcs &= ~0xf8000000;
1339 I387.soft.fos |= 0xffff0000; 1189 I387.soft.fos |= 0xffff0000;
1340#endif /* PECULIAR_486 */ 1190#endif /* PECULIAR_486 */
1341 if (__copy_to_user(d, &control_word, 7*4)) 1191 if (__copy_to_user(d, &control_word, 7 * 4))
1342 FPU_abort; 1192 FPU_abort;
1343 RE_ENTRANT_CHECK_ON; 1193 RE_ENTRANT_CHECK_ON;
1344 d += 0x1c; 1194 d += 0x1c;
1345 } 1195 }
1346
1347 control_word |= CW_Exceptions;
1348 partial_status &= ~(SW_Summary | SW_Backward);
1349
1350 return d;
1351}
1352 1196
1197 control_word |= CW_Exceptions;
1198 partial_status &= ~(SW_Summary | SW_Backward);
1199
1200 return d;
1201}
1353 1202
1354void fsave(fpu_addr_modes addr_modes, u_char __user *data_address) 1203void fsave(fpu_addr_modes addr_modes, u_char __user *data_address)
1355{ 1204{
1356 u_char __user *d; 1205 u_char __user *d;
1357 int offset = (top & 7) * 10, other = 80 - offset; 1206 int offset = (top & 7) * 10, other = 80 - offset;
1358 1207
1359 d = fstenv(addr_modes, data_address); 1208 d = fstenv(addr_modes, data_address);
1360 1209
1361 RE_ENTRANT_CHECK_OFF; 1210 RE_ENTRANT_CHECK_OFF;
1362 FPU_access_ok(VERIFY_WRITE,d,80); 1211 FPU_access_ok(VERIFY_WRITE, d, 80);
1363 1212
1364 /* Copy all registers in stack order. */ 1213 /* Copy all registers in stack order. */
1365 if (__copy_to_user(d, register_base+offset, other)) 1214 if (__copy_to_user(d, register_base + offset, other))
1366 FPU_abort; 1215 FPU_abort;
1367 if ( offset ) 1216 if (offset)
1368 if (__copy_to_user(d+other, register_base, offset)) 1217 if (__copy_to_user(d + other, register_base, offset))
1369 FPU_abort; 1218 FPU_abort;
1370 RE_ENTRANT_CHECK_ON; 1219 RE_ENTRANT_CHECK_ON;
1371 1220
1372 finit(); 1221 finit();
1373} 1222}
1374 1223
1375/*===========================================================================*/ 1224/*===========================================================================*/
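fsave() and frstor() above copy the eight 10-byte register images in stack order, so the save area starts with st(0) (the register selected by 'top') and wraps around the physical register file. A small sketch of that wrap-around copy; the buffer names and the check in main() are made up for the example.

#include <string.h>
#include <stdio.h>

static void copy_stack_order(const unsigned char regs[80], int top,
			     unsigned char image[80])
{
	int offset = (top & 7) * 10, other = 80 - offset;

	memcpy(image, regs + offset, other);	/* st(0) .. end of file   */
	if (offset)
		memcpy(image + other, regs, offset);	/* wrap to reg 0  */
}

int main(void)
{
	unsigned char regs[80], image[80];
	int i;

	for (i = 0; i < 80; i++)
		regs[i] = i / 10;		/* mark each physical register */
	copy_stack_order(regs, 3, image);
	printf("%d %d\n", image[0], image[79]);	/* 3 2: st(0) is register 3 */
	return 0;
}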
diff --git a/arch/x86/math-emu/reg_mul.c b/arch/x86/math-emu/reg_mul.c
index 40f50b61bc67..36c37f71f713 100644
--- a/arch/x86/math-emu/reg_mul.c
+++ b/arch/x86/math-emu/reg_mul.c
@@ -20,7 +20,6 @@
20#include "reg_constant.h" 20#include "reg_constant.h"
21#include "fpu_system.h" 21#include "fpu_system.h"
22 22
23
24/* 23/*
25 Multiply two registers to give a register result. 24 Multiply two registers to give a register result.
26 The sources are st(deststnr) and (b,tagb,signb). 25 The sources are st(deststnr) and (b,tagb,signb).
@@ -29,104 +28,88 @@
29/* This routine must be called with non-empty source registers */ 28/* This routine must be called with non-empty source registers */
30int FPU_mul(FPU_REG const *b, u_char tagb, int deststnr, int control_w) 29int FPU_mul(FPU_REG const *b, u_char tagb, int deststnr, int control_w)
31{ 30{
32 FPU_REG *a = &st(deststnr); 31 FPU_REG *a = &st(deststnr);
33 FPU_REG *dest = a; 32 FPU_REG *dest = a;
34 u_char taga = FPU_gettagi(deststnr); 33 u_char taga = FPU_gettagi(deststnr);
35 u_char saved_sign = getsign(dest); 34 u_char saved_sign = getsign(dest);
36 u_char sign = (getsign(a) ^ getsign(b)); 35 u_char sign = (getsign(a) ^ getsign(b));
37 int tag; 36 int tag;
38
39 37
40 if ( !(taga | tagb) ) 38 if (!(taga | tagb)) {
41 { 39 /* Both regs Valid, this should be the most common case. */
42 /* Both regs Valid, this should be the most common case. */
43 40
44 tag = FPU_u_mul(a, b, dest, control_w, sign, exponent(a) + exponent(b)); 41 tag =
45 if ( tag < 0 ) 42 FPU_u_mul(a, b, dest, control_w, sign,
46 { 43 exponent(a) + exponent(b));
47 setsign(dest, saved_sign); 44 if (tag < 0) {
48 return tag; 45 setsign(dest, saved_sign);
46 return tag;
47 }
48 FPU_settagi(deststnr, tag);
49 return tag;
49 } 50 }
50 FPU_settagi(deststnr, tag);
51 return tag;
52 }
53 51
54 if ( taga == TAG_Special ) 52 if (taga == TAG_Special)
55 taga = FPU_Special(a); 53 taga = FPU_Special(a);
56 if ( tagb == TAG_Special ) 54 if (tagb == TAG_Special)
57 tagb = FPU_Special(b); 55 tagb = FPU_Special(b);
58 56
59 if ( ((taga == TAG_Valid) && (tagb == TW_Denormal)) 57 if (((taga == TAG_Valid) && (tagb == TW_Denormal))
60 || ((taga == TW_Denormal) && (tagb == TAG_Valid)) 58 || ((taga == TW_Denormal) && (tagb == TAG_Valid))
61 || ((taga == TW_Denormal) && (tagb == TW_Denormal)) ) 59 || ((taga == TW_Denormal) && (tagb == TW_Denormal))) {
62 { 60 FPU_REG x, y;
63 FPU_REG x, y; 61 if (denormal_operand() < 0)
64 if ( denormal_operand() < 0 ) 62 return FPU_Exception;
65 return FPU_Exception;
66
67 FPU_to_exp16(a, &x);
68 FPU_to_exp16(b, &y);
69 tag = FPU_u_mul(&x, &y, dest, control_w, sign,
70 exponent16(&x) + exponent16(&y));
71 if ( tag < 0 )
72 {
73 setsign(dest, saved_sign);
74 return tag;
75 }
76 FPU_settagi(deststnr, tag);
77 return tag;
78 }
79 else if ( (taga <= TW_Denormal) && (tagb <= TW_Denormal) )
80 {
81 if ( ((tagb == TW_Denormal) || (taga == TW_Denormal))
82 && (denormal_operand() < 0) )
83 return FPU_Exception;
84 63
85 /* Must have either both arguments == zero, or 64 FPU_to_exp16(a, &x);
86 one valid and the other zero. 65 FPU_to_exp16(b, &y);
87 The result is therefore zero. */ 66 tag = FPU_u_mul(&x, &y, dest, control_w, sign,
88 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); 67 exponent16(&x) + exponent16(&y));
89 /* The 80486 book says that the answer is +0, but a real 68 if (tag < 0) {
90 80486 behaves this way. 69 setsign(dest, saved_sign);
91 IEEE-754 apparently says it should be this way. */ 70 return tag;
92 setsign(dest, sign); 71 }
93 return TAG_Zero; 72 FPU_settagi(deststnr, tag);
94 } 73 return tag;
95 /* Must have infinities, NaNs, etc */ 74 } else if ((taga <= TW_Denormal) && (tagb <= TW_Denormal)) {
96 else if ( (taga == TW_NaN) || (tagb == TW_NaN) ) 75 if (((tagb == TW_Denormal) || (taga == TW_Denormal))
97 { 76 && (denormal_operand() < 0))
98 return real_2op_NaN(b, tagb, deststnr, &st(0)); 77 return FPU_Exception;
99 }
100 else if ( ((taga == TW_Infinity) && (tagb == TAG_Zero))
101 || ((tagb == TW_Infinity) && (taga == TAG_Zero)) )
102 {
103 return arith_invalid(deststnr); /* Zero*Infinity is invalid */
104 }
105 else if ( ((taga == TW_Denormal) || (tagb == TW_Denormal))
106 && (denormal_operand() < 0) )
107 {
108 return FPU_Exception;
109 }
110 else if (taga == TW_Infinity)
111 {
112 FPU_copy_to_regi(a, TAG_Special, deststnr);
113 setsign(dest, sign);
114 return TAG_Special;
115 }
116 else if (tagb == TW_Infinity)
117 {
118 FPU_copy_to_regi(b, TAG_Special, deststnr);
119 setsign(dest, sign);
120 return TAG_Special;
121 }
122 78
79 /* Must have either both arguments == zero, or
80 one valid and the other zero.
81 The result is therefore zero. */
82 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
83 /* The 80486 book says that the answer is +0, but a real
84 80486 behaves this way.
85 IEEE-754 apparently says it should be this way. */
86 setsign(dest, sign);
87 return TAG_Zero;
88 }
89 /* Must have infinities, NaNs, etc */
90 else if ((taga == TW_NaN) || (tagb == TW_NaN)) {
91 return real_2op_NaN(b, tagb, deststnr, &st(0));
92 } else if (((taga == TW_Infinity) && (tagb == TAG_Zero))
93 || ((tagb == TW_Infinity) && (taga == TAG_Zero))) {
94 return arith_invalid(deststnr); /* Zero*Infinity is invalid */
95 } else if (((taga == TW_Denormal) || (tagb == TW_Denormal))
96 && (denormal_operand() < 0)) {
97 return FPU_Exception;
98 } else if (taga == TW_Infinity) {
99 FPU_copy_to_regi(a, TAG_Special, deststnr);
100 setsign(dest, sign);
101 return TAG_Special;
102 } else if (tagb == TW_Infinity) {
103 FPU_copy_to_regi(b, TAG_Special, deststnr);
104 setsign(dest, sign);
105 return TAG_Special;
106 }
123#ifdef PARANOID 107#ifdef PARANOID
124 else 108 else {
125 { 109 EXCEPTION(EX_INTERNAL | 0x102);
126 EXCEPTION(EX_INTERNAL|0x102); 110 return FPU_Exception;
127 return FPU_Exception; 111 }
128 } 112#endif /* PARANOID */
129#endif /* PARANOID */
130 113
131 return 0; 114 return 0;
132} 115}
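Setting aside the tag plumbing, FPU_mul() above resolves the special operands in a fixed order: NaNs propagate, zero times infinity is an invalid operation, a remaining infinity or zero wins, and the result sign is always the XOR of the operand signs. A compact restatement of that ordering (the enum and helper are made up for the example):

#include <stdio.h>

enum cls { ZERO, FINITE, INF, NAN_ };

static const char *mul_special(enum cls a, enum cls b)
{
	if (a == NAN_ || b == NAN_)
		return "propagate NaN";
	if ((a == ZERO && b == INF) || (a == INF && b == ZERO))
		return "invalid operation";	/* 0 * inf */
	if (a == INF || b == INF)
		return "infinity, sign = signa ^ signb";
	if (a == ZERO || b == ZERO)
		return "zero, sign = signa ^ signb";
	return "ordinary multiply";
}

int main(void)
{
	printf("%s\n", mul_special(ZERO, INF));	/* invalid operation */
	return 0;
}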
diff --git a/arch/x86/math-emu/status_w.h b/arch/x86/math-emu/status_w.h
index 59e73302aa60..54a3f226982d 100644
--- a/arch/x86/math-emu/status_w.h
+++ b/arch/x86/math-emu/status_w.h
@@ -10,7 +10,7 @@
10#ifndef _STATUS_H_ 10#ifndef _STATUS_H_
11#define _STATUS_H_ 11#define _STATUS_H_
12 12
13#include "fpu_emu.h" /* for definition of PECULIAR_486 */ 13#include "fpu_emu.h" /* for definition of PECULIAR_486 */
14 14
15#ifdef __ASSEMBLY__ 15#ifdef __ASSEMBLY__
16#define Const__(x) $##x 16#define Const__(x) $##x
@@ -34,7 +34,7 @@
34#define SW_Denorm_Op Const__(0x0002) /* denormalized operand */ 34#define SW_Denorm_Op Const__(0x0002) /* denormalized operand */
35#define SW_Invalid Const__(0x0001) /* invalid operation */ 35#define SW_Invalid Const__(0x0001) /* invalid operation */
36 36
37#define SW_Exc_Mask Const__(0x27f) /* Status word exception bit mask */ 37#define SW_Exc_Mask Const__(0x27f) /* Status word exception bit mask */
38 38
39#ifndef __ASSEMBLY__ 39#ifndef __ASSEMBLY__
40 40
@@ -50,8 +50,8 @@
50 ((partial_status & ~SW_Top & 0xffff) | ((top << SW_Top_Shift) & SW_Top)) 50 ((partial_status & ~SW_Top & 0xffff) | ((top << SW_Top_Shift) & SW_Top))
51static inline void setcc(int cc) 51static inline void setcc(int cc)
52{ 52{
53 partial_status &= ~(SW_C0|SW_C1|SW_C2|SW_C3); 53 partial_status &= ~(SW_C0 | SW_C1 | SW_C2 | SW_C3);
54 partial_status |= (cc) & (SW_C0|SW_C1|SW_C2|SW_C3); 54 partial_status |= (cc) & (SW_C0 | SW_C1 | SW_C2 | SW_C3);
55} 55}
56 56
57#ifdef PECULIAR_486 57#ifdef PECULIAR_486
diff --git a/arch/x86/mm/Makefile_32 b/arch/x86/mm/Makefile_32
index 362b4ad082de..c36ae88bb543 100644
--- a/arch/x86/mm/Makefile_32
+++ b/arch/x86/mm/Makefile_32
@@ -2,9 +2,8 @@
2# Makefile for the linux i386-specific parts of the memory manager. 2# Makefile for the linux i386-specific parts of the memory manager.
3# 3#
4 4
5obj-y := init_32.o pgtable_32.o fault_32.o ioremap_32.o extable_32.o pageattr_32.o mmap_32.o 5obj-y := init_32.o pgtable_32.o fault.o ioremap.o extable.o pageattr.o mmap.o
6 6
7obj-$(CONFIG_NUMA) += discontig_32.o 7obj-$(CONFIG_NUMA) += discontig_32.o
8obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o 8obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
9obj-$(CONFIG_HIGHMEM) += highmem_32.o 9obj-$(CONFIG_HIGHMEM) += highmem_32.o
10obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap_32.o
diff --git a/arch/x86/mm/Makefile_64 b/arch/x86/mm/Makefile_64
index 6bcb47945b87..688c8c28ac8f 100644
--- a/arch/x86/mm/Makefile_64
+++ b/arch/x86/mm/Makefile_64
@@ -2,9 +2,8 @@
2# Makefile for the linux x86_64-specific parts of the memory manager. 2# Makefile for the linux x86_64-specific parts of the memory manager.
3# 3#
4 4
5obj-y := init_64.o fault_64.o ioremap_64.o extable_64.o pageattr_64.o mmap_64.o 5obj-y := init_64.o fault.o ioremap.o extable.o pageattr.o mmap.o
6obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o 6obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
7obj-$(CONFIG_NUMA) += numa_64.o 7obj-$(CONFIG_NUMA) += numa_64.o
8obj-$(CONFIG_K8_NUMA) += k8topology_64.o 8obj-$(CONFIG_K8_NUMA) += k8topology_64.o
9obj-$(CONFIG_ACPI_NUMA) += srat_64.o 9obj-$(CONFIG_ACPI_NUMA) += srat_64.o
10
diff --git a/arch/x86/mm/boot_ioremap_32.c b/arch/x86/mm/boot_ioremap_32.c
deleted file mode 100644
index f14da2a53ece..000000000000
--- a/arch/x86/mm/boot_ioremap_32.c
+++ /dev/null
@@ -1,100 +0,0 @@
1/*
2 * arch/i386/mm/boot_ioremap.c
3 *
4 * Re-map functions for early boot-time before paging_init() when the
5 * boot-time pagetables are still in use
6 *
7 * Written by Dave Hansen <haveblue@us.ibm.com>
8 */
9
10
11/*
12 * We need to use the 2-level pagetable functions, but CONFIG_X86_PAE
13 * keeps that from happening. If anyone has a better way, I'm listening.
14 *
15 * boot_pte_t is defined only if this all works correctly
16 */
17
18#undef CONFIG_X86_PAE
19#undef CONFIG_PARAVIRT
20#include <asm/page.h>
21#include <asm/pgtable.h>
22#include <asm/tlbflush.h>
23#include <linux/init.h>
24#include <linux/stddef.h>
25
26/*
27 * I'm cheating here. It is known that the two boot PTE pages are
28 * allocated next to each other. I'm pretending that they're just
29 * one big array.
30 */
31
32#define BOOT_PTE_PTRS (PTRS_PER_PTE*2)
33
34static unsigned long boot_pte_index(unsigned long vaddr)
35{
36 return __pa(vaddr) >> PAGE_SHIFT;
37}
38
39static inline boot_pte_t* boot_vaddr_to_pte(void *address)
40{
41 boot_pte_t* boot_pg = (boot_pte_t*)pg0;
42 return &boot_pg[boot_pte_index((unsigned long)address)];
43}
44
45/*
46 * This is only for a caller who is clever enough to page-align
47 * phys_addr and virtual_source, and who also has a preference
48 * about which virtual address from which to steal ptes
49 */
50static void __boot_ioremap(unsigned long phys_addr, unsigned long nrpages,
51 void* virtual_source)
52{
53 boot_pte_t* pte;
54 int i;
55 char *vaddr = virtual_source;
56
57 pte = boot_vaddr_to_pte(virtual_source);
58 for (i=0; i < nrpages; i++, phys_addr += PAGE_SIZE, pte++) {
59 set_pte(pte, pfn_pte(phys_addr>>PAGE_SHIFT, PAGE_KERNEL));
60 __flush_tlb_one(&vaddr[i*PAGE_SIZE]);
61 }
62}
63
64/* the virtual space we're going to remap comes from this array */
65#define BOOT_IOREMAP_PAGES 4
66#define BOOT_IOREMAP_SIZE (BOOT_IOREMAP_PAGES*PAGE_SIZE)
67static __initdata char boot_ioremap_space[BOOT_IOREMAP_SIZE]
68 __attribute__ ((aligned (PAGE_SIZE)));
69
70/*
71 * This only applies to things which need to ioremap before paging_init()
72 * bt_ioremap() and plain ioremap() are both useless at this point.
73 *
74 * When used, we're still using the boot-time pagetables, which only
75 * have 2 PTE pages mapping the first 8MB
76 *
77 * There is no unmap. The boot-time PTE pages aren't used after boot.
78 * If you really want the space back, just remap it yourself.
79 * boot_ioremap(&ioremap_space-PAGE_OFFSET, BOOT_IOREMAP_SIZE)
80 */
81__init void* boot_ioremap(unsigned long phys_addr, unsigned long size)
82{
83 unsigned long last_addr, offset;
84 unsigned int nrpages;
85
86 last_addr = phys_addr + size - 1;
87
88 /* page align the requested address */
89 offset = phys_addr & ~PAGE_MASK;
90 phys_addr &= PAGE_MASK;
91 size = PAGE_ALIGN(last_addr) - phys_addr;
92
93 nrpages = size >> PAGE_SHIFT;
94 if (nrpages > BOOT_IOREMAP_PAGES)
95 return NULL;
96
97 __boot_ioremap(phys_addr, nrpages, boot_ioremap_space);
98
99 return &boot_ioremap_space[offset];
100}
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 13a474d3c6e9..04b1d20e2613 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -32,6 +32,7 @@
 #include <linux/kexec.h>
 #include <linux/pfn.h>
 #include <linux/swap.h>
+#include <linux/acpi.h>
 
 #include <asm/e820.h>
 #include <asm/setup.h>
@@ -103,14 +104,10 @@ extern unsigned long highend_pfn, highstart_pfn;
 
 #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
 
-static unsigned long node_remap_start_pfn[MAX_NUMNODES];
 unsigned long node_remap_size[MAX_NUMNODES];
-static unsigned long node_remap_offset[MAX_NUMNODES];
 static void *node_remap_start_vaddr[MAX_NUMNODES];
 void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
 
-static void *node_remap_end_vaddr[MAX_NUMNODES];
-static void *node_remap_alloc_vaddr[MAX_NUMNODES];
 static unsigned long kva_start_pfn;
 static unsigned long kva_pages;
 /*
@@ -167,6 +164,22 @@ static void __init allocate_pgdat(int nid)
 	}
 }
 
+#ifdef CONFIG_DISCONTIGMEM
+/*
+ * In the discontig memory model, a portion of the kernel virtual area (KVA)
+ * is reserved and portions of nodes are mapped using it. This is to allow
+ * node-local memory to be allocated for structures that would normally require
+ * ZONE_NORMAL. The memory is allocated with alloc_remap() and callers
+ * should be prepared to allocate from the bootmem allocator instead. This KVA
+ * mechanism is incompatible with SPARSEMEM as it makes assumptions about the
+ * layout of memory that are broken if alloc_remap() succeeds for some of the
+ * map and fails for others
+ */
+static unsigned long node_remap_start_pfn[MAX_NUMNODES];
+static void *node_remap_end_vaddr[MAX_NUMNODES];
+static void *node_remap_alloc_vaddr[MAX_NUMNODES];
+static unsigned long node_remap_offset[MAX_NUMNODES];
+
 void *alloc_remap(int nid, unsigned long size)
 {
 	void *allocation = node_remap_alloc_vaddr[nid];
@@ -263,11 +276,46 @@ static unsigned long calculate_numa_remap_pages(void)
 	return reserve_pages;
 }
 
+static void init_remap_allocator(int nid)
+{
+	node_remap_start_vaddr[nid] = pfn_to_kaddr(
+			kva_start_pfn + node_remap_offset[nid]);
+	node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
+		(node_remap_size[nid] * PAGE_SIZE);
+	node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
+		ALIGN(sizeof(pg_data_t), PAGE_SIZE);
+
+	printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
+		(ulong) node_remap_start_vaddr[nid],
+		(ulong) pfn_to_kaddr(highstart_pfn
+		   + node_remap_offset[nid] + node_remap_size[nid]));
+}
+#else
+void *alloc_remap(int nid, unsigned long size)
+{
+	return NULL;
+}
+
+static unsigned long calculate_numa_remap_pages(void)
+{
+	return 0;
+}
+
+static void init_remap_allocator(int nid)
+{
+}
+
+void __init remap_numa_kva(void)
+{
+}
+#endif /* CONFIG_DISCONTIGMEM */
+
 extern void setup_bootmem_allocator(void);
 unsigned long __init setup_memory(void)
 {
 	int nid;
 	unsigned long system_start_pfn, system_max_low_pfn;
+	unsigned long wasted_pages;
 
 	/*
 	 * When mapping a NUMA machine we allocate the node_mem_map arrays
@@ -288,11 +336,18 @@ unsigned long __init setup_memory(void)
 
 #ifdef CONFIG_BLK_DEV_INITRD
 	/* Numa kva area is below the initrd */
-	if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image)
-		kva_start_pfn = PFN_DOWN(boot_params.hdr.ramdisk_image)
+	if (initrd_start)
+		kva_start_pfn = PFN_DOWN(initrd_start - PAGE_OFFSET)
 			- kva_pages;
 #endif
-	kva_start_pfn -= kva_start_pfn & (PTRS_PER_PTE-1);
+
+	/*
+	 * We waste pages past the end of the KVA for no good reason other
+	 * than how it is located. This is bad.
+	 */
+	wasted_pages = kva_start_pfn & (PTRS_PER_PTE-1);
+	kva_start_pfn -= wasted_pages;
+	kva_pages += wasted_pages;
 
 	system_max_low_pfn = max_low_pfn = find_max_low_pfn();
 	printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
@@ -318,19 +373,9 @@ unsigned long __init setup_memory(void)
 	printk("Low memory ends at vaddr %08lx\n",
 			(ulong) pfn_to_kaddr(max_low_pfn));
 	for_each_online_node(nid) {
-		node_remap_start_vaddr[nid] = pfn_to_kaddr(
-				kva_start_pfn + node_remap_offset[nid]);
-		/* Init the node remap allocator */
-		node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
-			(node_remap_size[nid] * PAGE_SIZE);
-		node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
-			ALIGN(sizeof(pg_data_t), PAGE_SIZE);
+		init_remap_allocator(nid);
 
 		allocate_pgdat(nid);
-		printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
-			(ulong) node_remap_start_vaddr[nid],
-			(ulong) pfn_to_kaddr(highstart_pfn
-			 + node_remap_offset[nid] + node_remap_size[nid]));
 	}
 	printk("High memory starts at vaddr %08lx\n",
 			(ulong) pfn_to_kaddr(highstart_pfn));
@@ -345,7 +390,8 @@ unsigned long __init setup_memory(void)
 
 void __init numa_kva_reserve(void)
 {
-	reserve_bootmem(PFN_PHYS(kva_start_pfn),PFN_PHYS(kva_pages));
+	if (kva_pages)
+		reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages));
 }
 
 void __init zone_sizes_init(void)
@@ -430,3 +476,29 @@ int memory_add_physaddr_to_nid(u64 addr)
 
 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 #endif
+
+#ifndef CONFIG_HAVE_ARCH_PARSE_SRAT
+/*
+ * XXX FIXME: Make SLIT table parsing available to 32-bit NUMA
+ *
+ * These stub functions are needed to compile 32-bit NUMA when SRAT is
+ * not set. There are functions in srat_64.c for parsing this table
+ * and it may be possible to make them common functions.
+ */
+void acpi_numa_slit_init (struct acpi_table_slit *slit)
+{
+	printk(KERN_INFO "ACPI: No support for parsing SLIT table\n");
+}
+
+void acpi_numa_processor_affinity_init (struct acpi_srat_cpu_affinity *pa)
+{
+}
+
+void acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma)
+{
+}
+
+void acpi_numa_arch_fixup(void)
+{
+}
+#endif /* CONFIG_HAVE_ARCH_PARSE_SRAT */
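
The CONFIG_DISCONTIGMEM block added above spells out the alloc_remap() contract: the per-node KVA remap window may be unable to satisfy a request (or may be compiled out entirely and always return NULL), in which case the caller is expected to fall back to the bootmem allocator. A minimal userspace sketch of that caller-side pattern; the pool size and the calloc() stand-in for the bootmem fallback are illustrative only, not part of the patch:

#include <stdio.h>
#include <stdlib.h>

static char remap_pool[4096];   /* stand-in for one node's KVA remap window */
static size_t remap_used;

/* May fail: that is the contract the new comment documents. */
static void *alloc_remap_model(size_t size)
{
	if (remap_used + size > sizeof(remap_pool))
		return NULL;
	void *p = remap_pool + remap_used;
	remap_used += size;
	return p;
}

/* Caller-side pattern: try the remap window, fall back to the general allocator. */
static void *alloc_node_data(size_t size, int *from_remap)
{
	void *map = alloc_remap_model(size);
	*from_remap = (map != NULL);
	if (!map)
		map = calloc(1, size);  /* stand-in for the bootmem fallback */
	return map;
}

int main(void)
{
	int hit;

	alloc_node_data(1024, &hit);
	printf("first allocation from remap window: %s\n", hit ? "yes" : "no");

	void *big = alloc_node_data(8192, &hit);
	printf("oversized allocation from remap window: %s\n", hit ? "yes" : "no");
	if (!hit)
		free(big);
	return 0;
}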
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
new file mode 100644
index 000000000000..7e8db53528a7
--- /dev/null
+++ b/arch/x86/mm/extable.c
@@ -0,0 +1,62 @@
1#include <linux/module.h>
2#include <linux/spinlock.h>
3#include <asm/uaccess.h>
4
5
6int fixup_exception(struct pt_regs *regs)
7{
8 const struct exception_table_entry *fixup;
9
10#ifdef CONFIG_PNPBIOS
11 if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
12 extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
13 extern u32 pnp_bios_is_utter_crap;
14 pnp_bios_is_utter_crap = 1;
15 printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n");
16 __asm__ volatile(
17 "movl %0, %%esp\n\t"
18 "jmp *%1\n\t"
19 : : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip));
20 panic("do_trap: can't hit this");
21 }
22#endif
23
24 fixup = search_exception_tables(regs->ip);
25 if (fixup) {
26 regs->ip = fixup->fixup;
27 return 1;
28 }
29
30 return 0;
31}
32
33#ifdef CONFIG_X86_64
34/*
35 * Need to define our own search_extable on X86_64 to work around
36 * a B stepping K8 bug.
37 */
38const struct exception_table_entry *
39search_extable(const struct exception_table_entry *first,
40 const struct exception_table_entry *last,
41 unsigned long value)
42{
43 /* B stepping K8 bug */
44 if ((value >> 32) == 0)
45 value |= 0xffffffffUL << 32;
46
47 while (first <= last) {
48 const struct exception_table_entry *mid;
49 long diff;
50
51 mid = (last - first) / 2 + first;
52 diff = mid->insn - value;
53 if (diff == 0)
54 return mid;
55 else if (diff < 0)
56 first = mid+1;
57 else
58 last = mid-1;
59 }
60 return NULL;
61}
62#endif
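
The search_extable() above folds in the B-stepping K8 workaround: a lookup key whose upper 32 bits read back as zero is forced into the canonical kernel range before the binary search, so a truncated RIP still matches its exception-table entry. A standalone sketch of the same lookup; the table addresses below are made up for illustration:

#include <stdio.h>
#include <stdint.h>

struct extable_entry { uint64_t insn; uint64_t fixup; };

static const struct extable_entry table[] = {
	{ 0xffffffff81000100ULL, 0xffffffff81900100ULL },
	{ 0xffffffff81000200ULL, 0xffffffff81900200ULL },
	{ 0xffffffff81000300ULL, 0xffffffff81900300ULL },
};

static const struct extable_entry *search(const struct extable_entry *first,
					  const struct extable_entry *last,
					  uint64_t value)
{
	if ((value >> 32) == 0)                 /* the B-stepping K8 workaround */
		value |= 0xffffffffULL << 32;

	while (first <= last) {
		const struct extable_entry *mid = first + (last - first) / 2;
		int64_t diff = (int64_t)(mid->insn - value);

		if (diff == 0)
			return mid;
		if (diff < 0)
			first = mid + 1;
		else
			last = mid - 1;
	}
	return NULL;
}

int main(void)
{
	/* A truncated address (upper half lost) still finds its entry. */
	const struct extable_entry *e = search(table, table + 2, 0x81000200ULL);
	printf("fixup: %#llx\n", e ? (unsigned long long)e->fixup : 0ULL);
	return 0;
}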
diff --git a/arch/x86/mm/extable_32.c b/arch/x86/mm/extable_32.c
deleted file mode 100644
index 0ce4f22a2635..000000000000
--- a/arch/x86/mm/extable_32.c
+++ /dev/null
@@ -1,35 +0,0 @@
1/*
2 * linux/arch/i386/mm/extable.c
3 */
4
5#include <linux/module.h>
6#include <linux/spinlock.h>
7#include <asm/uaccess.h>
8
9int fixup_exception(struct pt_regs *regs)
10{
11 const struct exception_table_entry *fixup;
12
13#ifdef CONFIG_PNPBIOS
14 if (unlikely(SEGMENT_IS_PNP_CODE(regs->xcs)))
15 {
16 extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
17 extern u32 pnp_bios_is_utter_crap;
18 pnp_bios_is_utter_crap = 1;
19 printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n");
20 __asm__ volatile(
21 "movl %0, %%esp\n\t"
22 "jmp *%1\n\t"
23 : : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip));
24 panic("do_trap: can't hit this");
25 }
26#endif
27
28 fixup = search_exception_tables(regs->eip);
29 if (fixup) {
30 regs->eip = fixup->fixup;
31 return 1;
32 }
33
34 return 0;
35}
diff --git a/arch/x86/mm/extable_64.c b/arch/x86/mm/extable_64.c
deleted file mode 100644
index 79ac6e7100af..000000000000
--- a/arch/x86/mm/extable_64.c
+++ /dev/null
@@ -1,34 +0,0 @@
1/*
2 * linux/arch/x86_64/mm/extable.c
3 */
4
5#include <linux/module.h>
6#include <linux/spinlock.h>
7#include <linux/init.h>
8#include <asm/uaccess.h>
9
10/* Simple binary search */
11const struct exception_table_entry *
12search_extable(const struct exception_table_entry *first,
13 const struct exception_table_entry *last,
14 unsigned long value)
15{
16 /* Work around a B stepping K8 bug */
17 if ((value >> 32) == 0)
18 value |= 0xffffffffUL << 32;
19
20 while (first <= last) {
21 const struct exception_table_entry *mid;
22 long diff;
23
24 mid = (last - first) / 2 + first;
25 diff = mid->insn - value;
26 if (diff == 0)
27 return mid;
28 else if (diff < 0)
29 first = mid+1;
30 else
31 last = mid-1;
32 }
33 return NULL;
34}
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
new file mode 100644
index 000000000000..e28cc5277b16
--- /dev/null
+++ b/arch/x86/mm/fault.c
@@ -0,0 +1,986 @@
1/*
2 * Copyright (C) 1995 Linus Torvalds
3 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
4 */
5
6#include <linux/signal.h>
7#include <linux/sched.h>
8#include <linux/kernel.h>
9#include <linux/errno.h>
10#include <linux/string.h>
11#include <linux/types.h>
12#include <linux/ptrace.h>
13#include <linux/mman.h>
14#include <linux/mm.h>
15#include <linux/smp.h>
16#include <linux/interrupt.h>
17#include <linux/init.h>
18#include <linux/tty.h>
19#include <linux/vt_kern.h> /* For unblank_screen() */
20#include <linux/compiler.h>
21#include <linux/highmem.h>
22#include <linux/bootmem.h> /* for max_low_pfn */
23#include <linux/vmalloc.h>
24#include <linux/module.h>
25#include <linux/kprobes.h>
26#include <linux/uaccess.h>
27#include <linux/kdebug.h>
28
29#include <asm/system.h>
30#include <asm/desc.h>
31#include <asm/segment.h>
32#include <asm/pgalloc.h>
33#include <asm/smp.h>
34#include <asm/tlbflush.h>
35#include <asm/proto.h>
36#include <asm-generic/sections.h>
37
38/*
39 * Page fault error code bits
40 * bit 0 == 0 means no page found, 1 means protection fault
41 * bit 1 == 0 means read, 1 means write
42 * bit 2 == 0 means kernel, 1 means user-mode
43 * bit 3 == 1 means use of reserved bit detected
44 * bit 4 == 1 means fault was an instruction fetch
45 */
46#define PF_PROT (1<<0)
47#define PF_WRITE (1<<1)
48#define PF_USER (1<<2)
49#define PF_RSVD (1<<3)
50#define PF_INSTR (1<<4)
51
52static inline int notify_page_fault(struct pt_regs *regs)
53{
54#ifdef CONFIG_KPROBES
55 int ret = 0;
56
57 /* kprobe_running() needs smp_processor_id() */
58#ifdef CONFIG_X86_32
59 if (!user_mode_vm(regs)) {
60#else
61 if (!user_mode(regs)) {
62#endif
63 preempt_disable();
64 if (kprobe_running() && kprobe_fault_handler(regs, 14))
65 ret = 1;
66 preempt_enable();
67 }
68
69 return ret;
70#else
71 return 0;
72#endif
73}
74
75/*
76 * X86_32
77 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
78 * Check that here and ignore it.
79 *
80 * X86_64
81 * Sometimes the CPU reports invalid exceptions on prefetch.
82 * Check that here and ignore it.
83 *
84 * Opcode checker based on code by Richard Brunner
85 */
86static int is_prefetch(struct pt_regs *regs, unsigned long addr,
87 unsigned long error_code)
88{
89 unsigned char *instr;
90 int scan_more = 1;
91 int prefetch = 0;
92 unsigned char *max_instr;
93
94#ifdef CONFIG_X86_32
95 if (!(__supported_pte_mask & _PAGE_NX))
96 return 0;
97#endif
98
99	/* If it was an exec fault on an NX page, ignore */
100 if (error_code & PF_INSTR)
101 return 0;
102
103 instr = (unsigned char *)convert_ip_to_linear(current, regs);
104 max_instr = instr + 15;
105
106 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
107 return 0;
108
109 while (scan_more && instr < max_instr) {
110 unsigned char opcode;
111 unsigned char instr_hi;
112 unsigned char instr_lo;
113
114 if (probe_kernel_address(instr, opcode))
115 break;
116
117 instr_hi = opcode & 0xf0;
118 instr_lo = opcode & 0x0f;
119 instr++;
120
121 switch (instr_hi) {
122 case 0x20:
123 case 0x30:
124 /*
125 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
126 * In X86_64 long mode, the CPU will signal invalid
127 * opcode if some of these prefixes are present so
128 * X86_64 will never get here anyway
129 */
130 scan_more = ((instr_lo & 7) == 0x6);
131 break;
132#ifdef CONFIG_X86_64
133 case 0x40:
134 /*
135 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
136 * Need to figure out under what instruction mode the
137 * instruction was issued. Could check the LDT for lm,
138 * but for now it's good enough to assume that long
139 * mode only uses well known segments or kernel.
140 */
141 scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
142 break;
143#endif
144 case 0x60:
145 /* 0x64 thru 0x67 are valid prefixes in all modes. */
146 scan_more = (instr_lo & 0xC) == 0x4;
147 break;
148 case 0xF0:
149 /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
150 scan_more = !instr_lo || (instr_lo>>1) == 1;
151 break;
152 case 0x00:
153 /* Prefetch instruction is 0x0F0D or 0x0F18 */
154 scan_more = 0;
155
156 if (probe_kernel_address(instr, opcode))
157 break;
158 prefetch = (instr_lo == 0xF) &&
159 (opcode == 0x0D || opcode == 0x18);
160 break;
161 default:
162 scan_more = 0;
163 break;
164 }
165 }
166 return prefetch;
167}
168
169static void force_sig_info_fault(int si_signo, int si_code,
170 unsigned long address, struct task_struct *tsk)
171{
172 siginfo_t info;
173
174 info.si_signo = si_signo;
175 info.si_errno = 0;
176 info.si_code = si_code;
177 info.si_addr = (void __user *)address;
178 force_sig_info(si_signo, &info, tsk);
179}
180
181#ifdef CONFIG_X86_64
182static int bad_address(void *p)
183{
184 unsigned long dummy;
185 return probe_kernel_address((unsigned long *)p, dummy);
186}
187#endif
188
189void dump_pagetable(unsigned long address)
190{
191#ifdef CONFIG_X86_32
192 __typeof__(pte_val(__pte(0))) page;
193
194 page = read_cr3();
195 page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
196#ifdef CONFIG_X86_PAE
197 printk("*pdpt = %016Lx ", page);
198 if ((page >> PAGE_SHIFT) < max_low_pfn
199 && page & _PAGE_PRESENT) {
200 page &= PAGE_MASK;
201 page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
202 & (PTRS_PER_PMD - 1)];
203 printk(KERN_CONT "*pde = %016Lx ", page);
204 page &= ~_PAGE_NX;
205 }
206#else
207 printk("*pde = %08lx ", page);
208#endif
209
210 /*
211 * We must not directly access the pte in the highpte
212 * case if the page table is located in highmem.
213 * And let's rather not kmap-atomic the pte, just in case
214 * it's allocated already.
215 */
216 if ((page >> PAGE_SHIFT) < max_low_pfn
217 && (page & _PAGE_PRESENT)
218 && !(page & _PAGE_PSE)) {
219 page &= PAGE_MASK;
220 page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
221 & (PTRS_PER_PTE - 1)];
222 printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
223 }
224
225 printk("\n");
226#else /* CONFIG_X86_64 */
227 pgd_t *pgd;
228 pud_t *pud;
229 pmd_t *pmd;
230 pte_t *pte;
231
232 pgd = (pgd_t *)read_cr3();
233
234 pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
235 pgd += pgd_index(address);
236 if (bad_address(pgd)) goto bad;
237 printk("PGD %lx ", pgd_val(*pgd));
238 if (!pgd_present(*pgd)) goto ret;
239
240 pud = pud_offset(pgd, address);
241 if (bad_address(pud)) goto bad;
242 printk("PUD %lx ", pud_val(*pud));
243 if (!pud_present(*pud)) goto ret;
244
245 pmd = pmd_offset(pud, address);
246 if (bad_address(pmd)) goto bad;
247 printk("PMD %lx ", pmd_val(*pmd));
248 if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
249
250 pte = pte_offset_kernel(pmd, address);
251 if (bad_address(pte)) goto bad;
252 printk("PTE %lx", pte_val(*pte));
253ret:
254 printk("\n");
255 return;
256bad:
257 printk("BAD\n");
258#endif
259}
260
261#ifdef CONFIG_X86_32
262static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
263{
264 unsigned index = pgd_index(address);
265 pgd_t *pgd_k;
266 pud_t *pud, *pud_k;
267 pmd_t *pmd, *pmd_k;
268
269 pgd += index;
270 pgd_k = init_mm.pgd + index;
271
272 if (!pgd_present(*pgd_k))
273 return NULL;
274
275 /*
276 * set_pgd(pgd, *pgd_k); here would be useless on PAE
277 * and redundant with the set_pmd() on non-PAE. As would
278 * set_pud.
279 */
280
281 pud = pud_offset(pgd, address);
282 pud_k = pud_offset(pgd_k, address);
283 if (!pud_present(*pud_k))
284 return NULL;
285
286 pmd = pmd_offset(pud, address);
287 pmd_k = pmd_offset(pud_k, address);
288 if (!pmd_present(*pmd_k))
289 return NULL;
290 if (!pmd_present(*pmd)) {
291 set_pmd(pmd, *pmd_k);
292 arch_flush_lazy_mmu_mode();
293 } else
294 BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
295 return pmd_k;
296}
297#endif
298
299#ifdef CONFIG_X86_64
300static const char errata93_warning[] =
301KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
302KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
303KERN_ERR "******* Please consider a BIOS update.\n"
304KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
305#endif
306
307/* Workaround for K8 erratum #93 & buggy BIOS.
308 BIOS SMM functions are required to use a specific workaround
309 to avoid corruption of the 64bit RIP register on C stepping K8.
310 A lot of BIOS that didn't get tested properly miss this.
311 The OS sees this as a page fault with the upper 32bits of RIP cleared.
312 Try to work around it here.
313 Note we only handle faults in kernel here.
314 Does nothing for X86_32
315 */
316static int is_errata93(struct pt_regs *regs, unsigned long address)
317{
318#ifdef CONFIG_X86_64
319 static int warned;
320 if (address != regs->ip)
321 return 0;
322 if ((address >> 32) != 0)
323 return 0;
324 address |= 0xffffffffUL << 32;
325 if ((address >= (u64)_stext && address <= (u64)_etext) ||
326 (address >= MODULES_VADDR && address <= MODULES_END)) {
327 if (!warned) {
328 printk(errata93_warning);
329 warned = 1;
330 }
331 regs->ip = address;
332 return 1;
333 }
334#endif
335 return 0;
336}
337
338/*
339 * Work around K8 erratum #100: K8 in compat mode occasionally jumps to illegal
340 * addresses >4GB. We catch this in the page fault handler because these
341 * addresses are not reachable. Just detect this case and return. Any code
342 * segment in LDT is compatibility mode.
343 */
344static int is_errata100(struct pt_regs *regs, unsigned long address)
345{
346#ifdef CONFIG_X86_64
347 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
348 (address >> 32))
349 return 1;
350#endif
351 return 0;
352}
353
354void do_invalid_op(struct pt_regs *, unsigned long);
355
356static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
357{
358#ifdef CONFIG_X86_F00F_BUG
359 unsigned long nr;
360 /*
361 * Pentium F0 0F C7 C8 bug workaround.
362 */
363 if (boot_cpu_data.f00f_bug) {
364 nr = (address - idt_descr.address) >> 3;
365
366 if (nr == 6) {
367 do_invalid_op(regs, 0);
368 return 1;
369 }
370 }
371#endif
372 return 0;
373}
374
375static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
376 unsigned long address)
377{
378#ifdef CONFIG_X86_32
379 if (!oops_may_print())
380 return;
381#endif
382
383#ifdef CONFIG_X86_PAE
384 if (error_code & PF_INSTR) {
385 int level;
386 pte_t *pte = lookup_address(address, &level);
387
388 if (pte && pte_present(*pte) && !pte_exec(*pte))
389 printk(KERN_CRIT "kernel tried to execute "
390 "NX-protected page - exploit attempt? "
391 "(uid: %d)\n", current->uid);
392 }
393#endif
394
395 printk(KERN_ALERT "BUG: unable to handle kernel ");
396 if (address < PAGE_SIZE)
397 printk(KERN_CONT "NULL pointer dereference");
398 else
399 printk(KERN_CONT "paging request");
400#ifdef CONFIG_X86_32
401 printk(KERN_CONT " at %08lx\n", address);
402#else
403 printk(KERN_CONT " at %016lx\n", address);
404#endif
405 printk(KERN_ALERT "IP:");
406 printk_address(regs->ip, 1);
407 dump_pagetable(address);
408}
409
410#ifdef CONFIG_X86_64
411static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
412 unsigned long error_code)
413{
414 unsigned long flags = oops_begin();
415 struct task_struct *tsk;
416
417 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
418 current->comm, address);
419 dump_pagetable(address);
420 tsk = current;
421 tsk->thread.cr2 = address;
422 tsk->thread.trap_no = 14;
423 tsk->thread.error_code = error_code;
424 if (__die("Bad pagetable", regs, error_code))
425 regs = NULL;
426 oops_end(flags, regs, SIGKILL);
427}
428#endif
429
430/*
431 * Handle a spurious fault caused by a stale TLB entry. This allows
432 * us to lazily refresh the TLB when increasing the permissions of a
433 * kernel page (RO -> RW or NX -> X). Doing it eagerly is very
434 * expensive since that implies doing a full cross-processor TLB
435 * flush, even if no stale TLB entries exist on other processors.
436 * There are no security implications to leaving a stale TLB when
437 * increasing the permissions on a page.
438 */
439static int spurious_fault(unsigned long address,
440 unsigned long error_code)
441{
442 pgd_t *pgd;
443 pud_t *pud;
444 pmd_t *pmd;
445 pte_t *pte;
446
447 /* Reserved-bit violation or user access to kernel space? */
448 if (error_code & (PF_USER | PF_RSVD))
449 return 0;
450
451 pgd = init_mm.pgd + pgd_index(address);
452 if (!pgd_present(*pgd))
453 return 0;
454
455 pud = pud_offset(pgd, address);
456 if (!pud_present(*pud))
457 return 0;
458
459 pmd = pmd_offset(pud, address);
460 if (!pmd_present(*pmd))
461 return 0;
462
463 pte = pte_offset_kernel(pmd, address);
464 if (!pte_present(*pte))
465 return 0;
466
467 if ((error_code & PF_WRITE) && !pte_write(*pte))
468 return 0;
469 if ((error_code & PF_INSTR) && !pte_exec(*pte))
470 return 0;
471
472 return 1;
473}
474
475/*
476 * X86_32
477 * Handle a fault on the vmalloc or module mapping area
478 *
479 * X86_64
480 * Handle a fault on the vmalloc area
481 *
482 * This assumes no large pages in there.
483 */
484static int vmalloc_fault(unsigned long address)
485{
486#ifdef CONFIG_X86_32
487 unsigned long pgd_paddr;
488 pmd_t *pmd_k;
489 pte_t *pte_k;
490 /*
491 * Synchronize this task's top level page-table
492 * with the 'reference' page table.
493 *
494 * Do _not_ use "current" here. We might be inside
495 * an interrupt in the middle of a task switch..
496 */
497 pgd_paddr = read_cr3();
498 pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
499 if (!pmd_k)
500 return -1;
501 pte_k = pte_offset_kernel(pmd_k, address);
502 if (!pte_present(*pte_k))
503 return -1;
504 return 0;
505#else
506 pgd_t *pgd, *pgd_ref;
507 pud_t *pud, *pud_ref;
508 pmd_t *pmd, *pmd_ref;
509 pte_t *pte, *pte_ref;
510
511 /* Copy kernel mappings over when needed. This can also
512	   happen within a race in page table update. In the latter
513 case just flush. */
514
515 pgd = pgd_offset(current->mm ?: &init_mm, address);
516 pgd_ref = pgd_offset_k(address);
517 if (pgd_none(*pgd_ref))
518 return -1;
519 if (pgd_none(*pgd))
520 set_pgd(pgd, *pgd_ref);
521 else
522 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
523
524 /* Below here mismatches are bugs because these lower tables
525 are shared */
526
527 pud = pud_offset(pgd, address);
528 pud_ref = pud_offset(pgd_ref, address);
529 if (pud_none(*pud_ref))
530 return -1;
531 if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
532 BUG();
533 pmd = pmd_offset(pud, address);
534 pmd_ref = pmd_offset(pud_ref, address);
535 if (pmd_none(*pmd_ref))
536 return -1;
537 if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
538 BUG();
539 pte_ref = pte_offset_kernel(pmd_ref, address);
540 if (!pte_present(*pte_ref))
541 return -1;
542 pte = pte_offset_kernel(pmd, address);
543 /* Don't use pte_page here, because the mappings can point
544 outside mem_map, and the NUMA hash lookup cannot handle
545 that. */
546 if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
547 BUG();
548 return 0;
549#endif
550}
551
552int show_unhandled_signals = 1;
553
554/*
555 * This routine handles page faults. It determines the address,
556 * and the problem, and then passes it off to one of the appropriate
557 * routines.
558 */
559#ifdef CONFIG_X86_64
560asmlinkage
561#endif
562void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
563{
564 struct task_struct *tsk;
565 struct mm_struct *mm;
566 struct vm_area_struct *vma;
567 unsigned long address;
568 int write, si_code;
569 int fault;
570#ifdef CONFIG_X86_64
571 unsigned long flags;
572#endif
573
574 /*
575 * We can fault from pretty much anywhere, with unknown IRQ state.
576 */
577 trace_hardirqs_fixup();
578
579 tsk = current;
580 mm = tsk->mm;
581 prefetchw(&mm->mmap_sem);
582
583 /* get the address */
584 address = read_cr2();
585
586 si_code = SEGV_MAPERR;
587
588 if (notify_page_fault(regs))
589 return;
590
591 /*
592 * We fault-in kernel-space virtual memory on-demand. The
593 * 'reference' page table is init_mm.pgd.
594 *
595 * NOTE! We MUST NOT take any locks for this case. We may
596 * be in an interrupt or a critical region, and should
597 * only copy the information from the master page table,
598 * nothing more.
599 *
600 * This verifies that the fault happens in kernel space
601 * (error_code & 4) == 0, and that the fault was not a
602 * protection error (error_code & 9) == 0.
603 */
604#ifdef CONFIG_X86_32
605 if (unlikely(address >= TASK_SIZE)) {
606 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
607 vmalloc_fault(address) >= 0)
608 return;
609
610 /* Can handle a stale RO->RW TLB */
611 if (spurious_fault(address, error_code))
612 return;
613
614 /*
615 * Don't take the mm semaphore here. If we fixup a prefetch
616 * fault we could otherwise deadlock.
617 */
618 goto bad_area_nosemaphore;
619 }
620
621 /* It's safe to allow irq's after cr2 has been saved and the vmalloc
622 fault has been handled. */
623 if (regs->flags & (X86_EFLAGS_IF|VM_MASK))
624 local_irq_enable();
625
626 /*
627 * If we're in an interrupt, have no user context or are running in an
628 * atomic region then we must not take the fault.
629 */
630 if (in_atomic() || !mm)
631 goto bad_area_nosemaphore;
632#else /* CONFIG_X86_64 */
633 if (unlikely(address >= TASK_SIZE64)) {
634 /*
635 * Don't check for the module range here: its PML4
636 * is always initialized because it's shared with the main
637 * kernel text. Only vmalloc may need PML4 syncups.
638 */
639 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
640 ((address >= VMALLOC_START && address < VMALLOC_END))) {
641 if (vmalloc_fault(address) >= 0)
642 return;
643 }
644
645 /* Can handle a stale RO->RW TLB */
646 if (spurious_fault(address, error_code))
647 return;
648
649 /*
650 * Don't take the mm semaphore here. If we fixup a prefetch
651 * fault we could otherwise deadlock.
652 */
653 goto bad_area_nosemaphore;
654 }
655 if (likely(regs->flags & X86_EFLAGS_IF))
656 local_irq_enable();
657
658 if (unlikely(error_code & PF_RSVD))
659 pgtable_bad(address, regs, error_code);
660
661 /*
662 * If we're in an interrupt, have no user context or are running in an
663 * atomic region then we must not take the fault.
664 */
665 if (unlikely(in_atomic() || !mm))
666 goto bad_area_nosemaphore;
667
668 /*
669 * User-mode registers count as a user access even for any
670 * potential system fault or CPU buglet.
671 */
672 if (user_mode_vm(regs))
673 error_code |= PF_USER;
674again:
675#endif
676 /* When running in the kernel we expect faults to occur only to
677 * addresses in user space. All other faults represent errors in the
678 * kernel and should generate an OOPS. Unfortunately, in the case of an
679 * erroneous fault occurring in a code path which already holds mmap_sem
680 * we will deadlock attempting to validate the fault against the
681 * address space. Luckily the kernel only validly references user
682 * space from well defined areas of code, which are listed in the
683 * exceptions table.
684 *
685 * As the vast majority of faults will be valid we will only perform
686 * the source reference check when there is a possibility of a deadlock.
687 * Attempt to lock the address space, if we cannot we then validate the
688 * source. If this is invalid we can skip the address space check,
689 * thus avoiding the deadlock.
690 */
691 if (!down_read_trylock(&mm->mmap_sem)) {
692 if ((error_code & PF_USER) == 0 &&
693 !search_exception_tables(regs->ip))
694 goto bad_area_nosemaphore;
695 down_read(&mm->mmap_sem);
696 }
697
698 vma = find_vma(mm, address);
699 if (!vma)
700 goto bad_area;
701 if (vma->vm_start <= address)
702 goto good_area;
703 if (!(vma->vm_flags & VM_GROWSDOWN))
704 goto bad_area;
705 if (error_code & PF_USER) {
706 /*
707 * Accessing the stack below %sp is always a bug.
708 * The large cushion allows instructions like enter
709 * and pusha to work. ("enter $65535,$31" pushes
710 * 32 pointers and then decrements %sp by 65535.)
711 */
712 if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
713 goto bad_area;
714 }
715 if (expand_stack(vma, address))
716 goto bad_area;
717/*
718 * Ok, we have a good vm_area for this memory access, so
719 * we can handle it..
720 */
721good_area:
722 si_code = SEGV_ACCERR;
723 write = 0;
724 switch (error_code & (PF_PROT|PF_WRITE)) {
725 default: /* 3: write, present */
726 /* fall through */
727 case PF_WRITE: /* write, not present */
728 if (!(vma->vm_flags & VM_WRITE))
729 goto bad_area;
730 write++;
731 break;
732 case PF_PROT: /* read, present */
733 goto bad_area;
734 case 0: /* read, not present */
735 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
736 goto bad_area;
737 }
738
739#ifdef CONFIG_X86_32
740survive:
741#endif
742 /*
743 * If for any reason at all we couldn't handle the fault,
744 * make sure we exit gracefully rather than endlessly redo
745 * the fault.
746 */
747 fault = handle_mm_fault(mm, vma, address, write);
748 if (unlikely(fault & VM_FAULT_ERROR)) {
749 if (fault & VM_FAULT_OOM)
750 goto out_of_memory;
751 else if (fault & VM_FAULT_SIGBUS)
752 goto do_sigbus;
753 BUG();
754 }
755 if (fault & VM_FAULT_MAJOR)
756 tsk->maj_flt++;
757 else
758 tsk->min_flt++;
759
760#ifdef CONFIG_X86_32
761 /*
762 * Did it hit the DOS screen memory VA from vm86 mode?
763 */
764 if (v8086_mode(regs)) {
765 unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
766 if (bit < 32)
767 tsk->thread.screen_bitmap |= 1 << bit;
768 }
769#endif
770 up_read(&mm->mmap_sem);
771 return;
772
773/*
774 * Something tried to access memory that isn't in our memory map..
775 * Fix it, but check if it's kernel or user first..
776 */
777bad_area:
778 up_read(&mm->mmap_sem);
779
780bad_area_nosemaphore:
781 /* User mode accesses just cause a SIGSEGV */
782 if (error_code & PF_USER) {
783 /*
784 * It's possible to have interrupts off here.
785 */
786 local_irq_enable();
787
788 /*
789 * Valid to do another page fault here because this one came
790 * from user space.
791 */
792 if (is_prefetch(regs, address, error_code))
793 return;
794
795 if (is_errata100(regs, address))
796 return;
797
798 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
799 printk_ratelimit()) {
800 printk(
801#ifdef CONFIG_X86_32
802 "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
803#else
804 "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
805#endif
806 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
807 tsk->comm, task_pid_nr(tsk), address, regs->ip,
808 regs->sp, error_code);
809 print_vma_addr(" in ", regs->ip);
810 printk("\n");
811 }
812
813 tsk->thread.cr2 = address;
814 /* Kernel addresses are always protection faults */
815 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
816 tsk->thread.trap_no = 14;
817 force_sig_info_fault(SIGSEGV, si_code, address, tsk);
818 return;
819 }
820
821 if (is_f00f_bug(regs, address))
822 return;
823
824no_context:
825 /* Are we prepared to handle this kernel fault? */
826 if (fixup_exception(regs))
827 return;
828
829 /*
830 * X86_32
831 * Valid to do another page fault here, because if this fault
832 * had been triggered by is_prefetch fixup_exception would have
833 * handled it.
834 *
835 * X86_64
836 * Hall of shame of CPU/BIOS bugs.
837 */
838 if (is_prefetch(regs, address, error_code))
839 return;
840
841 if (is_errata93(regs, address))
842 return;
843
844/*
845 * Oops. The kernel tried to access some bad page. We'll have to
846 * terminate things with extreme prejudice.
847 */
848#ifdef CONFIG_X86_32
849 bust_spinlocks(1);
850#else
851 flags = oops_begin();
852#endif
853
854 show_fault_oops(regs, error_code, address);
855
856 tsk->thread.cr2 = address;
857 tsk->thread.trap_no = 14;
858 tsk->thread.error_code = error_code;
859
860#ifdef CONFIG_X86_32
861 die("Oops", regs, error_code);
862 bust_spinlocks(0);
863 do_exit(SIGKILL);
864#else
865 if (__die("Oops", regs, error_code))
866 regs = NULL;
867 /* Executive summary in case the body of the oops scrolled away */
868 printk(KERN_EMERG "CR2: %016lx\n", address);
869 oops_end(flags, regs, SIGKILL);
870#endif
871
872/*
873 * We ran out of memory, or some other thing happened to us that made
874 * us unable to handle the page fault gracefully.
875 */
876out_of_memory:
877 up_read(&mm->mmap_sem);
878 if (is_global_init(tsk)) {
879 yield();
880#ifdef CONFIG_X86_32
881 down_read(&mm->mmap_sem);
882 goto survive;
883#else
884 goto again;
885#endif
886 }
887
888 printk("VM: killing process %s\n", tsk->comm);
889 if (error_code & PF_USER)
890 do_group_exit(SIGKILL);
891 goto no_context;
892
893do_sigbus:
894 up_read(&mm->mmap_sem);
895
896 /* Kernel mode? Handle exceptions or die */
897 if (!(error_code & PF_USER))
898 goto no_context;
899#ifdef CONFIG_X86_32
900 /* User space => ok to do another page fault */
901 if (is_prefetch(regs, address, error_code))
902 return;
903#endif
904 tsk->thread.cr2 = address;
905 tsk->thread.error_code = error_code;
906 tsk->thread.trap_no = 14;
907 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
908}
909
910DEFINE_SPINLOCK(pgd_lock);
911LIST_HEAD(pgd_list);
912
913void vmalloc_sync_all(void)
914{
915#ifdef CONFIG_X86_32
916 /*
917 * Note that races in the updates of insync and start aren't
918 * problematic: insync can only get set bits added, and updates to
919 * start are only improving performance (without affecting correctness
920 * if undone).
921 */
922 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
923 static unsigned long start = TASK_SIZE;
924 unsigned long address;
925
926 if (SHARED_KERNEL_PMD)
927 return;
928
929 BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
930 for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
931 if (!test_bit(pgd_index(address), insync)) {
932 unsigned long flags;
933 struct page *page;
934
935 spin_lock_irqsave(&pgd_lock, flags);
936 list_for_each_entry(page, &pgd_list, lru) {
937 if (!vmalloc_sync_one(page_address(page),
938 address))
939 break;
940 }
941 spin_unlock_irqrestore(&pgd_lock, flags);
942 if (!page)
943 set_bit(pgd_index(address), insync);
944 }
945 if (address == start && test_bit(pgd_index(address), insync))
946 start = address + PGDIR_SIZE;
947 }
948#else /* CONFIG_X86_64 */
949 /*
950 * Note that races in the updates of insync and start aren't
951 * problematic: insync can only get set bits added, and updates to
952 * start are only improving performance (without affecting correctness
953 * if undone).
954 */
955 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
956 static unsigned long start = VMALLOC_START & PGDIR_MASK;
957 unsigned long address;
958
959 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
960 if (!test_bit(pgd_index(address), insync)) {
961 const pgd_t *pgd_ref = pgd_offset_k(address);
962 struct page *page;
963
964 if (pgd_none(*pgd_ref))
965 continue;
966 spin_lock(&pgd_lock);
967 list_for_each_entry(page, &pgd_list, lru) {
968 pgd_t *pgd;
969 pgd = (pgd_t *)page_address(page) + pgd_index(address);
970 if (pgd_none(*pgd))
971 set_pgd(pgd, *pgd_ref);
972 else
973 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
974 }
975 spin_unlock(&pgd_lock);
976 set_bit(pgd_index(address), insync);
977 }
978 if (address == start)
979 start = address + PGDIR_SIZE;
980 }
981 /* Check that there is no need to do the same for the modules area. */
982 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
983 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
984 (__START_KERNEL & PGDIR_MASK)));
985#endif
986}
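
The PF_* bits defined near the top of the unified fault.c describe what the hardware reported about the faulting access, and the handler branches on them throughout (vmalloc faults, spurious faults, the good_area switch). A standalone sketch that decodes an error code with the same bit layout; the sample values in main() are illustrative:

#include <stdio.h>

#define PF_PROT  (1 << 0)  /* 0: page not present, 1: protection fault */
#define PF_WRITE (1 << 1)  /* 0: read access,      1: write access */
#define PF_USER  (1 << 2)  /* 0: kernel mode,      1: user mode */
#define PF_RSVD  (1 << 3)  /* reserved bit set in a paging entry */
#define PF_INSTR (1 << 4)  /* fault was an instruction fetch */

static void decode(unsigned long error_code)
{
	printf("%#04lx: %s-mode %s, %s%s\n",
	       error_code,
	       (error_code & PF_USER)  ? "user" : "kernel",
	       (error_code & PF_INSTR) ? "instruction fetch" :
	       (error_code & PF_WRITE) ? "write" : "read",
	       (error_code & PF_PROT)  ? "protection violation" : "page not present",
	       (error_code & PF_RSVD)  ? " (reserved bit set)" : "");
}

int main(void)
{
	decode(0x6);   /* user write to a not-present page (typical demand fault) */
	decode(0x11);  /* kernel instruction fetch hitting a protected page */
	return 0;
}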
diff --git a/arch/x86/mm/fault_32.c b/arch/x86/mm/fault_32.c
deleted file mode 100644
index a2273d44aa27..000000000000
--- a/arch/x86/mm/fault_32.c
+++ /dev/null
@@ -1,659 +0,0 @@
1/*
2 * linux/arch/i386/mm/fault.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 */
6
7#include <linux/signal.h>
8#include <linux/sched.h>
9#include <linux/kernel.h>
10#include <linux/errno.h>
11#include <linux/string.h>
12#include <linux/types.h>
13#include <linux/ptrace.h>
14#include <linux/mman.h>
15#include <linux/mm.h>
16#include <linux/smp.h>
17#include <linux/interrupt.h>
18#include <linux/init.h>
19#include <linux/tty.h>
20#include <linux/vt_kern.h> /* For unblank_screen() */
21#include <linux/highmem.h>
22#include <linux/bootmem.h> /* for max_low_pfn */
23#include <linux/vmalloc.h>
24#include <linux/module.h>
25#include <linux/kprobes.h>
26#include <linux/uaccess.h>
27#include <linux/kdebug.h>
28#include <linux/kprobes.h>
29
30#include <asm/system.h>
31#include <asm/desc.h>
32#include <asm/segment.h>
33
34extern void die(const char *,struct pt_regs *,long);
35
36#ifdef CONFIG_KPROBES
37static inline int notify_page_fault(struct pt_regs *regs)
38{
39 int ret = 0;
40
41 /* kprobe_running() needs smp_processor_id() */
42 if (!user_mode_vm(regs)) {
43 preempt_disable();
44 if (kprobe_running() && kprobe_fault_handler(regs, 14))
45 ret = 1;
46 preempt_enable();
47 }
48
49 return ret;
50}
51#else
52static inline int notify_page_fault(struct pt_regs *regs)
53{
54 return 0;
55}
56#endif
57
58/*
59 * Return EIP plus the CS segment base. The segment limit is also
60 * adjusted, clamped to the kernel/user address space (whichever is
61 * appropriate), and returned in *eip_limit.
62 *
63 * The segment is checked, because it might have been changed by another
64 * task between the original faulting instruction and here.
65 *
66 * If CS is no longer a valid code segment, or if EIP is beyond the
67 * limit, or if it is a kernel address when CS is not a kernel segment,
68 * then the returned value will be greater than *eip_limit.
69 *
70 * This is slow, but is very rarely executed.
71 */
72static inline unsigned long get_segment_eip(struct pt_regs *regs,
73 unsigned long *eip_limit)
74{
75 unsigned long eip = regs->eip;
76 unsigned seg = regs->xcs & 0xffff;
77 u32 seg_ar, seg_limit, base, *desc;
78
79 /* Unlikely, but must come before segment checks. */
80 if (unlikely(regs->eflags & VM_MASK)) {
81 base = seg << 4;
82 *eip_limit = base + 0xffff;
83 return base + (eip & 0xffff);
84 }
85
86 /* The standard kernel/user address space limit. */
87 *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;
88
89 /* By far the most common cases. */
90 if (likely(SEGMENT_IS_FLAT_CODE(seg)))
91 return eip;
92
93 /* Check the segment exists, is within the current LDT/GDT size,
94 that kernel/user (ring 0..3) has the appropriate privilege,
95 that it's a code segment, and get the limit. */
96 __asm__ ("larl %3,%0; lsll %3,%1"
97 : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
98 if ((~seg_ar & 0x9800) || eip > seg_limit) {
99 *eip_limit = 0;
100 return 1; /* So that returned eip > *eip_limit. */
101 }
102
103 /* Get the GDT/LDT descriptor base.
104 When you look for races in this code remember that
105 LDT and other horrors are only used in user space. */
106 if (seg & (1<<2)) {
107 /* Must lock the LDT while reading it. */
108 mutex_lock(&current->mm->context.lock);
109 desc = current->mm->context.ldt;
110 desc = (void *)desc + (seg & ~7);
111 } else {
112 /* Must disable preemption while reading the GDT. */
113 desc = (u32 *)get_cpu_gdt_table(get_cpu());
114 desc = (void *)desc + (seg & ~7);
115 }
116
117 /* Decode the code segment base from the descriptor */
118 base = get_desc_base((unsigned long *)desc);
119
120 if (seg & (1<<2)) {
121 mutex_unlock(&current->mm->context.lock);
122 } else
123 put_cpu();
124
125 /* Adjust EIP and segment limit, and clamp at the kernel limit.
126 It's legitimate for segments to wrap at 0xffffffff. */
127 seg_limit += base;
128 if (seg_limit < *eip_limit && seg_limit >= base)
129 *eip_limit = seg_limit;
130 return eip + base;
131}
132
133/*
134 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
135 * Check that here and ignore it.
136 */
137static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
138{
139 unsigned long limit;
140 unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit);
141 int scan_more = 1;
142 int prefetch = 0;
143 int i;
144
145 for (i = 0; scan_more && i < 15; i++) {
146 unsigned char opcode;
147 unsigned char instr_hi;
148 unsigned char instr_lo;
149
150 if (instr > (unsigned char *)limit)
151 break;
152 if (probe_kernel_address(instr, opcode))
153 break;
154
155 instr_hi = opcode & 0xf0;
156 instr_lo = opcode & 0x0f;
157 instr++;
158
159 switch (instr_hi) {
160 case 0x20:
161 case 0x30:
162 /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
163 scan_more = ((instr_lo & 7) == 0x6);
164 break;
165
166 case 0x60:
167 /* 0x64 thru 0x67 are valid prefixes in all modes. */
168 scan_more = (instr_lo & 0xC) == 0x4;
169 break;
170 case 0xF0:
171 /* 0xF0, 0xF2, and 0xF3 are valid prefixes */
172 scan_more = !instr_lo || (instr_lo>>1) == 1;
173 break;
174 case 0x00:
175 /* Prefetch instruction is 0x0F0D or 0x0F18 */
176 scan_more = 0;
177 if (instr > (unsigned char *)limit)
178 break;
179 if (probe_kernel_address(instr, opcode))
180 break;
181 prefetch = (instr_lo == 0xF) &&
182 (opcode == 0x0D || opcode == 0x18);
183 break;
184 default:
185 scan_more = 0;
186 break;
187 }
188 }
189 return prefetch;
190}
191
192static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
193 unsigned long error_code)
194{
195 if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
196 boot_cpu_data.x86 >= 6)) {
197 /* Catch an obscure case of prefetch inside an NX page. */
198 if (nx_enabled && (error_code & 16))
199 return 0;
200 return __is_prefetch(regs, addr);
201 }
202 return 0;
203}
204
205static noinline void force_sig_info_fault(int si_signo, int si_code,
206 unsigned long address, struct task_struct *tsk)
207{
208 siginfo_t info;
209
210 info.si_signo = si_signo;
211 info.si_errno = 0;
212 info.si_code = si_code;
213 info.si_addr = (void __user *)address;
214 force_sig_info(si_signo, &info, tsk);
215}
216
217fastcall void do_invalid_op(struct pt_regs *, unsigned long);
218
219static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
220{
221 unsigned index = pgd_index(address);
222 pgd_t *pgd_k;
223 pud_t *pud, *pud_k;
224 pmd_t *pmd, *pmd_k;
225
226 pgd += index;
227 pgd_k = init_mm.pgd + index;
228
229 if (!pgd_present(*pgd_k))
230 return NULL;
231
232 /*
233 * set_pgd(pgd, *pgd_k); here would be useless on PAE
234 * and redundant with the set_pmd() on non-PAE. As would
235 * set_pud.
236 */
237
238 pud = pud_offset(pgd, address);
239 pud_k = pud_offset(pgd_k, address);
240 if (!pud_present(*pud_k))
241 return NULL;
242
243 pmd = pmd_offset(pud, address);
244 pmd_k = pmd_offset(pud_k, address);
245 if (!pmd_present(*pmd_k))
246 return NULL;
247 if (!pmd_present(*pmd)) {
248 set_pmd(pmd, *pmd_k);
249 arch_flush_lazy_mmu_mode();
250 } else
251 BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
252 return pmd_k;
253}
254
255/*
256 * Handle a fault on the vmalloc or module mapping area
257 *
258 * This assumes no large pages in there.
259 */
260static inline int vmalloc_fault(unsigned long address)
261{
262 unsigned long pgd_paddr;
263 pmd_t *pmd_k;
264 pte_t *pte_k;
265 /*
266 * Synchronize this task's top level page-table
267 * with the 'reference' page table.
268 *
269 * Do _not_ use "current" here. We might be inside
270 * an interrupt in the middle of a task switch..
271 */
272 pgd_paddr = read_cr3();
273 pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
274 if (!pmd_k)
275 return -1;
276 pte_k = pte_offset_kernel(pmd_k, address);
277 if (!pte_present(*pte_k))
278 return -1;
279 return 0;
280}
281
282int show_unhandled_signals = 1;
283
284/*
285 * This routine handles page faults. It determines the address,
286 * and the problem, and then passes it off to one of the appropriate
287 * routines.
288 *
289 * error_code:
290 * bit 0 == 0 means no page found, 1 means protection fault
291 * bit 1 == 0 means read, 1 means write
292 * bit 2 == 0 means kernel, 1 means user-mode
293 * bit 3 == 1 means use of reserved bit detected
294 * bit 4 == 1 means fault was an instruction fetch
295 */
296fastcall void __kprobes do_page_fault(struct pt_regs *regs,
297 unsigned long error_code)
298{
299 struct task_struct *tsk;
300 struct mm_struct *mm;
301 struct vm_area_struct * vma;
302 unsigned long address;
303 int write, si_code;
304 int fault;
305
306 /*
307 * We can fault from pretty much anywhere, with unknown IRQ state.
308 */
309 trace_hardirqs_fixup();
310
311 /* get the address */
312 address = read_cr2();
313
314 tsk = current;
315
316 si_code = SEGV_MAPERR;
317
318 /*
319 * We fault-in kernel-space virtual memory on-demand. The
320 * 'reference' page table is init_mm.pgd.
321 *
322 * NOTE! We MUST NOT take any locks for this case. We may
323 * be in an interrupt or a critical region, and should
324 * only copy the information from the master page table,
325 * nothing more.
326 *
327 * This verifies that the fault happens in kernel space
328 * (error_code & 4) == 0, and that the fault was not a
329 * protection error (error_code & 9) == 0.
330 */
331 if (unlikely(address >= TASK_SIZE)) {
332 if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
333 return;
334 if (notify_page_fault(regs))
335 return;
336 /*
337 * Don't take the mm semaphore here. If we fixup a prefetch
338 * fault we could otherwise deadlock.
339 */
340 goto bad_area_nosemaphore;
341 }
342
343 if (notify_page_fault(regs))
344 return;
345
346 /* It's safe to allow irq's after cr2 has been saved and the vmalloc
347 fault has been handled. */
348 if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
349 local_irq_enable();
350
351 mm = tsk->mm;
352
353 /*
354 * If we're in an interrupt, have no user context or are running in an
355 * atomic region then we must not take the fault..
356 */
357 if (in_atomic() || !mm)
358 goto bad_area_nosemaphore;
359
360 /* When running in the kernel we expect faults to occur only to
361 * addresses in user space. All other faults represent errors in the
362 * kernel and should generate an OOPS. Unfortunately, in the case of an
363 * erroneous fault occurring in a code path which already holds mmap_sem
364 * we will deadlock attempting to validate the fault against the
365 * address space. Luckily the kernel only validly references user
366 * space from well defined areas of code, which are listed in the
367 * exceptions table.
368 *
369 * As the vast majority of faults will be valid we will only perform
370 * the source reference check when there is a possibility of a deadlock.
371 * Attempt to lock the address space, if we cannot we then validate the
372 * source. If this is invalid we can skip the address space check,
373 * thus avoiding the deadlock.
374 */
375 if (!down_read_trylock(&mm->mmap_sem)) {
376 if ((error_code & 4) == 0 &&
377 !search_exception_tables(regs->eip))
378 goto bad_area_nosemaphore;
379 down_read(&mm->mmap_sem);
380 }
381
382 vma = find_vma(mm, address);
383 if (!vma)
384 goto bad_area;
385 if (vma->vm_start <= address)
386 goto good_area;
387 if (!(vma->vm_flags & VM_GROWSDOWN))
388 goto bad_area;
389 if (error_code & 4) {
390 /*
391 * Accessing the stack below %esp is always a bug.
392 * The large cushion allows instructions like enter
393 * and pusha to work. ("enter $65535,$31" pushes
394 * 32 pointers and then decrements %esp by 65535.)
395 */
396 if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
397 goto bad_area;
398 }
399 if (expand_stack(vma, address))
400 goto bad_area;
401/*
402 * Ok, we have a good vm_area for this memory access, so
403 * we can handle it..
404 */
405good_area:
406 si_code = SEGV_ACCERR;
407 write = 0;
408 switch (error_code & 3) {
409 default: /* 3: write, present */
410 /* fall through */
411 case 2: /* write, not present */
412 if (!(vma->vm_flags & VM_WRITE))
413 goto bad_area;
414 write++;
415 break;
416 case 1: /* read, present */
417 goto bad_area;
418 case 0: /* read, not present */
419 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
420 goto bad_area;
421 }
422
423 survive:
424 /*
425 * If for any reason at all we couldn't handle the fault,
426 * make sure we exit gracefully rather than endlessly redo
427 * the fault.
428 */
429 fault = handle_mm_fault(mm, vma, address, write);
430 if (unlikely(fault & VM_FAULT_ERROR)) {
431 if (fault & VM_FAULT_OOM)
432 goto out_of_memory;
433 else if (fault & VM_FAULT_SIGBUS)
434 goto do_sigbus;
435 BUG();
436 }
437 if (fault & VM_FAULT_MAJOR)
438 tsk->maj_flt++;
439 else
440 tsk->min_flt++;
441
442 /*
443 * Did it hit the DOS screen memory VA from vm86 mode?
444 */
445 if (regs->eflags & VM_MASK) {
446 unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
447 if (bit < 32)
448 tsk->thread.screen_bitmap |= 1 << bit;
449 }
450 up_read(&mm->mmap_sem);
451 return;
452
453/*
454 * Something tried to access memory that isn't in our memory map..
455 * Fix it, but check if it's kernel or user first..
456 */
457bad_area:
458 up_read(&mm->mmap_sem);
459
460bad_area_nosemaphore:
461 /* User mode accesses just cause a SIGSEGV */
462 if (error_code & 4) {
463 /*
464 * It's possible to have interrupts off here.
465 */
466 local_irq_enable();
467
468 /*
469 * Valid to do another page fault here because this one came
470 * from user space.
471 */
472 if (is_prefetch(regs, address, error_code))
473 return;
474
475 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
476 printk_ratelimit()) {
477 printk("%s%s[%d]: segfault at %08lx eip %08lx "
478 "esp %08lx error %lx\n",
479 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
480 tsk->comm, task_pid_nr(tsk), address, regs->eip,
481 regs->esp, error_code);
482 }
483 tsk->thread.cr2 = address;
484 /* Kernel addresses are always protection faults */
485 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
486 tsk->thread.trap_no = 14;
487 force_sig_info_fault(SIGSEGV, si_code, address, tsk);
488 return;
489 }
490
491#ifdef CONFIG_X86_F00F_BUG
492 /*
493 * Pentium F0 0F C7 C8 bug workaround.
494 */
495 if (boot_cpu_data.f00f_bug) {
496 unsigned long nr;
497
498 nr = (address - idt_descr.address) >> 3;
499
500 if (nr == 6) {
501 do_invalid_op(regs, 0);
502 return;
503 }
504 }
505#endif
506
507no_context:
508 /* Are we prepared to handle this kernel fault? */
509 if (fixup_exception(regs))
510 return;
511
512 /*
513 * Valid to do another page fault here, because if this fault
514 * had been triggered by is_prefetch fixup_exception would have
515 * handled it.
516 */
517 if (is_prefetch(regs, address, error_code))
518 return;
519
520/*
521 * Oops. The kernel tried to access some bad page. We'll have to
522 * terminate things with extreme prejudice.
523 */
524
525 bust_spinlocks(1);
526
527 if (oops_may_print()) {
528 __typeof__(pte_val(__pte(0))) page;
529
530#ifdef CONFIG_X86_PAE
531 if (error_code & 16) {
532 pte_t *pte = lookup_address(address);
533
534 if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
535 printk(KERN_CRIT "kernel tried to execute "
536 "NX-protected page - exploit attempt? "
537 "(uid: %d)\n", current->uid);
538 }
539#endif
540 if (address < PAGE_SIZE)
541 printk(KERN_ALERT "BUG: unable to handle kernel NULL "
542 "pointer dereference");
543 else
544 printk(KERN_ALERT "BUG: unable to handle kernel paging"
545 " request");
546 printk(" at virtual address %08lx\n",address);
547 printk(KERN_ALERT "printing eip: %08lx ", regs->eip);
548
549 page = read_cr3();
550 page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
551#ifdef CONFIG_X86_PAE
552 printk("*pdpt = %016Lx ", page);
553 if ((page >> PAGE_SHIFT) < max_low_pfn
554 && page & _PAGE_PRESENT) {
555 page &= PAGE_MASK;
556 page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
557 & (PTRS_PER_PMD - 1)];
558 printk(KERN_CONT "*pde = %016Lx ", page);
559 page &= ~_PAGE_NX;
560 }
561#else
562 printk("*pde = %08lx ", page);
563#endif
564
565 /*
566 * We must not directly access the pte in the highpte
567 * case if the page table is located in highmem.
568 * And let's rather not kmap-atomic the pte, just in case
569 * it's allocated already.
570 */
571 if ((page >> PAGE_SHIFT) < max_low_pfn
572 && (page & _PAGE_PRESENT)
573 && !(page & _PAGE_PSE)) {
574 page &= PAGE_MASK;
575 page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
576 & (PTRS_PER_PTE - 1)];
577 printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
578 }
579
580 printk("\n");
581 }
582
583 tsk->thread.cr2 = address;
584 tsk->thread.trap_no = 14;
585 tsk->thread.error_code = error_code;
586 die("Oops", regs, error_code);
587 bust_spinlocks(0);
588 do_exit(SIGKILL);
589
590/*
591 * We ran out of memory, or some other thing happened to us that made
592 * us unable to handle the page fault gracefully.
593 */
594out_of_memory:
595 up_read(&mm->mmap_sem);
596 if (is_global_init(tsk)) {
597 yield();
598 down_read(&mm->mmap_sem);
599 goto survive;
600 }
601 printk("VM: killing process %s\n", tsk->comm);
602 if (error_code & 4)
603 do_group_exit(SIGKILL);
604 goto no_context;
605
606do_sigbus:
607 up_read(&mm->mmap_sem);
608
609 /* Kernel mode? Handle exceptions or die */
610 if (!(error_code & 4))
611 goto no_context;
612
613 /* User space => ok to do another page fault */
614 if (is_prefetch(regs, address, error_code))
615 return;
616
617 tsk->thread.cr2 = address;
618 tsk->thread.error_code = error_code;
619 tsk->thread.trap_no = 14;
620 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
621}
622
623void vmalloc_sync_all(void)
624{
625 /*
626 * Note that races in the updates of insync and start aren't
627 * problematic: insync can only get set bits added, and updates to
628 * start are only improving performance (without affecting correctness
629 * if undone).
630 */
631 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
632 static unsigned long start = TASK_SIZE;
633 unsigned long address;
634
635 if (SHARED_KERNEL_PMD)
636 return;
637
638 BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
639 for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
640 if (!test_bit(pgd_index(address), insync)) {
641 unsigned long flags;
642 struct page *page;
643
644 spin_lock_irqsave(&pgd_lock, flags);
645 for (page = pgd_list; page; page =
646 (struct page *)page->index)
647 if (!vmalloc_sync_one(page_address(page),
648 address)) {
649 BUG_ON(page != pgd_list);
650 break;
651 }
652 spin_unlock_irqrestore(&pgd_lock, flags);
653 if (!page)
654 set_bit(pgd_index(address), insync);
655 }
656 if (address == start && test_bit(pgd_index(address), insync))
657 start = address + PGDIR_SIZE;
658 }
659}
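vmalloc_sync_one(), called in the loop above, is declared earlier in fault_32.c and is not part of this hunk. A sketch of what it does, reconstructed from how it is used here (copy the kernel pmd entry covering a vmalloc address from init_mm into one process page directory, returning NULL when the kernel side has nothing to propagate); details may differ from the real helper:

	static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
	{
		unsigned int index = pgd_index(address);
		pgd_t *pgd_k;
		pud_t *pud, *pud_k;
		pmd_t *pmd, *pmd_k;

		pgd += index;
		pgd_k = init_mm.pgd + index;

		/* Nothing to copy if the kernel reference tables are empty here. */
		if (!pgd_present(*pgd_k))
			return NULL;

		pud = pud_offset(pgd, address);
		pud_k = pud_offset(pgd_k, address);
		if (!pud_present(*pud_k))
			return NULL;

		pmd = pmd_offset(pud, address);
		pmd_k = pmd_offset(pud_k, address);
		if (!pmd_present(*pmd_k))
			return NULL;

		/* Propagate the kernel pmd entry into this pgd, or check it. */
		if (!pmd_present(*pmd))
			set_pmd(pmd, *pmd_k);
		else
			BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));

		return pmd_k;
	}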
diff --git a/arch/x86/mm/fault_64.c b/arch/x86/mm/fault_64.c
deleted file mode 100644
index 0e26230669ca..000000000000
--- a/arch/x86/mm/fault_64.c
+++ /dev/null
@@ -1,623 +0,0 @@
1/*
2 * linux/arch/x86-64/mm/fault.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
6 */
7
8#include <linux/signal.h>
9#include <linux/sched.h>
10#include <linux/kernel.h>
11#include <linux/errno.h>
12#include <linux/string.h>
13#include <linux/types.h>
14#include <linux/ptrace.h>
15#include <linux/mman.h>
16#include <linux/mm.h>
17#include <linux/smp.h>
18#include <linux/interrupt.h>
19#include <linux/init.h>
20#include <linux/tty.h>
21#include <linux/vt_kern.h> /* For unblank_screen() */
22#include <linux/compiler.h>
23#include <linux/vmalloc.h>
24#include <linux/module.h>
25#include <linux/kprobes.h>
26#include <linux/uaccess.h>
27#include <linux/kdebug.h>
28#include <linux/kprobes.h>
29
30#include <asm/system.h>
31#include <asm/pgalloc.h>
32#include <asm/smp.h>
33#include <asm/tlbflush.h>
34#include <asm/proto.h>
35#include <asm-generic/sections.h>
36
37/* Page fault error code bits */
38#define PF_PROT (1<<0) /* or no page found */
39#define PF_WRITE (1<<1)
40#define PF_USER (1<<2)
41#define PF_RSVD (1<<3)
42#define PF_INSTR (1<<4)
43
44#ifdef CONFIG_KPROBES
45static inline int notify_page_fault(struct pt_regs *regs)
46{
47 int ret = 0;
48
49 /* kprobe_running() needs smp_processor_id() */
50 if (!user_mode(regs)) {
51 preempt_disable();
52 if (kprobe_running() && kprobe_fault_handler(regs, 14))
53 ret = 1;
54 preempt_enable();
55 }
56
57 return ret;
58}
59#else
60static inline int notify_page_fault(struct pt_regs *regs)
61{
62 return 0;
63}
64#endif
65
66/* Sometimes the CPU reports invalid exceptions on prefetch.
67 Check that here and ignore.
68 Opcode checker based on code by Richard Brunner */
69static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
70 unsigned long error_code)
71{
72 unsigned char *instr;
73 int scan_more = 1;
74 int prefetch = 0;
75 unsigned char *max_instr;
76
77	/* If it was an exec fault, ignore it */
78 if (error_code & PF_INSTR)
79 return 0;
80
81 instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
82 max_instr = instr + 15;
83
84 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
85 return 0;
86
87 while (scan_more && instr < max_instr) {
88 unsigned char opcode;
89 unsigned char instr_hi;
90 unsigned char instr_lo;
91
92 if (probe_kernel_address(instr, opcode))
93 break;
94
95 instr_hi = opcode & 0xf0;
96 instr_lo = opcode & 0x0f;
97 instr++;
98
99 switch (instr_hi) {
100 case 0x20:
101 case 0x30:
102 /* Values 0x26,0x2E,0x36,0x3E are valid x86
103 prefixes. In long mode, the CPU will signal
104 invalid opcode if some of these prefixes are
105 present so we will never get here anyway */
106 scan_more = ((instr_lo & 7) == 0x6);
107 break;
108
109 case 0x40:
110		/* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes.
111 Need to figure out under what instruction mode the
112 instruction was issued ... */
113 /* Could check the LDT for lm, but for now it's good
114 enough to assume that long mode only uses well known
115 segments or kernel. */
116 scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
117 break;
118
119 case 0x60:
120 /* 0x64 thru 0x67 are valid prefixes in all modes. */
121 scan_more = (instr_lo & 0xC) == 0x4;
122 break;
123 case 0xF0:
124 /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
125 scan_more = !instr_lo || (instr_lo>>1) == 1;
126 break;
127 case 0x00:
128 /* Prefetch instruction is 0x0F0D or 0x0F18 */
129 scan_more = 0;
130 if (probe_kernel_address(instr, opcode))
131 break;
132 prefetch = (instr_lo == 0xF) &&
133 (opcode == 0x0D || opcode == 0x18);
134 break;
135 default:
136 scan_more = 0;
137 break;
138 }
139 }
140 return prefetch;
141}
142
143static int bad_address(void *p)
144{
145 unsigned long dummy;
146 return probe_kernel_address((unsigned long *)p, dummy);
147}
148
149void dump_pagetable(unsigned long address)
150{
151 pgd_t *pgd;
152 pud_t *pud;
153 pmd_t *pmd;
154 pte_t *pte;
155
156 pgd = (pgd_t *)read_cr3();
157
158 pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
159 pgd += pgd_index(address);
160 if (bad_address(pgd)) goto bad;
161 printk("PGD %lx ", pgd_val(*pgd));
162 if (!pgd_present(*pgd)) goto ret;
163
164 pud = pud_offset(pgd, address);
165 if (bad_address(pud)) goto bad;
166 printk("PUD %lx ", pud_val(*pud));
167 if (!pud_present(*pud)) goto ret;
168
169 pmd = pmd_offset(pud, address);
170 if (bad_address(pmd)) goto bad;
171 printk("PMD %lx ", pmd_val(*pmd));
172 if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
173
174 pte = pte_offset_kernel(pmd, address);
175 if (bad_address(pte)) goto bad;
176 printk("PTE %lx", pte_val(*pte));
177ret:
178 printk("\n");
179 return;
180bad:
181 printk("BAD\n");
182}
183
184static const char errata93_warning[] =
185KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
186KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
187KERN_ERR "******* Please consider a BIOS update.\n"
188KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
189
190/* Workaround for K8 erratum #93 & buggy BIOS.
191 BIOS SMM functions are required to use a specific workaround
192 to avoid corruption of the 64bit RIP register on C stepping K8.
193   A lot of BIOSes that didn't get tested properly miss this.
194 The OS sees this as a page fault with the upper 32bits of RIP cleared.
195 Try to work around it here.
196 Note we only handle faults in kernel here. */
197
198static int is_errata93(struct pt_regs *regs, unsigned long address)
199{
200 static int warned;
201 if (address != regs->rip)
202 return 0;
203 if ((address >> 32) != 0)
204 return 0;
205 address |= 0xffffffffUL << 32;
206 if ((address >= (u64)_stext && address <= (u64)_etext) ||
207 (address >= MODULES_VADDR && address <= MODULES_END)) {
208 if (!warned) {
209 printk(errata93_warning);
210 warned = 1;
211 }
212 regs->rip = address;
213 return 1;
214 }
215 return 0;
216}
217
218static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
219 unsigned long error_code)
220{
221 unsigned long flags = oops_begin();
222 struct task_struct *tsk;
223
224 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
225 current->comm, address);
226 dump_pagetable(address);
227 tsk = current;
228 tsk->thread.cr2 = address;
229 tsk->thread.trap_no = 14;
230 tsk->thread.error_code = error_code;
231 __die("Bad pagetable", regs, error_code);
232 oops_end(flags);
233 do_exit(SIGKILL);
234}
235
236/*
237 * Handle a fault on the vmalloc area
238 *
239 * This assumes no large pages in there.
240 */
241static int vmalloc_fault(unsigned long address)
242{
243 pgd_t *pgd, *pgd_ref;
244 pud_t *pud, *pud_ref;
245 pmd_t *pmd, *pmd_ref;
246 pte_t *pte, *pte_ref;
247
248 /* Copy kernel mappings over when needed. This can also
249	   happen within a race in page table update. In the latter
250 case just flush. */
251
252 pgd = pgd_offset(current->mm ?: &init_mm, address);
253 pgd_ref = pgd_offset_k(address);
254 if (pgd_none(*pgd_ref))
255 return -1;
256 if (pgd_none(*pgd))
257 set_pgd(pgd, *pgd_ref);
258 else
259 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
260
261 /* Below here mismatches are bugs because these lower tables
262 are shared */
263
264 pud = pud_offset(pgd, address);
265 pud_ref = pud_offset(pgd_ref, address);
266 if (pud_none(*pud_ref))
267 return -1;
268 if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
269 BUG();
270 pmd = pmd_offset(pud, address);
271 pmd_ref = pmd_offset(pud_ref, address);
272 if (pmd_none(*pmd_ref))
273 return -1;
274 if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
275 BUG();
276 pte_ref = pte_offset_kernel(pmd_ref, address);
277 if (!pte_present(*pte_ref))
278 return -1;
279 pte = pte_offset_kernel(pmd, address);
280 /* Don't use pte_page here, because the mappings can point
281 outside mem_map, and the NUMA hash lookup cannot handle
282 that. */
283 if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
284 BUG();
285 return 0;
286}
287
288int show_unhandled_signals = 1;
289
290/*
291 * This routine handles page faults. It determines the address,
292 * and the problem, and then passes it off to one of the appropriate
293 * routines.
294 */
295asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
296 unsigned long error_code)
297{
298 struct task_struct *tsk;
299 struct mm_struct *mm;
300 struct vm_area_struct * vma;
301 unsigned long address;
302 const struct exception_table_entry *fixup;
303 int write, fault;
304 unsigned long flags;
305 siginfo_t info;
306
307 /*
308 * We can fault from pretty much anywhere, with unknown IRQ state.
309 */
310 trace_hardirqs_fixup();
311
312 tsk = current;
313 mm = tsk->mm;
314 prefetchw(&mm->mmap_sem);
315
316 /* get the address */
317 address = read_cr2();
318
319 info.si_code = SEGV_MAPERR;
320
321
322 /*
323 * We fault-in kernel-space virtual memory on-demand. The
324 * 'reference' page table is init_mm.pgd.
325 *
326 * NOTE! We MUST NOT take any locks for this case. We may
327 * be in an interrupt or a critical region, and should
328 * only copy the information from the master page table,
329 * nothing more.
330 *
331 * This verifies that the fault happens in kernel space
332 * (error_code & 4) == 0, and that the fault was not a
333 * protection error (error_code & 9) == 0.
334 */
335 if (unlikely(address >= TASK_SIZE64)) {
336 /*
337 * Don't check for the module range here: its PML4
338 * is always initialized because it's shared with the main
339 * kernel text. Only vmalloc may need PML4 syncups.
340 */
341 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
342 ((address >= VMALLOC_START && address < VMALLOC_END))) {
343 if (vmalloc_fault(address) >= 0)
344 return;
345 }
346 if (notify_page_fault(regs))
347 return;
348 /*
349 * Don't take the mm semaphore here. If we fixup a prefetch
350 * fault we could otherwise deadlock.
351 */
352 goto bad_area_nosemaphore;
353 }
354
355 if (notify_page_fault(regs))
356 return;
357
358 if (likely(regs->eflags & X86_EFLAGS_IF))
359 local_irq_enable();
360
361 if (unlikely(error_code & PF_RSVD))
362 pgtable_bad(address, regs, error_code);
363
364 /*
365 * If we're in an interrupt or have no user
366 * context, we must not take the fault..
367 */
368 if (unlikely(in_atomic() || !mm))
369 goto bad_area_nosemaphore;
370
371 /*
372 * User-mode registers count as a user access even for any
373 * potential system fault or CPU buglet.
374 */
375 if (user_mode_vm(regs))
376 error_code |= PF_USER;
377
378 again:
379 /* When running in the kernel we expect faults to occur only to
380 * addresses in user space. All other faults represent errors in the
381 * kernel and should generate an OOPS. Unfortunately, in the case of an
382 * erroneous fault occurring in a code path which already holds mmap_sem
383 * we will deadlock attempting to validate the fault against the
384 * address space. Luckily the kernel only validly references user
385 * space from well defined areas of code, which are listed in the
386 * exceptions table.
387 *
388 * As the vast majority of faults will be valid we will only perform
389 * the source reference check when there is a possibility of a deadlock.
390 * Attempt to lock the address space, if we cannot we then validate the
391 * source. If this is invalid we can skip the address space check,
392 * thus avoiding the deadlock.
393 */
394 if (!down_read_trylock(&mm->mmap_sem)) {
395 if ((error_code & PF_USER) == 0 &&
396 !search_exception_tables(regs->rip))
397 goto bad_area_nosemaphore;
398 down_read(&mm->mmap_sem);
399 }
400
401 vma = find_vma(mm, address);
402 if (!vma)
403 goto bad_area;
404 if (likely(vma->vm_start <= address))
405 goto good_area;
406 if (!(vma->vm_flags & VM_GROWSDOWN))
407 goto bad_area;
408 if (error_code & 4) {
409 /* Allow userspace just enough access below the stack pointer
410 * to let the 'enter' instruction work.
411 */
412 if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
413 goto bad_area;
414 }
415 if (expand_stack(vma, address))
416 goto bad_area;
417/*
418 * Ok, we have a good vm_area for this memory access, so
419 * we can handle it..
420 */
421good_area:
422 info.si_code = SEGV_ACCERR;
423 write = 0;
424 switch (error_code & (PF_PROT|PF_WRITE)) {
425 default: /* 3: write, present */
426 /* fall through */
427 case PF_WRITE: /* write, not present */
428 if (!(vma->vm_flags & VM_WRITE))
429 goto bad_area;
430 write++;
431 break;
432 case PF_PROT: /* read, present */
433 goto bad_area;
434 case 0: /* read, not present */
435 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
436 goto bad_area;
437 }
438
439 /*
440 * If for any reason at all we couldn't handle the fault,
441 * make sure we exit gracefully rather than endlessly redo
442 * the fault.
443 */
444 fault = handle_mm_fault(mm, vma, address, write);
445 if (unlikely(fault & VM_FAULT_ERROR)) {
446 if (fault & VM_FAULT_OOM)
447 goto out_of_memory;
448 else if (fault & VM_FAULT_SIGBUS)
449 goto do_sigbus;
450 BUG();
451 }
452 if (fault & VM_FAULT_MAJOR)
453 tsk->maj_flt++;
454 else
455 tsk->min_flt++;
456 up_read(&mm->mmap_sem);
457 return;
458
459/*
460 * Something tried to access memory that isn't in our memory map..
461 * Fix it, but check if it's kernel or user first..
462 */
463bad_area:
464 up_read(&mm->mmap_sem);
465
466bad_area_nosemaphore:
467 /* User mode accesses just cause a SIGSEGV */
468 if (error_code & PF_USER) {
469
470 /*
471 * It's possible to have interrupts off here.
472 */
473 local_irq_enable();
474
475 if (is_prefetch(regs, address, error_code))
476 return;
477
478		/* Work around K8 erratum #100: K8 in compat mode
479 occasionally jumps to illegal addresses >4GB. We
480 catch this here in the page fault handler because
481 these addresses are not reachable. Just detect this
482 case and return. Any code segment in LDT is
483 compatibility mode. */
484 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
485 (address >> 32))
486 return;
487
488 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
489 printk_ratelimit()) {
490 printk(
491 "%s%s[%d]: segfault at %lx rip %lx rsp %lx error %lx\n",
492 tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
493 tsk->comm, tsk->pid, address, regs->rip,
494 regs->rsp, error_code);
495 }
496
497 tsk->thread.cr2 = address;
498 /* Kernel addresses are always protection faults */
499 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
500 tsk->thread.trap_no = 14;
501 info.si_signo = SIGSEGV;
502 info.si_errno = 0;
503 /* info.si_code has been set above */
504 info.si_addr = (void __user *)address;
505 force_sig_info(SIGSEGV, &info, tsk);
506 return;
507 }
508
509no_context:
510
511 /* Are we prepared to handle this kernel fault? */
512 fixup = search_exception_tables(regs->rip);
513 if (fixup) {
514 regs->rip = fixup->fixup;
515 return;
516 }
517
518 /*
519 * Hall of shame of CPU/BIOS bugs.
520 */
521
522 if (is_prefetch(regs, address, error_code))
523 return;
524
525 if (is_errata93(regs, address))
526 return;
527
528/*
529 * Oops. The kernel tried to access some bad page. We'll have to
530 * terminate things with extreme prejudice.
531 */
532
533 flags = oops_begin();
534
535 if (address < PAGE_SIZE)
536 printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
537 else
538 printk(KERN_ALERT "Unable to handle kernel paging request");
539 printk(" at %016lx RIP: \n" KERN_ALERT,address);
540 printk_address(regs->rip);
541 dump_pagetable(address);
542 tsk->thread.cr2 = address;
543 tsk->thread.trap_no = 14;
544 tsk->thread.error_code = error_code;
545 __die("Oops", regs, error_code);
546 /* Executive summary in case the body of the oops scrolled away */
547 printk(KERN_EMERG "CR2: %016lx\n", address);
548 oops_end(flags);
549 do_exit(SIGKILL);
550
551/*
552 * We ran out of memory, or some other thing happened to us that made
553 * us unable to handle the page fault gracefully.
554 */
555out_of_memory:
556 up_read(&mm->mmap_sem);
557 if (is_global_init(current)) {
558 yield();
559 goto again;
560 }
561 printk("VM: killing process %s\n", tsk->comm);
562 if (error_code & 4)
563 do_group_exit(SIGKILL);
564 goto no_context;
565
566do_sigbus:
567 up_read(&mm->mmap_sem);
568
569 /* Kernel mode? Handle exceptions or die */
570 if (!(error_code & PF_USER))
571 goto no_context;
572
573 tsk->thread.cr2 = address;
574 tsk->thread.error_code = error_code;
575 tsk->thread.trap_no = 14;
576 info.si_signo = SIGBUS;
577 info.si_errno = 0;
578 info.si_code = BUS_ADRERR;
579 info.si_addr = (void __user *)address;
580 force_sig_info(SIGBUS, &info, tsk);
581 return;
582}
583
584DEFINE_SPINLOCK(pgd_lock);
585LIST_HEAD(pgd_list);
586
587void vmalloc_sync_all(void)
588{
589 /* Note that races in the updates of insync and start aren't
590 problematic:
591 insync can only get set bits added, and updates to start are only
592 improving performance (without affecting correctness if undone). */
593 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
594 static unsigned long start = VMALLOC_START & PGDIR_MASK;
595 unsigned long address;
596
597 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
598 if (!test_bit(pgd_index(address), insync)) {
599 const pgd_t *pgd_ref = pgd_offset_k(address);
600 struct page *page;
601
602 if (pgd_none(*pgd_ref))
603 continue;
604 spin_lock(&pgd_lock);
605 list_for_each_entry(page, &pgd_list, lru) {
606 pgd_t *pgd;
607 pgd = (pgd_t *)page_address(page) + pgd_index(address);
608 if (pgd_none(*pgd))
609 set_pgd(pgd, *pgd_ref);
610 else
611 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
612 }
613 spin_unlock(&pgd_lock);
614 set_bit(pgd_index(address), insync);
615 }
616 if (address == start)
617 start = address + PGDIR_SIZE;
618 }
619 /* Check that there is no need to do the same for the modules area. */
620 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
621 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
622 (__START_KERNEL & PGDIR_MASK)));
623}
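The is_prefetch() scan in the deleted file above walks the bytes at the faulting RIP looking for an AMD prefetch instruction (opcode 0x0F 0x0D or 0x0F 0x18) behind any number of legal prefixes, because, as the comment notes, some CPUs report spurious exceptions on prefetch. A self-contained, user-space approximation of that scan, useful only to illustrate the control flow; the function and test program are hypothetical, and the kernel version additionally checks the code segment and reads the bytes safely with probe_kernel_address():

	#include <stdio.h>

	static int looks_like_prefetch(const unsigned char *instr, int len)
	{
		const unsigned char *max_instr = instr + (len < 15 ? len : 15);
		int scan_more = 1;
		int prefetch = 0;

		while (scan_more && instr < max_instr) {
			unsigned char opcode = *instr++;
			unsigned char hi = opcode & 0xf0, lo = opcode & 0x0f;

			switch (hi) {
			case 0x20: case 0x30:	/* segment override prefixes */
				scan_more = (lo & 7) == 0x6;
				break;
			case 0x40:		/* REX prefixes, 64-bit only */
				scan_more = 1;
				break;
			case 0x60:		/* 0x64..0x67 prefixes */
				scan_more = (lo & 0xC) == 0x4;
				break;
			case 0xF0:		/* lock/rep prefixes */
				scan_more = !lo || (lo >> 1) == 1;
				break;
			case 0x00:		/* 0x0F: two-byte opcode escape */
				scan_more = 0;
				if (lo == 0xF && instr < max_instr)
					prefetch = (*instr == 0x0D || *instr == 0x18);
				break;
			default:
				scan_more = 0;
				break;
			}
		}
		return prefetch;
	}

	int main(void)
	{
		unsigned char prefetchnta[] = { 0x0F, 0x18, 0x00 };	/* prefetchnta (%rax) */
		unsigned char mov[]         = { 0x48, 0x89, 0xC3 };	/* mov %rax,%rbx */

		printf("prefetchnta: %d\n", looks_like_prefetch(prefetchnta, 3));
		printf("mov:         %d\n", looks_like_prefetch(mov, 3));
		return 0;
	}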
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 1c3bf95f7356..3d936f232704 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -18,6 +18,49 @@ void kunmap(struct page *page)
18 kunmap_high(page); 18 kunmap_high(page);
19} 19}
20 20
21static void debug_kmap_atomic_prot(enum km_type type)
22{
23#ifdef CONFIG_DEBUG_HIGHMEM
24 static unsigned warn_count = 10;
25
26 if (unlikely(warn_count == 0))
27 return;
28
29 if (unlikely(in_interrupt())) {
30 if (in_irq()) {
31 if (type != KM_IRQ0 && type != KM_IRQ1 &&
32 type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
33 type != KM_BOUNCE_READ) {
34 WARN_ON(1);
35 warn_count--;
36 }
37 } else if (!irqs_disabled()) { /* softirq */
38 if (type != KM_IRQ0 && type != KM_IRQ1 &&
39 type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
40 type != KM_SKB_SUNRPC_DATA &&
41 type != KM_SKB_DATA_SOFTIRQ &&
42 type != KM_BOUNCE_READ) {
43 WARN_ON(1);
44 warn_count--;
45 }
46 }
47 }
48
49 if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
50 type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) {
51 if (!irqs_disabled()) {
52 WARN_ON(1);
53 warn_count--;
54 }
55 } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
56 if (irq_count() == 0 && !irqs_disabled()) {
57 WARN_ON(1);
58 warn_count--;
59 }
60 }
61#endif
62}
63
21/* 64/*
22 * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because 65 * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
23 * no global lock is needed and because the kmap code must perform a global TLB 66 * no global lock is needed and because the kmap code must perform a global TLB
@@ -30,8 +73,10 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
30{ 73{
31 enum fixed_addresses idx; 74 enum fixed_addresses idx;
32 unsigned long vaddr; 75 unsigned long vaddr;
33
34 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ 76 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
77
78 debug_kmap_atomic_prot(type);
79
35 pagefault_disable(); 80 pagefault_disable();
36 81
37 if (!PageHighMem(page)) 82 if (!PageHighMem(page))
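The debug_kmap_atomic_prot() helper added above only warns when the km_type slot passed to kmap_atomic_prot() does not match the calling context (for example a KM_USER* slot used from a hardware interrupt, or an IRQ slot used with interrupts enabled). A small usage sketch, with a hypothetical caller name, showing the pairing the check expects; it is not part of this patch:

	#include <linux/highmem.h>
	#include <linux/string.h>

	/* Process context: the KM_USER0/KM_USER1 slots are the right ones here.
	 * Passing KM_IRQ0 from this context (with IRQs enabled) would now
	 * trigger the new WARN_ON(). */
	static void copy_from_highpage(struct page *page, void *dst, size_t len)
	{
		void *vaddr = kmap_atomic(page, KM_USER0);

		memcpy(dst, vaddr, len);
		kunmap_atomic(vaddr, KM_USER0);
	}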
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 6c06d9c0488e..4fbafb4bc2f0 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -15,6 +15,7 @@
15#include <asm/mman.h> 15#include <asm/mman.h>
16#include <asm/tlb.h> 16#include <asm/tlb.h>
17#include <asm/tlbflush.h> 17#include <asm/tlbflush.h>
18#include <asm/pgalloc.h>
18 19
19static unsigned long page_table_shareable(struct vm_area_struct *svma, 20static unsigned long page_table_shareable(struct vm_area_struct *svma,
20 struct vm_area_struct *vma, 21 struct vm_area_struct *vma,
@@ -88,7 +89,7 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
88 89
89 spin_lock(&mm->page_table_lock); 90 spin_lock(&mm->page_table_lock);
90 if (pud_none(*pud)) 91 if (pud_none(*pud))
91 pud_populate(mm, pud, (unsigned long) spte & PAGE_MASK); 92 pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK));
92 else 93 else
93 put_page(virt_to_page(spte)); 94 put_page(virt_to_page(spte));
94 spin_unlock(&mm->page_table_lock); 95 spin_unlock(&mm->page_table_lock);
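The single functional change in this file adjusts the third argument so that it matches the pmd_t * that pud_populate() expects; the shared pte page was previously passed as a plain unsigned long. A sketch of the corrected call in isolation, with the same local names as the function above (spte_pmd is only an illustrative temporary):

	pmd_t *spte_pmd = (pmd_t *)((unsigned long)spte & PAGE_MASK);

	pud_populate(mm, pud, spte_pmd);	/* was: (unsigned long)spte & PAGE_MASK */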
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 3c76d194fd2c..da524fb22422 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -27,7 +27,6 @@
27#include <linux/bootmem.h> 27#include <linux/bootmem.h>
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/proc_fs.h> 29#include <linux/proc_fs.h>
30#include <linux/efi.h>
31#include <linux/memory_hotplug.h> 30#include <linux/memory_hotplug.h>
32#include <linux/initrd.h> 31#include <linux/initrd.h>
33#include <linux/cpumask.h> 32#include <linux/cpumask.h>
@@ -40,8 +39,10 @@
40#include <asm/fixmap.h> 39#include <asm/fixmap.h>
41#include <asm/e820.h> 40#include <asm/e820.h>
42#include <asm/apic.h> 41#include <asm/apic.h>
42#include <asm/bugs.h>
43#include <asm/tlb.h> 43#include <asm/tlb.h>
44#include <asm/tlbflush.h> 44#include <asm/tlbflush.h>
45#include <asm/pgalloc.h>
45#include <asm/sections.h> 46#include <asm/sections.h>
46#include <asm/paravirt.h> 47#include <asm/paravirt.h>
47 48
@@ -50,7 +51,7 @@ unsigned int __VMALLOC_RESERVE = 128 << 20;
50DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 51DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
51unsigned long highstart_pfn, highend_pfn; 52unsigned long highstart_pfn, highend_pfn;
52 53
53static int noinline do_test_wp_bit(void); 54static noinline int do_test_wp_bit(void);
54 55
55/* 56/*
56 * Creates a middle page table and puts a pointer to it in the 57 * Creates a middle page table and puts a pointer to it in the
@@ -61,26 +62,26 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
61{ 62{
62 pud_t *pud; 63 pud_t *pud;
63 pmd_t *pmd_table; 64 pmd_t *pmd_table;
64 65
65#ifdef CONFIG_X86_PAE 66#ifdef CONFIG_X86_PAE
66 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { 67 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
67 pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); 68 pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
68 69
69 paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT); 70 paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
70 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); 71 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
71 pud = pud_offset(pgd, 0); 72 pud = pud_offset(pgd, 0);
72 if (pmd_table != pmd_offset(pud, 0)) 73 BUG_ON(pmd_table != pmd_offset(pud, 0));
73 BUG();
74 } 74 }
75#endif 75#endif
76 pud = pud_offset(pgd, 0); 76 pud = pud_offset(pgd, 0);
77 pmd_table = pmd_offset(pud, 0); 77 pmd_table = pmd_offset(pud, 0);
78
78 return pmd_table; 79 return pmd_table;
79} 80}
80 81
81/* 82/*
82 * Create a page table and place a pointer to it in a middle page 83 * Create a page table and place a pointer to it in a middle page
83 * directory entry. 84 * directory entry:
84 */ 85 */
85static pte_t * __init one_page_table_init(pmd_t *pmd) 86static pte_t * __init one_page_table_init(pmd_t *pmd)
86{ 87{
@@ -90,9 +91,10 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
90#ifdef CONFIG_DEBUG_PAGEALLOC 91#ifdef CONFIG_DEBUG_PAGEALLOC
91 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); 92 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
92#endif 93#endif
93 if (!page_table) 94 if (!page_table) {
94 page_table = 95 page_table =
95 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); 96 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
97 }
96 98
97 paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT); 99 paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
98 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); 100 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
@@ -103,22 +105,21 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
103} 105}
104 106
105/* 107/*
106 * This function initializes a certain range of kernel virtual memory 108 * This function initializes a certain range of kernel virtual memory
107 * with new bootmem page tables, everywhere page tables are missing in 109 * with new bootmem page tables, everywhere page tables are missing in
108 * the given range. 110 * the given range.
109 */ 111 *
110 112 * NOTE: The pagetables are allocated contiguous on the physical space
111/* 113 * so we can cache the place of the first one and move around without
112 * NOTE: The pagetables are allocated contiguous on the physical space
113 * so we can cache the place of the first one and move around without
114 * checking the pgd every time. 114 * checking the pgd every time.
115 */ 115 */
116static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base) 116static void __init
117page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
117{ 118{
118 pgd_t *pgd;
119 pmd_t *pmd;
120 int pgd_idx, pmd_idx; 119 int pgd_idx, pmd_idx;
121 unsigned long vaddr; 120 unsigned long vaddr;
121 pgd_t *pgd;
122 pmd_t *pmd;
122 123
123 vaddr = start; 124 vaddr = start;
124 pgd_idx = pgd_index(vaddr); 125 pgd_idx = pgd_index(vaddr);
@@ -128,7 +129,8 @@ static void __init page_table_range_init (unsigned long start, unsigned long end
128 for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { 129 for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
129 pmd = one_md_table_init(pgd); 130 pmd = one_md_table_init(pgd);
130 pmd = pmd + pmd_index(vaddr); 131 pmd = pmd + pmd_index(vaddr);
131 for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { 132 for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
133 pmd++, pmd_idx++) {
132 one_page_table_init(pmd); 134 one_page_table_init(pmd);
133 135
134 vaddr += PMD_SIZE; 136 vaddr += PMD_SIZE;
@@ -145,17 +147,17 @@ static inline int is_kernel_text(unsigned long addr)
145} 147}
146 148
147/* 149/*
148 * This maps the physical memory to kernel virtual address space, a total 150 * This maps the physical memory to kernel virtual address space, a total
149 * of max_low_pfn pages, by creating page tables starting from address 151 * of max_low_pfn pages, by creating page tables starting from address
150 * PAGE_OFFSET. 152 * PAGE_OFFSET:
151 */ 153 */
152static void __init kernel_physical_mapping_init(pgd_t *pgd_base) 154static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
153{ 155{
156 int pgd_idx, pmd_idx, pte_ofs;
154 unsigned long pfn; 157 unsigned long pfn;
155 pgd_t *pgd; 158 pgd_t *pgd;
156 pmd_t *pmd; 159 pmd_t *pmd;
157 pte_t *pte; 160 pte_t *pte;
158 int pgd_idx, pmd_idx, pte_ofs;
159 161
160 pgd_idx = pgd_index(PAGE_OFFSET); 162 pgd_idx = pgd_index(PAGE_OFFSET);
161 pgd = pgd_base + pgd_idx; 163 pgd = pgd_base + pgd_idx;
@@ -165,29 +167,43 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
165 pmd = one_md_table_init(pgd); 167 pmd = one_md_table_init(pgd);
166 if (pfn >= max_low_pfn) 168 if (pfn >= max_low_pfn)
167 continue; 169 continue;
168 for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
169 unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
170 170
171 /* Map with big pages if possible, otherwise create normal page tables. */ 171 for (pmd_idx = 0;
172 pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn;
173 pmd++, pmd_idx++) {
174 unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
175
176 /*
177 * Map with big pages if possible, otherwise
178 * create normal page tables:
179 */
172 if (cpu_has_pse) { 180 if (cpu_has_pse) {
173 unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1; 181 unsigned int addr2;
174 if (is_kernel_text(address) || is_kernel_text(address2)) 182 pgprot_t prot = PAGE_KERNEL_LARGE;
175 set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC)); 183
176 else 184 addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
177 set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE)); 185 PAGE_OFFSET + PAGE_SIZE-1;
186
187 if (is_kernel_text(addr) ||
188 is_kernel_text(addr2))
189 prot = PAGE_KERNEL_LARGE_EXEC;
190
191 set_pmd(pmd, pfn_pmd(pfn, prot));
178 192
179 pfn += PTRS_PER_PTE; 193 pfn += PTRS_PER_PTE;
180 } else { 194 continue;
181 pte = one_page_table_init(pmd); 195 }
182 196 pte = one_page_table_init(pmd);
183 for (pte_ofs = 0; 197
184 pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; 198 for (pte_ofs = 0;
185 pte++, pfn++, pte_ofs++, address += PAGE_SIZE) { 199 pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
186 if (is_kernel_text(address)) 200 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
187 set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); 201 pgprot_t prot = PAGE_KERNEL;
188 else 202
189 set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); 203 if (is_kernel_text(addr))
190 } 204 prot = PAGE_KERNEL_EXEC;
205
206 set_pte(pte, pfn_pte(pfn, prot));
191 } 207 }
192 } 208 }
193 } 209 }
@@ -200,57 +216,23 @@ static inline int page_kills_ppro(unsigned long pagenr)
200 return 0; 216 return 0;
201} 217}
202 218
203int page_is_ram(unsigned long pagenr)
204{
205 int i;
206 unsigned long addr, end;
207
208 if (efi_enabled) {
209 efi_memory_desc_t *md;
210 void *p;
211
212 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
213 md = p;
214 if (!is_available_memory(md))
215 continue;
216 addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
217 end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;
218
219 if ((pagenr >= addr) && (pagenr < end))
220 return 1;
221 }
222 return 0;
223 }
224
225 for (i = 0; i < e820.nr_map; i++) {
226
227 if (e820.map[i].type != E820_RAM) /* not usable memory */
228 continue;
229 /*
230 * !!!FIXME!!! Some BIOSen report areas as RAM that
231 * are not. Notably the 640->1Mb area. We need a sanity
232 * check here.
233 */
234 addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
235 end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
236 if ((pagenr >= addr) && (pagenr < end))
237 return 1;
238 }
239 return 0;
240}
241
242#ifdef CONFIG_HIGHMEM 219#ifdef CONFIG_HIGHMEM
243pte_t *kmap_pte; 220pte_t *kmap_pte;
244pgprot_t kmap_prot; 221pgprot_t kmap_prot;
245 222
246#define kmap_get_fixmap_pte(vaddr) \ 223static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
247 pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr)) 224{
225 return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
226 vaddr), vaddr), vaddr);
227}
248 228
249static void __init kmap_init(void) 229static void __init kmap_init(void)
250{ 230{
251 unsigned long kmap_vstart; 231 unsigned long kmap_vstart;
252 232
253 /* cache the first kmap pte */ 233 /*
234 * Cache the first kmap pte:
235 */
254 kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); 236 kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
255 kmap_pte = kmap_get_fixmap_pte(kmap_vstart); 237 kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
256 238
@@ -259,11 +241,11 @@ static void __init kmap_init(void)
259 241
260static void __init permanent_kmaps_init(pgd_t *pgd_base) 242static void __init permanent_kmaps_init(pgd_t *pgd_base)
261{ 243{
244 unsigned long vaddr;
262 pgd_t *pgd; 245 pgd_t *pgd;
263 pud_t *pud; 246 pud_t *pud;
264 pmd_t *pmd; 247 pmd_t *pmd;
265 pte_t *pte; 248 pte_t *pte;
266 unsigned long vaddr;
267 249
268 vaddr = PKMAP_BASE; 250 vaddr = PKMAP_BASE;
269 page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); 251 page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
@@ -272,7 +254,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
272 pud = pud_offset(pgd, vaddr); 254 pud = pud_offset(pgd, vaddr);
273 pmd = pmd_offset(pud, vaddr); 255 pmd = pmd_offset(pud, vaddr);
274 pte = pte_offset_kernel(pmd, vaddr); 256 pte = pte_offset_kernel(pmd, vaddr);
275 pkmap_page_table = pte; 257 pkmap_page_table = pte;
276} 258}
277 259
278static void __meminit free_new_highpage(struct page *page) 260static void __meminit free_new_highpage(struct page *page)
@@ -291,7 +273,8 @@ void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
291 SetPageReserved(page); 273 SetPageReserved(page);
292} 274}
293 275
294static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long pfn) 276static int __meminit
277add_one_highpage_hotplug(struct page *page, unsigned long pfn)
295{ 278{
296 free_new_highpage(page); 279 free_new_highpage(page);
297 totalram_pages++; 280 totalram_pages++;
@@ -299,6 +282,7 @@ static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long p
299 max_mapnr = max(pfn, max_mapnr); 282 max_mapnr = max(pfn, max_mapnr);
300#endif 283#endif
301 num_physpages++; 284 num_physpages++;
285
302 return 0; 286 return 0;
303} 287}
304 288
@@ -306,7 +290,7 @@ static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long p
306 * Not currently handling the NUMA case. 290 * Not currently handling the NUMA case.
307 * Assuming single node and all memory that 291 * Assuming single node and all memory that
308 * has been added dynamically that would be 292 * has been added dynamically that would be
309 * onlined here is in HIGHMEM 293 * onlined here is in HIGHMEM.
310 */ 294 */
311void __meminit online_page(struct page *page) 295void __meminit online_page(struct page *page)
312{ 296{
@@ -314,13 +298,11 @@ void __meminit online_page(struct page *page)
314 add_one_highpage_hotplug(page, page_to_pfn(page)); 298 add_one_highpage_hotplug(page, page_to_pfn(page));
315} 299}
316 300
317 301#ifndef CONFIG_NUMA
318#ifdef CONFIG_NUMA
319extern void set_highmem_pages_init(int);
320#else
321static void __init set_highmem_pages_init(int bad_ppro) 302static void __init set_highmem_pages_init(int bad_ppro)
322{ 303{
323 int pfn; 304 int pfn;
305
324 for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) { 306 for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) {
325 /* 307 /*
326		 * Holes under sparsemem might not have mem_map[]:		308
@@ -330,23 +312,18 @@ static void __init set_highmem_pages_init(int bad_ppro)
330 } 312 }
331 totalram_pages += totalhigh_pages; 313 totalram_pages += totalhigh_pages;
332} 314}
333#endif /* CONFIG_FLATMEM */ 315#endif /* !CONFIG_NUMA */
334 316
335#else 317#else
336#define kmap_init() do { } while (0) 318# define kmap_init() do { } while (0)
337#define permanent_kmaps_init(pgd_base) do { } while (0) 319# define permanent_kmaps_init(pgd_base) do { } while (0)
338#define set_highmem_pages_init(bad_ppro) do { } while (0) 320# define set_highmem_pages_init(bad_ppro) do { } while (0)
339#endif /* CONFIG_HIGHMEM */ 321#endif /* CONFIG_HIGHMEM */
340 322
341unsigned long long __PAGE_KERNEL = _PAGE_KERNEL; 323pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
342EXPORT_SYMBOL(__PAGE_KERNEL); 324EXPORT_SYMBOL(__PAGE_KERNEL);
343unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
344 325
345#ifdef CONFIG_NUMA 326pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
346extern void __init remap_numa_kva(void);
347#else
348#define remap_numa_kva() do {} while (0)
349#endif
350 327
351void __init native_pagetable_setup_start(pgd_t *base) 328void __init native_pagetable_setup_start(pgd_t *base)
352{ 329{
@@ -372,7 +349,7 @@ void __init native_pagetable_setup_start(pgd_t *base)
372 memset(&base[USER_PTRS_PER_PGD], 0, 349 memset(&base[USER_PTRS_PER_PGD], 0,
373 KERNEL_PGD_PTRS * sizeof(pgd_t)); 350 KERNEL_PGD_PTRS * sizeof(pgd_t));
374#else 351#else
375 paravirt_alloc_pd(__pa(swapper_pg_dir) >> PAGE_SHIFT); 352 paravirt_alloc_pd(&init_mm, __pa(base) >> PAGE_SHIFT);
376#endif 353#endif
377} 354}
378 355
@@ -410,10 +387,10 @@ void __init native_pagetable_setup_done(pgd_t *base)
410 * be partially populated, and so it avoids stomping on any existing 387 * be partially populated, and so it avoids stomping on any existing
411 * mappings. 388 * mappings.
412 */ 389 */
413static void __init pagetable_init (void) 390static void __init pagetable_init(void)
414{ 391{
415 unsigned long vaddr, end;
416 pgd_t *pgd_base = swapper_pg_dir; 392 pgd_t *pgd_base = swapper_pg_dir;
393 unsigned long vaddr, end;
417 394
418 paravirt_pagetable_setup_start(pgd_base); 395 paravirt_pagetable_setup_start(pgd_base);
419 396
@@ -435,9 +412,11 @@ static void __init pagetable_init (void)
435 * Fixed mappings, only the page table structure has to be 412 * Fixed mappings, only the page table structure has to be
436 * created - mappings will be set by set_fixmap(): 413 * created - mappings will be set by set_fixmap():
437 */ 414 */
415 early_ioremap_clear();
438 vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; 416 vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
439 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; 417 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
440 page_table_range_init(vaddr, end, pgd_base); 418 page_table_range_init(vaddr, end, pgd_base);
419 early_ioremap_reset();
441 420
442 permanent_kmaps_init(pgd_base); 421 permanent_kmaps_init(pgd_base);
443 422
@@ -450,7 +429,7 @@ static void __init pagetable_init (void)
450 * driver might have split up a kernel 4MB mapping. 429 * driver might have split up a kernel 4MB mapping.
451 */ 430 */
452char __nosavedata swsusp_pg_dir[PAGE_SIZE] 431char __nosavedata swsusp_pg_dir[PAGE_SIZE]
453 __attribute__ ((aligned (PAGE_SIZE))); 432 __attribute__ ((aligned(PAGE_SIZE)));
454 433
455static inline void save_pg_dir(void) 434static inline void save_pg_dir(void)
456{ 435{
@@ -462,7 +441,7 @@ static inline void save_pg_dir(void)
462} 441}
463#endif 442#endif
464 443
465void zap_low_mappings (void) 444void zap_low_mappings(void)
466{ 445{
467 int i; 446 int i;
468 447
@@ -474,22 +453,24 @@ void zap_low_mappings (void)
474 * Note that "pgd_clear()" doesn't do it for 453 * Note that "pgd_clear()" doesn't do it for
475 * us, because pgd_clear() is a no-op on i386. 454 * us, because pgd_clear() is a no-op on i386.
476 */ 455 */
477 for (i = 0; i < USER_PTRS_PER_PGD; i++) 456 for (i = 0; i < USER_PTRS_PER_PGD; i++) {
478#ifdef CONFIG_X86_PAE 457#ifdef CONFIG_X86_PAE
479 set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); 458 set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
480#else 459#else
481 set_pgd(swapper_pg_dir+i, __pgd(0)); 460 set_pgd(swapper_pg_dir+i, __pgd(0));
482#endif 461#endif
462 }
483 flush_tlb_all(); 463 flush_tlb_all();
484} 464}
485 465
486int nx_enabled = 0; 466int nx_enabled;
467
468pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX;
469EXPORT_SYMBOL_GPL(__supported_pte_mask);
487 470
488#ifdef CONFIG_X86_PAE 471#ifdef CONFIG_X86_PAE
489 472
490static int disable_nx __initdata = 0; 473static int disable_nx __initdata;
491u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
492EXPORT_SYMBOL_GPL(__supported_pte_mask);
493 474
494/* 475/*
495 * noexec = on|off 476 * noexec = on|off
@@ -506,11 +487,14 @@ static int __init noexec_setup(char *str)
506 __supported_pte_mask |= _PAGE_NX; 487 __supported_pte_mask |= _PAGE_NX;
507 disable_nx = 0; 488 disable_nx = 0;
508 } 489 }
509 } else if (!strcmp(str,"off")) { 490 } else {
510 disable_nx = 1; 491 if (!strcmp(str, "off")) {
511 __supported_pte_mask &= ~_PAGE_NX; 492 disable_nx = 1;
512 } else 493 __supported_pte_mask &= ~_PAGE_NX;
513 return -EINVAL; 494 } else {
495 return -EINVAL;
496 }
497 }
514 498
515 return 0; 499 return 0;
516} 500}
@@ -522,6 +506,7 @@ static void __init set_nx(void)
522 506
523 if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { 507 if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
524 cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); 508 cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
509
525 if ((v[3] & (1 << 20)) && !disable_nx) { 510 if ((v[3] & (1 << 20)) && !disable_nx) {
526 rdmsr(MSR_EFER, l, h); 511 rdmsr(MSR_EFER, l, h);
527 l |= EFER_NX; 512 l |= EFER_NX;
@@ -531,35 +516,6 @@ static void __init set_nx(void)
531 } 516 }
532 } 517 }
533} 518}
534
535/*
536 * Enables/disables executability of a given kernel page and
537 * returns the previous setting.
538 */
539int __init set_kernel_exec(unsigned long vaddr, int enable)
540{
541 pte_t *pte;
542 int ret = 1;
543
544 if (!nx_enabled)
545 goto out;
546
547 pte = lookup_address(vaddr);
548 BUG_ON(!pte);
549
550 if (!pte_exec_kernel(*pte))
551 ret = 0;
552
553 if (enable)
554 pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
555 else
556 pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
557 pte_update_defer(&init_mm, vaddr, pte);
558 __flush_tlb_all();
559out:
560 return ret;
561}
562
563#endif 519#endif
564 520
565/* 521/*
@@ -574,9 +530,8 @@ void __init paging_init(void)
574#ifdef CONFIG_X86_PAE 530#ifdef CONFIG_X86_PAE
575 set_nx(); 531 set_nx();
576 if (nx_enabled) 532 if (nx_enabled)
577 printk("NX (Execute Disable) protection: active\n"); 533 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
578#endif 534#endif
579
580 pagetable_init(); 535 pagetable_init();
581 536
582 load_cr3(swapper_pg_dir); 537 load_cr3(swapper_pg_dir);
@@ -600,10 +555,10 @@ void __init paging_init(void)
600 * used to involve black magic jumps to work around some nasty CPU bugs, 555 * used to involve black magic jumps to work around some nasty CPU bugs,
601 * but fortunately the switch to using exceptions got rid of all that. 556 * but fortunately the switch to using exceptions got rid of all that.
602 */ 557 */
603
604static void __init test_wp_bit(void) 558static void __init test_wp_bit(void)
605{ 559{
606 printk("Checking if this processor honours the WP bit even in supervisor mode... "); 560 printk(KERN_INFO
561 "Checking if this processor honours the WP bit even in supervisor mode...");
607 562
608 /* Any page-aligned address will do, the test is non-destructive */ 563 /* Any page-aligned address will do, the test is non-destructive */
609 __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY); 564 __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
@@ -611,47 +566,46 @@ static void __init test_wp_bit(void)
611 clear_fixmap(FIX_WP_TEST); 566 clear_fixmap(FIX_WP_TEST);
612 567
613 if (!boot_cpu_data.wp_works_ok) { 568 if (!boot_cpu_data.wp_works_ok) {
614 printk("No.\n"); 569 printk(KERN_CONT "No.\n");
615#ifdef CONFIG_X86_WP_WORKS_OK 570#ifdef CONFIG_X86_WP_WORKS_OK
616 panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!"); 571 panic(
572 "This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
617#endif 573#endif
618 } else { 574 } else {
619 printk("Ok.\n"); 575 printk(KERN_CONT "Ok.\n");
620 } 576 }
621} 577}
622 578
623static struct kcore_list kcore_mem, kcore_vmalloc; 579static struct kcore_list kcore_mem, kcore_vmalloc;
624 580
625void __init mem_init(void) 581void __init mem_init(void)
626{ 582{
627 extern int ppro_with_ram_bug(void);
628 int codesize, reservedpages, datasize, initsize; 583 int codesize, reservedpages, datasize, initsize;
629 int tmp; 584 int tmp, bad_ppro;
630 int bad_ppro;
631 585
632#ifdef CONFIG_FLATMEM 586#ifdef CONFIG_FLATMEM
633 BUG_ON(!mem_map); 587 BUG_ON(!mem_map);
634#endif 588#endif
635
636 bad_ppro = ppro_with_ram_bug(); 589 bad_ppro = ppro_with_ram_bug();
637 590
638#ifdef CONFIG_HIGHMEM 591#ifdef CONFIG_HIGHMEM
639 /* check that fixmap and pkmap do not overlap */ 592 /* check that fixmap and pkmap do not overlap */
640 if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) { 593 if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
641 printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n"); 594 printk(KERN_ERR
595 "fixmap and kmap areas overlap - this will crash\n");
642 printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n", 596 printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
643 PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START); 597 PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
598 FIXADDR_START);
644 BUG(); 599 BUG();
645 } 600 }
646#endif 601#endif
647
648 /* this will put all low memory onto the freelists */ 602 /* this will put all low memory onto the freelists */
649 totalram_pages += free_all_bootmem(); 603 totalram_pages += free_all_bootmem();
650 604
651 reservedpages = 0; 605 reservedpages = 0;
652 for (tmp = 0; tmp < max_low_pfn; tmp++) 606 for (tmp = 0; tmp < max_low_pfn; tmp++)
653 /* 607 /*
654 * Only count reserved RAM pages 608 * Only count reserved RAM pages:
655 */ 609 */
656 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) 610 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
657 reservedpages++; 611 reservedpages++;
@@ -662,11 +616,12 @@ void __init mem_init(void)
662 datasize = (unsigned long) &_edata - (unsigned long) &_etext; 616 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
663 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; 617 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
664 618
665 kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); 619 kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
666 kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, 620 kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
667 VMALLOC_END-VMALLOC_START); 621 VMALLOC_END-VMALLOC_START);
668 622
669 printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n", 623 printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
624 "%dk reserved, %dk data, %dk init, %ldk highmem)\n",
670 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), 625 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
671 num_physpages << (PAGE_SHIFT-10), 626 num_physpages << (PAGE_SHIFT-10),
672 codesize >> 10, 627 codesize >> 10,
@@ -677,45 +632,46 @@ void __init mem_init(void)
677 ); 632 );
678 633
679#if 1 /* double-sanity-check paranoia */ 634#if 1 /* double-sanity-check paranoia */
680 printk("virtual kernel memory layout:\n" 635 printk(KERN_INFO "virtual kernel memory layout:\n"
681 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" 636 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
682#ifdef CONFIG_HIGHMEM 637#ifdef CONFIG_HIGHMEM
683 " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" 638 " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
684#endif 639#endif
685 " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n" 640 " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
686 " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n" 641 " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
687 " .init : 0x%08lx - 0x%08lx (%4ld kB)\n" 642 " .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
688 " .data : 0x%08lx - 0x%08lx (%4ld kB)\n" 643 " .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
689 " .text : 0x%08lx - 0x%08lx (%4ld kB)\n", 644 " .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
690 FIXADDR_START, FIXADDR_TOP, 645 FIXADDR_START, FIXADDR_TOP,
691 (FIXADDR_TOP - FIXADDR_START) >> 10, 646 (FIXADDR_TOP - FIXADDR_START) >> 10,
692 647
693#ifdef CONFIG_HIGHMEM 648#ifdef CONFIG_HIGHMEM
694 PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, 649 PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
695 (LAST_PKMAP*PAGE_SIZE) >> 10, 650 (LAST_PKMAP*PAGE_SIZE) >> 10,
696#endif 651#endif
697 652
698 VMALLOC_START, VMALLOC_END, 653 VMALLOC_START, VMALLOC_END,
699 (VMALLOC_END - VMALLOC_START) >> 20, 654 (VMALLOC_END - VMALLOC_START) >> 20,
700 655
701 (unsigned long)__va(0), (unsigned long)high_memory, 656 (unsigned long)__va(0), (unsigned long)high_memory,
702 ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20, 657 ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
703 658
704 (unsigned long)&__init_begin, (unsigned long)&__init_end, 659 (unsigned long)&__init_begin, (unsigned long)&__init_end,
705 ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10, 660 ((unsigned long)&__init_end -
661 (unsigned long)&__init_begin) >> 10,
706 662
707 (unsigned long)&_etext, (unsigned long)&_edata, 663 (unsigned long)&_etext, (unsigned long)&_edata,
708 ((unsigned long)&_edata - (unsigned long)&_etext) >> 10, 664 ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
709 665
710 (unsigned long)&_text, (unsigned long)&_etext, 666 (unsigned long)&_text, (unsigned long)&_etext,
711 ((unsigned long)&_etext - (unsigned long)&_text) >> 10); 667 ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
712 668
713#ifdef CONFIG_HIGHMEM 669#ifdef CONFIG_HIGHMEM
714 BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START); 670 BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
715 BUG_ON(VMALLOC_END > PKMAP_BASE); 671 BUG_ON(VMALLOC_END > PKMAP_BASE);
716#endif 672#endif
717 BUG_ON(VMALLOC_START > VMALLOC_END); 673 BUG_ON(VMALLOC_START > VMALLOC_END);
718 BUG_ON((unsigned long)high_memory > VMALLOC_START); 674 BUG_ON((unsigned long)high_memory > VMALLOC_START);
719#endif /* double-sanity-check paranoia */ 675#endif /* double-sanity-check paranoia */
720 676
721#ifdef CONFIG_X86_PAE 677#ifdef CONFIG_X86_PAE
@@ -746,49 +702,38 @@ int arch_add_memory(int nid, u64 start, u64 size)
746 702
747 return __add_pages(zone, start_pfn, nr_pages); 703 return __add_pages(zone, start_pfn, nr_pages);
748} 704}
749
750#endif 705#endif
751 706
752struct kmem_cache *pmd_cache;
753
754void __init pgtable_cache_init(void)
755{
756 if (PTRS_PER_PMD > 1)
757 pmd_cache = kmem_cache_create("pmd",
758 PTRS_PER_PMD*sizeof(pmd_t),
759 PTRS_PER_PMD*sizeof(pmd_t),
760 SLAB_PANIC,
761 pmd_ctor);
762}
763
764/* 707/*
765 * This function cannot be __init, since exceptions don't work in that 708 * This function cannot be __init, since exceptions don't work in that
766 * section. Put this after the callers, so that it cannot be inlined. 709 * section. Put this after the callers, so that it cannot be inlined.
767 */ 710 */
768static int noinline do_test_wp_bit(void) 711static noinline int do_test_wp_bit(void)
769{ 712{
770 char tmp_reg; 713 char tmp_reg;
771 int flag; 714 int flag;
772 715
773 __asm__ __volatile__( 716 __asm__ __volatile__(
774 " movb %0,%1 \n" 717 " movb %0, %1 \n"
775 "1: movb %1,%0 \n" 718 "1: movb %1, %0 \n"
776 " xorl %2,%2 \n" 719 " xorl %2, %2 \n"
777 "2: \n" 720 "2: \n"
778 ".section __ex_table,\"a\"\n" 721 ".section __ex_table, \"a\"\n"
779 " .align 4 \n" 722 " .align 4 \n"
780 " .long 1b,2b \n" 723 " .long 1b, 2b \n"
781 ".previous \n" 724 ".previous \n"
782 :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)), 725 :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
783 "=q" (tmp_reg), 726 "=q" (tmp_reg),
784 "=r" (flag) 727 "=r" (flag)
785 :"2" (1) 728 :"2" (1)
786 :"memory"); 729 :"memory");
787 730
788 return flag; 731 return flag;
789} 732}
790 733
791#ifdef CONFIG_DEBUG_RODATA 734#ifdef CONFIG_DEBUG_RODATA
735const int rodata_test_data = 0xC3;
736EXPORT_SYMBOL_GPL(rodata_test_data);
792 737
793void mark_rodata_ro(void) 738void mark_rodata_ro(void)
794{ 739{
@@ -801,32 +746,58 @@ void mark_rodata_ro(void)
801 if (num_possible_cpus() <= 1) 746 if (num_possible_cpus() <= 1)
802#endif 747#endif
803 { 748 {
804 change_page_attr(virt_to_page(start), 749 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
805 size >> PAGE_SHIFT, PAGE_KERNEL_RX); 750 printk(KERN_INFO "Write protecting the kernel text: %luk\n",
806 printk("Write protecting the kernel text: %luk\n", size >> 10); 751 size >> 10);
752
753#ifdef CONFIG_CPA_DEBUG
754 printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
755 start, start+size);
756 set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
757
758 printk(KERN_INFO "Testing CPA: write protecting again\n");
759 set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
760#endif
807 } 761 }
808#endif 762#endif
809 start += size; 763 start += size;
810 size = (unsigned long)__end_rodata - start; 764 size = (unsigned long)__end_rodata - start;
811 change_page_attr(virt_to_page(start), 765 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
812 size >> PAGE_SHIFT, PAGE_KERNEL_RO); 766 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
813 printk("Write protecting the kernel read-only data: %luk\n", 767 size >> 10);
814 size >> 10); 768 rodata_test();
815 769
816 /* 770#ifdef CONFIG_CPA_DEBUG
817 * change_page_attr() requires a global_flush_tlb() call after it. 771 printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
818 * We do this after the printk so that if something went wrong in the 772 set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
819 * change, the printk gets out at least to give a better debug hint 773
820 * of who is the culprit. 774 printk(KERN_INFO "Testing CPA: write protecting again\n");
821 */ 775 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
822 global_flush_tlb(); 776#endif
823} 777}
824#endif 778#endif
825 779
826void free_init_pages(char *what, unsigned long begin, unsigned long end) 780void free_init_pages(char *what, unsigned long begin, unsigned long end)
827{ 781{
782#ifdef CONFIG_DEBUG_PAGEALLOC
783 /*
784 * If debugging page accesses then do not free this memory but
785 * mark them not present - any buggy init-section access will
786 * create a kernel page fault:
787 */
788 printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
789 begin, PAGE_ALIGN(end));
790 set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
791#else
828 unsigned long addr; 792 unsigned long addr;
829 793
794 /*
795 * We just marked the kernel text read only above, now that
796 * we are going to free part of that, we need to make that
797 * writeable first.
798 */
799 set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
800
830 for (addr = begin; addr < end; addr += PAGE_SIZE) { 801 for (addr = begin; addr < end; addr += PAGE_SIZE) {
831 ClearPageReserved(virt_to_page(addr)); 802 ClearPageReserved(virt_to_page(addr));
832 init_page_count(virt_to_page(addr)); 803 init_page_count(virt_to_page(addr));
@@ -835,6 +806,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
835 totalram_pages++; 806 totalram_pages++;
836 } 807 }
837 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); 808 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
809#endif
838} 810}
839 811
840void free_initmem(void) 812void free_initmem(void)
@@ -850,4 +822,3 @@ void free_initrd_mem(unsigned long start, unsigned long end)
850 free_init_pages("initrd memory", start, end); 822 free_init_pages("initrd memory", start, end);
851} 823}
852#endif 824#endif
853
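The mark_rodata_ro() hunks above replace the old change_page_attr() plus global_flush_tlb() sequence with the new set_pages_ro()/set_pages_rw() helpers; the explicit flush call disappears, so the new helpers are evidently expected to take care of the TLB themselves. A short sketch of the resulting pattern for write-protecting the read-only data section and, under CONFIG_CPA_DEBUG, exercising the reverse path; this is an illustration of the API shown in the diff, with the standard section symbols from <asm/sections.h>, not additional patch content:

	unsigned long start = (unsigned long)__start_rodata;
	unsigned long size  = (unsigned long)__end_rodata - start;

	/* Write-protect .rodata and verify it with the new rodata_test(). */
	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
	rodata_test();

	#ifdef CONFIG_CPA_DEBUG
	/* Exercise the CPA code: make it writable again, then re-protect. */
	set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
	#endif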
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 0f9c8c890658..cc50a13ce8d9 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -43,12 +43,10 @@
43#include <asm/proto.h> 43#include <asm/proto.h>
44#include <asm/smp.h> 44#include <asm/smp.h>
45#include <asm/sections.h> 45#include <asm/sections.h>
46#include <asm/kdebug.h>
47#include <asm/numa.h>
46 48
47#ifndef Dprintk 49const struct dma_mapping_ops *dma_ops;
48#define Dprintk(x...)
49#endif
50
51const struct dma_mapping_ops* dma_ops;
52EXPORT_SYMBOL(dma_ops); 50EXPORT_SYMBOL(dma_ops);
53 51
54static unsigned long dma_reserve __initdata; 52static unsigned long dma_reserve __initdata;
@@ -65,22 +63,26 @@ void show_mem(void)
65{ 63{
66 long i, total = 0, reserved = 0; 64 long i, total = 0, reserved = 0;
67 long shared = 0, cached = 0; 65 long shared = 0, cached = 0;
68 pg_data_t *pgdat;
69 struct page *page; 66 struct page *page;
67 pg_data_t *pgdat;
70 68
71 printk(KERN_INFO "Mem-info:\n"); 69 printk(KERN_INFO "Mem-info:\n");
72 show_free_areas(); 70 show_free_areas();
73 printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); 71 printk(KERN_INFO "Free swap: %6ldkB\n",
72 nr_swap_pages << (PAGE_SHIFT-10));
74 73
75 for_each_online_pgdat(pgdat) { 74 for_each_online_pgdat(pgdat) {
76 for (i = 0; i < pgdat->node_spanned_pages; ++i) { 75 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
77 /* this loop can take a while with 256 GB and 4k pages 76 /*
78 so update the NMI watchdog */ 77 * This loop can take a while with 256 GB and
79 if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) { 78 * 4k pages so defer the NMI watchdog:
79 */
80 if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
80 touch_nmi_watchdog(); 81 touch_nmi_watchdog();
81 } 82
82 if (!pfn_valid(pgdat->node_start_pfn + i)) 83 if (!pfn_valid(pgdat->node_start_pfn + i))
83 continue; 84 continue;
85
84 page = pfn_to_page(pgdat->node_start_pfn + i); 86 page = pfn_to_page(pgdat->node_start_pfn + i);
85 total++; 87 total++;
86 if (PageReserved(page)) 88 if (PageReserved(page))
@@ -89,51 +91,58 @@ void show_mem(void)
89 cached++; 91 cached++;
90 else if (page_count(page)) 92 else if (page_count(page))
91 shared += page_count(page) - 1; 93 shared += page_count(page) - 1;
92 } 94 }
93 } 95 }
94 printk(KERN_INFO "%lu pages of RAM\n", total); 96 printk(KERN_INFO "%lu pages of RAM\n", total);
95 printk(KERN_INFO "%lu reserved pages\n",reserved); 97 printk(KERN_INFO "%lu reserved pages\n", reserved);
96 printk(KERN_INFO "%lu pages shared\n",shared); 98 printk(KERN_INFO "%lu pages shared\n", shared);
97 printk(KERN_INFO "%lu pages swap cached\n",cached); 99 printk(KERN_INFO "%lu pages swap cached\n", cached);
98} 100}
99 101
100int after_bootmem; 102int after_bootmem;
101 103
102static __init void *spp_getpage(void) 104static __init void *spp_getpage(void)
103{ 105{
104 void *ptr; 106 void *ptr;
107
105 if (after_bootmem) 108 if (after_bootmem)
106 ptr = (void *) get_zeroed_page(GFP_ATOMIC); 109 ptr = (void *) get_zeroed_page(GFP_ATOMIC);
107 else 110 else
108 ptr = alloc_bootmem_pages(PAGE_SIZE); 111 ptr = alloc_bootmem_pages(PAGE_SIZE);
109 if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
110 panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
111 112
112 Dprintk("spp_getpage %p\n", ptr); 113 if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
114 panic("set_pte_phys: cannot allocate page data %s\n",
115 after_bootmem ? "after bootmem" : "");
116 }
117
118 pr_debug("spp_getpage %p\n", ptr);
119
113 return ptr; 120 return ptr;
114} 121}
115 122
116static __init void set_pte_phys(unsigned long vaddr, 123static __init void
117 unsigned long phys, pgprot_t prot) 124set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
118{ 125{
119 pgd_t *pgd; 126 pgd_t *pgd;
120 pud_t *pud; 127 pud_t *pud;
121 pmd_t *pmd; 128 pmd_t *pmd;
122 pte_t *pte, new_pte; 129 pte_t *pte, new_pte;
123 130
124 Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys); 131 pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);
125 132
126 pgd = pgd_offset_k(vaddr); 133 pgd = pgd_offset_k(vaddr);
127 if (pgd_none(*pgd)) { 134 if (pgd_none(*pgd)) {
128 printk("PGD FIXMAP MISSING, it should be setup in head.S!\n"); 135 printk(KERN_ERR
136 "PGD FIXMAP MISSING, it should be setup in head.S!\n");
129 return; 137 return;
130 } 138 }
131 pud = pud_offset(pgd, vaddr); 139 pud = pud_offset(pgd, vaddr);
132 if (pud_none(*pud)) { 140 if (pud_none(*pud)) {
133 pmd = (pmd_t *) spp_getpage(); 141 pmd = (pmd_t *) spp_getpage();
134 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); 142 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
135 if (pmd != pmd_offset(pud, 0)) { 143 if (pmd != pmd_offset(pud, 0)) {
136 printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0)); 144 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
145 pmd, pmd_offset(pud, 0));
137 return; 146 return;
138 } 147 }
139 } 148 }
@@ -142,7 +151,7 @@ static __init void set_pte_phys(unsigned long vaddr,
142 pte = (pte_t *) spp_getpage(); 151 pte = (pte_t *) spp_getpage();
143 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); 152 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
144 if (pte != pte_offset_kernel(pmd, 0)) { 153 if (pte != pte_offset_kernel(pmd, 0)) {
145 printk("PAGETABLE BUG #02!\n"); 154 printk(KERN_ERR "PAGETABLE BUG #02!\n");
146 return; 155 return;
147 } 156 }
148 } 157 }
@@ -162,33 +171,35 @@ static __init void set_pte_phys(unsigned long vaddr,
162} 171}
163 172
164/* NOTE: this is meant to be run only at boot */ 173/* NOTE: this is meant to be run only at boot */
165void __init 174void __init
166__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot) 175__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
167{ 176{
168 unsigned long address = __fix_to_virt(idx); 177 unsigned long address = __fix_to_virt(idx);
169 178
170 if (idx >= __end_of_fixed_addresses) { 179 if (idx >= __end_of_fixed_addresses) {
171 printk("Invalid __set_fixmap\n"); 180 printk(KERN_ERR "Invalid __set_fixmap\n");
172 return; 181 return;
173 } 182 }
174 set_pte_phys(address, phys, prot); 183 set_pte_phys(address, phys, prot);
175} 184}
176 185
177unsigned long __meminitdata table_start, table_end; 186static unsigned long __initdata table_start;
187static unsigned long __meminitdata table_end;
178 188
179static __meminit void *alloc_low_page(unsigned long *phys) 189static __meminit void *alloc_low_page(unsigned long *phys)
180{ 190{
181 unsigned long pfn = table_end++; 191 unsigned long pfn = table_end++;
182 void *adr; 192 void *adr;
183 193
184 if (after_bootmem) { 194 if (after_bootmem) {
185 adr = (void *)get_zeroed_page(GFP_ATOMIC); 195 adr = (void *)get_zeroed_page(GFP_ATOMIC);
186 *phys = __pa(adr); 196 *phys = __pa(adr);
197
187 return adr; 198 return adr;
188 } 199 }
189 200
190 if (pfn >= end_pfn) 201 if (pfn >= end_pfn)
191 panic("alloc_low_page: ran out of memory"); 202 panic("alloc_low_page: ran out of memory");
192 203
193 adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); 204 adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
194 memset(adr, 0, PAGE_SIZE); 205 memset(adr, 0, PAGE_SIZE);
@@ -197,44 +208,49 @@ static __meminit void *alloc_low_page(unsigned long *phys)
197} 208}
198 209
199static __meminit void unmap_low_page(void *adr) 210static __meminit void unmap_low_page(void *adr)
200{ 211{
201
202 if (after_bootmem) 212 if (after_bootmem)
203 return; 213 return;
204 214
205 early_iounmap(adr, PAGE_SIZE); 215 early_iounmap(adr, PAGE_SIZE);
206} 216}
207 217
208/* Must run before zap_low_mappings */ 218/* Must run before zap_low_mappings */
209__meminit void *early_ioremap(unsigned long addr, unsigned long size) 219__meminit void *early_ioremap(unsigned long addr, unsigned long size)
210{ 220{
211 unsigned long vaddr;
212 pmd_t *pmd, *last_pmd; 221 pmd_t *pmd, *last_pmd;
222 unsigned long vaddr;
213 int i, pmds; 223 int i, pmds;
214 224
215 pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; 225 pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
216 vaddr = __START_KERNEL_map; 226 vaddr = __START_KERNEL_map;
217 pmd = level2_kernel_pgt; 227 pmd = level2_kernel_pgt;
218 last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1; 228 last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
229
219 for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) { 230 for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
220 for (i = 0; i < pmds; i++) { 231 for (i = 0; i < pmds; i++) {
221 if (pmd_present(pmd[i])) 232 if (pmd_present(pmd[i]))
222 goto next; 233 goto continue_outer_loop;
223 } 234 }
224 vaddr += addr & ~PMD_MASK; 235 vaddr += addr & ~PMD_MASK;
225 addr &= PMD_MASK; 236 addr &= PMD_MASK;
237
226 for (i = 0; i < pmds; i++, addr += PMD_SIZE) 238 for (i = 0; i < pmds; i++, addr += PMD_SIZE)
227 set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE)); 239 set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
228 __flush_tlb(); 240 __flush_tlb_all();
241
229 return (void *)vaddr; 242 return (void *)vaddr;
230 next: 243continue_outer_loop:
231 ; 244 ;
232 } 245 }
233 printk("early_ioremap(0x%lx, %lu) failed\n", addr, size); 246 printk(KERN_ERR "early_ioremap(0x%lx, %lu) failed\n", addr, size);
247
234 return NULL; 248 return NULL;
235} 249}
236 250
237/* To avoid virtual aliases later */ 251/*
252 * To avoid virtual aliases later:
253 */
238__meminit void early_iounmap(void *addr, unsigned long size) 254__meminit void early_iounmap(void *addr, unsigned long size)
239{ 255{
240 unsigned long vaddr; 256 unsigned long vaddr;
@@ -244,9 +260,11 @@ __meminit void early_iounmap(void *addr, unsigned long size)
244 vaddr = (unsigned long)addr; 260 vaddr = (unsigned long)addr;
245 pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; 261 pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
246 pmd = level2_kernel_pgt + pmd_index(vaddr); 262 pmd = level2_kernel_pgt + pmd_index(vaddr);
263
247 for (i = 0; i < pmds; i++) 264 for (i = 0; i < pmds; i++)
248 pmd_clear(pmd + i); 265 pmd_clear(pmd + i);
249 __flush_tlb(); 266
267 __flush_tlb_all();
250} 268}
251 269
252static void __meminit 270static void __meminit
@@ -259,16 +277,17 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
259 pmd_t *pmd = pmd_page + pmd_index(address); 277 pmd_t *pmd = pmd_page + pmd_index(address);
260 278
261 if (address >= end) { 279 if (address >= end) {
262 if (!after_bootmem) 280 if (!after_bootmem) {
263 for (; i < PTRS_PER_PMD; i++, pmd++) 281 for (; i < PTRS_PER_PMD; i++, pmd++)
264 set_pmd(pmd, __pmd(0)); 282 set_pmd(pmd, __pmd(0));
283 }
265 break; 284 break;
266 } 285 }
267 286
268 if (pmd_val(*pmd)) 287 if (pmd_val(*pmd))
269 continue; 288 continue;
270 289
271 entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address; 290 entry = __PAGE_KERNEL_LARGE|_PAGE_GLOBAL|address;
272 entry &= __supported_pte_mask; 291 entry &= __supported_pte_mask;
273 set_pmd(pmd, __pmd(entry)); 292 set_pmd(pmd, __pmd(entry));
274 } 293 }
@@ -277,19 +296,19 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
277static void __meminit 296static void __meminit
278phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) 297phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
279{ 298{
280 pmd_t *pmd = pmd_offset(pud,0); 299 pmd_t *pmd = pmd_offset(pud, 0);
281 spin_lock(&init_mm.page_table_lock); 300 spin_lock(&init_mm.page_table_lock);
282 phys_pmd_init(pmd, address, end); 301 phys_pmd_init(pmd, address, end);
283 spin_unlock(&init_mm.page_table_lock); 302 spin_unlock(&init_mm.page_table_lock);
284 __flush_tlb_all(); 303 __flush_tlb_all();
285} 304}
286 305
287static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) 306static void __meminit
288{ 307phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
308{
289 int i = pud_index(addr); 309 int i = pud_index(addr);
290 310
291 311 for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
292 for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) {
293 unsigned long pmd_phys; 312 unsigned long pmd_phys;
294 pud_t *pud = pud_page + pud_index(addr); 313 pud_t *pud = pud_page + pud_index(addr);
295 pmd_t *pmd; 314 pmd_t *pmd;
@@ -297,10 +316,11 @@ static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigne
297 if (addr >= end) 316 if (addr >= end)
298 break; 317 break;
299 318
300 if (!after_bootmem && !e820_any_mapped(addr,addr+PUD_SIZE,0)) { 319 if (!after_bootmem &&
301 set_pud(pud, __pud(0)); 320 !e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
321 set_pud(pud, __pud(0));
302 continue; 322 continue;
303 } 323 }
304 324
305 if (pud_val(*pud)) { 325 if (pud_val(*pud)) {
306 phys_pmd_update(pud, addr, end); 326 phys_pmd_update(pud, addr, end);
@@ -308,14 +328,16 @@ static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigne
308 } 328 }
309 329
310 pmd = alloc_low_page(&pmd_phys); 330 pmd = alloc_low_page(&pmd_phys);
331
311 spin_lock(&init_mm.page_table_lock); 332 spin_lock(&init_mm.page_table_lock);
312 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); 333 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
313 phys_pmd_init(pmd, addr, end); 334 phys_pmd_init(pmd, addr, end);
314 spin_unlock(&init_mm.page_table_lock); 335 spin_unlock(&init_mm.page_table_lock);
336
315 unmap_low_page(pmd); 337 unmap_low_page(pmd);
316 } 338 }
317 __flush_tlb(); 339 __flush_tlb_all();
318} 340}
319 341
320static void __init find_early_table_space(unsigned long end) 342static void __init find_early_table_space(unsigned long end)
321{ 343{
@@ -326,14 +348,23 @@ static void __init find_early_table_space(unsigned long end)
326 tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) + 348 tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
327 round_up(pmds * sizeof(pmd_t), PAGE_SIZE); 349 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
328 350
329 /* RED-PEN putting page tables only on node 0 could 351 /*
330 cause a hotspot and fill up ZONE_DMA. The page tables 352 * RED-PEN putting page tables only on node 0 could
331 need roughly 0.5KB per GB. */ 353 * cause a hotspot and fill up ZONE_DMA. The page tables
332 start = 0x8000; 354 * need roughly 0.5KB per GB.
333 table_start = find_e820_area(start, end, tables); 355 */
356 start = 0x8000;
357 table_start = find_e820_area(start, end, tables);
334 if (table_start == -1UL) 358 if (table_start == -1UL)
335 panic("Cannot find space for the kernel page tables"); 359 panic("Cannot find space for the kernel page tables");
336 360
361 /*
362 * When you have a lot of RAM like 256GB, early_table will not fit
363 * into 0x8000 range, find_e820_area() will find area after kernel
364 * bss but the table_start is not page aligned, so need to round it
365 * up to avoid overlap with bss:
366 */
367 table_start = round_up(table_start, PAGE_SIZE);
337 table_start >>= PAGE_SHIFT; 368 table_start >>= PAGE_SHIFT;
338 table_end = table_start; 369 table_end = table_start;
339 370
@@ -342,20 +373,23 @@ static void __init find_early_table_space(unsigned long end)
342 (table_start << PAGE_SHIFT) + tables); 373 (table_start << PAGE_SHIFT) + tables);
343} 374}
344 375
345/* Setup the direct mapping of the physical memory at PAGE_OFFSET. 376/*
346 This runs before bootmem is initialized and gets pages directly from the 377 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
347 physical memory. To access them they are temporarily mapped. */ 378 * This runs before bootmem is initialized and gets pages directly from
379 * the physical memory. To access them they are temporarily mapped.
380 */
348void __init_refok init_memory_mapping(unsigned long start, unsigned long end) 381void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
349{ 382{
350 unsigned long next; 383 unsigned long next;
351 384
352 Dprintk("init_memory_mapping\n"); 385 pr_debug("init_memory_mapping\n");
353 386
354 /* 387 /*
355 * Find space for the kernel direct mapping tables. 388 * Find space for the kernel direct mapping tables.
356 * Later we should allocate these tables in the local node of the memory 389 *
357 * mapped. Unfortunately this is done currently before the nodes are 390 * Later we should allocate these tables in the local node of the
358 * discovered. 391 * memory mapped. Unfortunately this is done currently before the
392 * nodes are discovered.
359 */ 393 */
360 if (!after_bootmem) 394 if (!after_bootmem)
361 find_early_table_space(end); 395 find_early_table_space(end);
@@ -364,8 +398,8 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
364 end = (unsigned long)__va(end); 398 end = (unsigned long)__va(end);
365 399
366 for (; start < end; start = next) { 400 for (; start < end; start = next) {
367 unsigned long pud_phys;
368 pgd_t *pgd = pgd_offset_k(start); 401 pgd_t *pgd = pgd_offset_k(start);
402 unsigned long pud_phys;
369 pud_t *pud; 403 pud_t *pud;
370 404
371 if (after_bootmem) 405 if (after_bootmem)
@@ -374,23 +408,26 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
374 pud = alloc_low_page(&pud_phys); 408 pud = alloc_low_page(&pud_phys);
375 409
376 next = start + PGDIR_SIZE; 410 next = start + PGDIR_SIZE;
377 if (next > end) 411 if (next > end)
378 next = end; 412 next = end;
379 phys_pud_init(pud, __pa(start), __pa(next)); 413 phys_pud_init(pud, __pa(start), __pa(next));
380 if (!after_bootmem) 414 if (!after_bootmem)
381 set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); 415 set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
382 unmap_low_page(pud); 416 unmap_low_page(pud);
383 } 417 }
384 418
385 if (!after_bootmem) 419 if (!after_bootmem)
386 mmu_cr4_features = read_cr4(); 420 mmu_cr4_features = read_cr4();
387 __flush_tlb_all(); 421 __flush_tlb_all();
422
423 reserve_early(table_start << PAGE_SHIFT, table_end << PAGE_SHIFT);
388} 424}
389 425
390#ifndef CONFIG_NUMA 426#ifndef CONFIG_NUMA
391void __init paging_init(void) 427void __init paging_init(void)
392{ 428{
393 unsigned long max_zone_pfns[MAX_NR_ZONES]; 429 unsigned long max_zone_pfns[MAX_NR_ZONES];
430
394 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 431 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
395 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; 432 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
396 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 433 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
@@ -402,39 +439,48 @@ void __init paging_init(void)
402} 439}
403#endif 440#endif
404 441
405/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches 442/*
406 from the CPU leading to inconsistent cache lines. address and size 443 * Unmap a kernel mapping if it exists. This is useful to avoid
407 must be aligned to 2MB boundaries. 444 * prefetches from the CPU leading to inconsistent cache lines.
408 Does nothing when the mapping doesn't exist. */ 445 * address and size must be aligned to 2MB boundaries.
409void __init clear_kernel_mapping(unsigned long address, unsigned long size) 446 * Does nothing when the mapping doesn't exist.
447 */
448void __init clear_kernel_mapping(unsigned long address, unsigned long size)
410{ 449{
411 unsigned long end = address + size; 450 unsigned long end = address + size;
412 451
413 BUG_ON(address & ~LARGE_PAGE_MASK); 452 BUG_ON(address & ~LARGE_PAGE_MASK);
414 BUG_ON(size & ~LARGE_PAGE_MASK); 453 BUG_ON(size & ~LARGE_PAGE_MASK);
415 454
416 for (; address < end; address += LARGE_PAGE_SIZE) { 455 for (; address < end; address += LARGE_PAGE_SIZE) {
417 pgd_t *pgd = pgd_offset_k(address); 456 pgd_t *pgd = pgd_offset_k(address);
418 pud_t *pud; 457 pud_t *pud;
419 pmd_t *pmd; 458 pmd_t *pmd;
459
420 if (pgd_none(*pgd)) 460 if (pgd_none(*pgd))
421 continue; 461 continue;
462
422 pud = pud_offset(pgd, address); 463 pud = pud_offset(pgd, address);
423 if (pud_none(*pud)) 464 if (pud_none(*pud))
424 continue; 465 continue;
466
425 pmd = pmd_offset(pud, address); 467 pmd = pmd_offset(pud, address);
426 if (!pmd || pmd_none(*pmd)) 468 if (!pmd || pmd_none(*pmd))
427 continue; 469 continue;
428 if (0 == (pmd_val(*pmd) & _PAGE_PSE)) { 470
429 /* Could handle this, but it should not happen currently. */ 471 if (!(pmd_val(*pmd) & _PAGE_PSE)) {
430 printk(KERN_ERR 472 /*
431 "clear_kernel_mapping: mapping has been split. will leak memory\n"); 473 * Could handle this, but it should not happen
432 pmd_ERROR(*pmd); 474 * currently:
475 */
476 printk(KERN_ERR "clear_kernel_mapping: "
477 "mapping has been split. will leak memory\n");
478 pmd_ERROR(*pmd);
433 } 479 }
434 set_pmd(pmd, __pmd(0)); 480 set_pmd(pmd, __pmd(0));
435 } 481 }
436 __flush_tlb_all(); 482 __flush_tlb_all();
437} 483}
438 484
439/* 485/*
440 * Memory hotplug specific functions 486 * Memory hotplug specific functions
@@ -461,16 +507,12 @@ int arch_add_memory(int nid, u64 start, u64 size)
461 unsigned long nr_pages = size >> PAGE_SHIFT; 507 unsigned long nr_pages = size >> PAGE_SHIFT;
462 int ret; 508 int ret;
463 509
464 init_memory_mapping(start, (start + size -1)); 510 init_memory_mapping(start, start + size-1);
465 511
466 ret = __add_pages(zone, start_pfn, nr_pages); 512 ret = __add_pages(zone, start_pfn, nr_pages);
467 if (ret) 513 WARN_ON(1);
468 goto error;
469 514
470 return ret; 515 return ret;
471error:
472 printk("%s: Problem encountered in __add_pages!\n", __func__);
473 return ret;
474} 516}
475EXPORT_SYMBOL_GPL(arch_add_memory); 517EXPORT_SYMBOL_GPL(arch_add_memory);
476 518
@@ -484,36 +526,8 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
484 526
485#endif /* CONFIG_MEMORY_HOTPLUG */ 527#endif /* CONFIG_MEMORY_HOTPLUG */
486 528
487#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE 529static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
488/* 530 kcore_modules, kcore_vsyscall;
489 * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
490 * just online the pages.
491 */
492int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
493{
494 int err = -EIO;
495 unsigned long pfn;
496 unsigned long total = 0, mem = 0;
497 for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
498 if (pfn_valid(pfn)) {
499 online_page(pfn_to_page(pfn));
500 err = 0;
501 mem++;
502 }
503 total++;
504 }
505 if (!err) {
506 z->spanned_pages += total;
507 z->present_pages += mem;
508 z->zone_pgdat->node_spanned_pages += total;
509 z->zone_pgdat->node_present_pages += mem;
510 }
511 return err;
512}
513#endif
514
515static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
516 kcore_vsyscall;
517 531
518void __init mem_init(void) 532void __init mem_init(void)
519{ 533{
@@ -521,8 +535,15 @@ void __init mem_init(void)
521 535
522 pci_iommu_alloc(); 536 pci_iommu_alloc();
523 537
524 /* clear the zero-page */ 538 /* clear_bss() already clear the empty_zero_page */
525 memset(empty_zero_page, 0, PAGE_SIZE); 539
540 /* temporary debugging - double check it's true: */
541 {
542 int i;
543
544 for (i = 0; i < 1024; i++)
545 WARN_ON_ONCE(empty_zero_page[i]);
546 }
526 547
527 reservedpages = 0; 548 reservedpages = 0;
528 549
@@ -534,7 +555,6 @@ void __init mem_init(void)
534#endif 555#endif
535 reservedpages = end_pfn - totalram_pages - 556 reservedpages = end_pfn - totalram_pages -
536 absent_pages_in_range(0, end_pfn); 557 absent_pages_in_range(0, end_pfn);
537
538 after_bootmem = 1; 558 after_bootmem = 1;
539 559
540 codesize = (unsigned long) &_etext - (unsigned long) &_text; 560 codesize = (unsigned long) &_etext - (unsigned long) &_text;
@@ -542,15 +562,16 @@ void __init mem_init(void)
542 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; 562 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
543 563
544 /* Register memory areas for /proc/kcore */ 564 /* Register memory areas for /proc/kcore */
545 kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); 565 kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
546 kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, 566 kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
547 VMALLOC_END-VMALLOC_START); 567 VMALLOC_END-VMALLOC_START);
548 kclist_add(&kcore_kernel, &_stext, _end - _stext); 568 kclist_add(&kcore_kernel, &_stext, _end - _stext);
549 kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN); 569 kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
550 kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, 570 kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
551 VSYSCALL_END - VSYSCALL_START); 571 VSYSCALL_END - VSYSCALL_START);
552 572
553 printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n", 573 printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
574 "%ldk reserved, %ldk data, %ldk init)\n",
554 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), 575 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
555 end_pfn << (PAGE_SHIFT-10), 576 end_pfn << (PAGE_SHIFT-10),
556 codesize >> 10, 577 codesize >> 10,
@@ -566,19 +587,27 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
566 if (begin >= end) 587 if (begin >= end)
567 return; 588 return;
568 589
590 /*
591 * If debugging page accesses then do not free this memory but
592 * mark them not present - any buggy init-section access will
593 * create a kernel page fault:
594 */
595#ifdef CONFIG_DEBUG_PAGEALLOC
596 printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
597 begin, PAGE_ALIGN(end));
598 set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
599#else
569 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); 600 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
601
570 for (addr = begin; addr < end; addr += PAGE_SIZE) { 602 for (addr = begin; addr < end; addr += PAGE_SIZE) {
571 ClearPageReserved(virt_to_page(addr)); 603 ClearPageReserved(virt_to_page(addr));
572 init_page_count(virt_to_page(addr)); 604 init_page_count(virt_to_page(addr));
573 memset((void *)(addr & ~(PAGE_SIZE-1)), 605 memset((void *)(addr & ~(PAGE_SIZE-1)),
574 POISON_FREE_INITMEM, PAGE_SIZE); 606 POISON_FREE_INITMEM, PAGE_SIZE);
575 if (addr >= __START_KERNEL_map)
576 change_page_attr_addr(addr, 1, __pgprot(0));
577 free_page(addr); 607 free_page(addr);
578 totalram_pages++; 608 totalram_pages++;
579 } 609 }
580 if (addr > __START_KERNEL_map) 610#endif
581 global_flush_tlb();
582} 611}
583 612
584void free_initmem(void) 613void free_initmem(void)
@@ -589,6 +618,8 @@ void free_initmem(void)
589} 618}
590 619
591#ifdef CONFIG_DEBUG_RODATA 620#ifdef CONFIG_DEBUG_RODATA
621const int rodata_test_data = 0xC3;
622EXPORT_SYMBOL_GPL(rodata_test_data);
592 623
593void mark_rodata_ro(void) 624void mark_rodata_ro(void)
594{ 625{
@@ -603,25 +634,27 @@ void mark_rodata_ro(void)
603#ifdef CONFIG_KPROBES 634#ifdef CONFIG_KPROBES
604 start = (unsigned long)__start_rodata; 635 start = (unsigned long)__start_rodata;
605#endif 636#endif
606 637
607 end = (unsigned long)__end_rodata; 638 end = (unsigned long)__end_rodata;
608 start = (start + PAGE_SIZE - 1) & PAGE_MASK; 639 start = (start + PAGE_SIZE - 1) & PAGE_MASK;
609 end &= PAGE_MASK; 640 end &= PAGE_MASK;
610 if (end <= start) 641 if (end <= start)
611 return; 642 return;
612 643
613 change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO); 644 set_memory_ro(start, (end - start) >> PAGE_SHIFT);
614 645
615 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", 646 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
616 (end - start) >> 10); 647 (end - start) >> 10);
617 648
618 /* 649 rodata_test();
619 * change_page_attr_addr() requires a global_flush_tlb() call after it. 650
620 * We do this after the printk so that if something went wrong in the 651#ifdef CONFIG_CPA_DEBUG
621 * change, the printk gets out at least to give a better debug hint 652 printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
622 * of who is the culprit. 653 set_memory_rw(start, (end-start) >> PAGE_SHIFT);
623 */ 654
624 global_flush_tlb(); 655 printk(KERN_INFO "Testing CPA: again\n");
656 set_memory_ro(start, (end-start) >> PAGE_SHIFT);
657#endif
625} 658}
626#endif 659#endif
627 660
@@ -632,17 +665,21 @@ void free_initrd_mem(unsigned long start, unsigned long end)
632} 665}
633#endif 666#endif
634 667
635void __init reserve_bootmem_generic(unsigned long phys, unsigned len) 668void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
636{ 669{
637#ifdef CONFIG_NUMA 670#ifdef CONFIG_NUMA
638 int nid = phys_to_nid(phys); 671 int nid = phys_to_nid(phys);
639#endif 672#endif
640 unsigned long pfn = phys >> PAGE_SHIFT; 673 unsigned long pfn = phys >> PAGE_SHIFT;
674
641 if (pfn >= end_pfn) { 675 if (pfn >= end_pfn) {
642 /* This can happen with kdump kernels when accessing firmware 676 /*
643 tables. */ 677 * This can happen with kdump kernels when accessing
678 * firmware tables:
679 */
644 if (pfn < end_pfn_map) 680 if (pfn < end_pfn_map)
645 return; 681 return;
682
646 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", 683 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
647 phys, len); 684 phys, len);
648 return; 685 return;
@@ -650,9 +687,9 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
650 687
651 /* Should check here against the e820 map to avoid double free */ 688 /* Should check here against the e820 map to avoid double free */
652#ifdef CONFIG_NUMA 689#ifdef CONFIG_NUMA
653 reserve_bootmem_node(NODE_DATA(nid), phys, len); 690 reserve_bootmem_node(NODE_DATA(nid), phys, len);
654#else 691#else
655 reserve_bootmem(phys, len); 692 reserve_bootmem(phys, len);
656#endif 693#endif
657 if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { 694 if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
658 dma_reserve += len / PAGE_SIZE; 695 dma_reserve += len / PAGE_SIZE;
@@ -660,46 +697,49 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
660 } 697 }
661} 698}
662 699
663int kern_addr_valid(unsigned long addr) 700int kern_addr_valid(unsigned long addr)
664{ 701{
665 unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; 702 unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
666 pgd_t *pgd; 703 pgd_t *pgd;
667 pud_t *pud; 704 pud_t *pud;
668 pmd_t *pmd; 705 pmd_t *pmd;
669 pte_t *pte; 706 pte_t *pte;
670 707
671 if (above != 0 && above != -1UL) 708 if (above != 0 && above != -1UL)
672 return 0; 709 return 0;
673 710
674 pgd = pgd_offset_k(addr); 711 pgd = pgd_offset_k(addr);
675 if (pgd_none(*pgd)) 712 if (pgd_none(*pgd))
676 return 0; 713 return 0;
677 714
678 pud = pud_offset(pgd, addr); 715 pud = pud_offset(pgd, addr);
679 if (pud_none(*pud)) 716 if (pud_none(*pud))
680 return 0; 717 return 0;
681 718
682 pmd = pmd_offset(pud, addr); 719 pmd = pmd_offset(pud, addr);
683 if (pmd_none(*pmd)) 720 if (pmd_none(*pmd))
684 return 0; 721 return 0;
722
685 if (pmd_large(*pmd)) 723 if (pmd_large(*pmd))
686 return pfn_valid(pmd_pfn(*pmd)); 724 return pfn_valid(pmd_pfn(*pmd));
687 725
688 pte = pte_offset_kernel(pmd, addr); 726 pte = pte_offset_kernel(pmd, addr);
689 if (pte_none(*pte)) 727 if (pte_none(*pte))
690 return 0; 728 return 0;
729
691 return pfn_valid(pte_pfn(*pte)); 730 return pfn_valid(pte_pfn(*pte));
692} 731}
693 732
694/* A pseudo VMA to allow ptrace access for the vsyscall page. This only 733/*
695 covers the 64bit vsyscall page now. 32bit has a real VMA now and does 734 * A pseudo VMA to allow ptrace access for the vsyscall page. This only
696 not need special handling anymore. */ 735 * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
697 736 * not need special handling anymore:
737 */
698static struct vm_area_struct gate_vma = { 738static struct vm_area_struct gate_vma = {
699 .vm_start = VSYSCALL_START, 739 .vm_start = VSYSCALL_START,
700 .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT), 740 .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
701 .vm_page_prot = PAGE_READONLY_EXEC, 741 .vm_page_prot = PAGE_READONLY_EXEC,
702 .vm_flags = VM_READ | VM_EXEC 742 .vm_flags = VM_READ | VM_EXEC
703}; 743};
704 744
705struct vm_area_struct *get_gate_vma(struct task_struct *tsk) 745struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
@@ -714,14 +754,17 @@ struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
714int in_gate_area(struct task_struct *task, unsigned long addr) 754int in_gate_area(struct task_struct *task, unsigned long addr)
715{ 755{
716 struct vm_area_struct *vma = get_gate_vma(task); 756 struct vm_area_struct *vma = get_gate_vma(task);
757
717 if (!vma) 758 if (!vma)
718 return 0; 759 return 0;
760
719 return (addr >= vma->vm_start) && (addr < vma->vm_end); 761 return (addr >= vma->vm_start) && (addr < vma->vm_end);
720} 762}
721 763
722/* Use this when you have no reliable task/vma, typically from interrupt 764/*
723 * context. It is less reliable than using the task's vma and may give 765 * Use this when you have no reliable task/vma, typically from interrupt
724 * false positives. 766 * context. It is less reliable than using the task's vma and may give
767 * false positives:
725 */ 768 */
726int in_gate_area_no_task(unsigned long addr) 769int in_gate_area_no_task(unsigned long addr)
727{ 770{
@@ -741,8 +784,8 @@ const char *arch_vma_name(struct vm_area_struct *vma)
741/* 784/*
742 * Initialise the sparsemem vmemmap using huge-pages at the PMD level. 785 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
743 */ 786 */
744int __meminit vmemmap_populate(struct page *start_page, 787int __meminit
745 unsigned long size, int node) 788vmemmap_populate(struct page *start_page, unsigned long size, int node)
746{ 789{
747 unsigned long addr = (unsigned long)start_page; 790 unsigned long addr = (unsigned long)start_page;
748 unsigned long end = (unsigned long)(start_page + size); 791 unsigned long end = (unsigned long)(start_page + size);
@@ -757,6 +800,7 @@ int __meminit vmemmap_populate(struct page *start_page,
757 pgd = vmemmap_pgd_populate(addr, node); 800 pgd = vmemmap_pgd_populate(addr, node);
758 if (!pgd) 801 if (!pgd)
759 return -ENOMEM; 802 return -ENOMEM;
803
760 pud = vmemmap_pud_populate(pgd, addr, node); 804 pud = vmemmap_pud_populate(pgd, addr, node);
761 if (!pud) 805 if (!pud)
762 return -ENOMEM; 806 return -ENOMEM;
@@ -764,20 +808,22 @@ int __meminit vmemmap_populate(struct page *start_page,
764 pmd = pmd_offset(pud, addr); 808 pmd = pmd_offset(pud, addr);
765 if (pmd_none(*pmd)) { 809 if (pmd_none(*pmd)) {
766 pte_t entry; 810 pte_t entry;
767 void *p = vmemmap_alloc_block(PMD_SIZE, node); 811 void *p;
812
813 p = vmemmap_alloc_block(PMD_SIZE, node);
768 if (!p) 814 if (!p)
769 return -ENOMEM; 815 return -ENOMEM;
770 816
771 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); 817 entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
772 mk_pte_huge(entry); 818 PAGE_KERNEL_LARGE);
773 set_pmd(pmd, __pmd(pte_val(entry))); 819 set_pmd(pmd, __pmd(pte_val(entry)));
774 820
775 printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n", 821 printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
776 addr, addr + PMD_SIZE - 1, p, node); 822 addr, addr + PMD_SIZE - 1, p, node);
777 } else 823 } else {
778 vmemmap_verify((pte_t *)pmd, node, addr, next); 824 vmemmap_verify((pte_t *)pmd, node, addr, next);
825 }
779 } 826 }
780
781 return 0; 827 return 0;
782} 828}
783#endif 829#endif
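On the 64-bit side the same conversion uses the address-based interface: change_page_attr_addr() becomes set_memory_ro()/set_memory_rw(), Dprintk() becomes pr_debug(), and mark_rodata_ro() gains a rodata_test() self-check. A hedged sketch of the core of that rodata protection, using only symbols visible in the hunk (section markers from <asm/sections.h>, CPA helpers from <asm/cacheflush.h>); again an illustration, not code from the patch:

	#include <linux/init.h>
	#include <linux/mm.h>
	#include <asm/cacheflush.h>
	#include <asm/sections.h>

	static void __init protect_rodata_sketch(void)
	{
		unsigned long start = PAGE_ALIGN((unsigned long)__start_rodata);
		unsigned long end   = (unsigned long)__end_rodata & PAGE_MASK;

		if (end <= start)
			return;

		/* The CPA API works on page counts, not byte lengths. */
		set_memory_ro(start, (end - start) >> PAGE_SHIFT);
	}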
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
new file mode 100644
index 000000000000..ed795721ca8e
--- /dev/null
+++ b/arch/x86/mm/ioremap.c
@@ -0,0 +1,501 @@
1/*
2 * Re-map IO memory to kernel address space so that we can access it.
3 * This is needed for high PCI addresses that aren't mapped in the
4 * 640k-1MB IO memory area on PC's
5 *
6 * (C) Copyright 1995 1996 Linus Torvalds
7 */
8
9#include <linux/bootmem.h>
10#include <linux/init.h>
11#include <linux/io.h>
12#include <linux/module.h>
13#include <linux/slab.h>
14#include <linux/vmalloc.h>
15
16#include <asm/cacheflush.h>
17#include <asm/e820.h>
18#include <asm/fixmap.h>
19#include <asm/pgtable.h>
20#include <asm/tlbflush.h>
21#include <asm/pgalloc.h>
22
23enum ioremap_mode {
24 IOR_MODE_UNCACHED,
25 IOR_MODE_CACHED,
26};
27
28#ifdef CONFIG_X86_64
29
30unsigned long __phys_addr(unsigned long x)
31{
32 if (x >= __START_KERNEL_map)
33 return x - __START_KERNEL_map + phys_base;
34 return x - PAGE_OFFSET;
35}
36EXPORT_SYMBOL(__phys_addr);
37
38#endif
39
40int page_is_ram(unsigned long pagenr)
41{
42 unsigned long addr, end;
43 int i;
44
45 for (i = 0; i < e820.nr_map; i++) {
46 /*
47 * Not usable memory:
48 */
49 if (e820.map[i].type != E820_RAM)
50 continue;
51 addr = (e820.map[i].addr + PAGE_SIZE-1) >> PAGE_SHIFT;
52 end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT;
53
54 /*
55 * Sanity check: Some BIOSen report areas as RAM that
56 * are not. Notably the 640->1Mb area, which is the
57 * PCI BIOS area.
58 */
59 if (addr >= (BIOS_BEGIN >> PAGE_SHIFT) &&
60 end < (BIOS_END >> PAGE_SHIFT))
61 continue;
62
63 if ((pagenr >= addr) && (pagenr < end))
64 return 1;
65 }
66 return 0;
67}
68
69/*
70 * Fix up the linear direct mapping of the kernel to avoid cache attribute
71 * conflicts.
72 */
73static int ioremap_change_attr(unsigned long paddr, unsigned long size,
74 enum ioremap_mode mode)
75{
76 unsigned long vaddr = (unsigned long)__va(paddr);
77 unsigned long nrpages = size >> PAGE_SHIFT;
78 int err, level;
79
80 /* No change for pages after the last mapping */
81 if ((paddr + size - 1) >= (max_pfn_mapped << PAGE_SHIFT))
82 return 0;
83
84 /*
85 * If there is no identity map for this address,
86 * change_page_attr_addr is unnecessary
87 */
88 if (!lookup_address(vaddr, &level))
89 return 0;
90
91 switch (mode) {
92 case IOR_MODE_UNCACHED:
93 default:
94 err = set_memory_uc(vaddr, nrpages);
95 break;
96 case IOR_MODE_CACHED:
97 err = set_memory_wb(vaddr, nrpages);
98 break;
99 }
100
101 return err;
102}
103
104/*
105 * Remap an arbitrary physical address space into the kernel virtual
106 * address space. Needed when the kernel wants to access high addresses
107 * directly.
108 *
109 * NOTE! We need to allow non-page-aligned mappings too: we will obviously
110 * have to convert them into an offset in a page-aligned mapping, but the
111 * caller shouldn't need to know that small detail.
112 */
113static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size,
114 enum ioremap_mode mode)
115{
116 void __iomem *addr;
117 struct vm_struct *area;
118 unsigned long offset, last_addr;
119 pgprot_t prot;
120
121 /* Don't allow wraparound or zero size */
122 last_addr = phys_addr + size - 1;
123 if (!size || last_addr < phys_addr)
124 return NULL;
125
126 /*
127 * Don't remap the low PCI/ISA area, it's always mapped..
128 */
129 if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
130 return (__force void __iomem *)phys_to_virt(phys_addr);
131
132 /*
133 * Don't allow anybody to remap normal RAM that we're using..
134 */
135 for (offset = phys_addr >> PAGE_SHIFT; offset < max_pfn_mapped &&
136 (offset << PAGE_SHIFT) < last_addr; offset++) {
137 if (page_is_ram(offset))
138 return NULL;
139 }
140
141 switch (mode) {
142 case IOR_MODE_UNCACHED:
143 default:
144 prot = PAGE_KERNEL_NOCACHE;
145 break;
146 case IOR_MODE_CACHED:
147 prot = PAGE_KERNEL;
148 break;
149 }
150
151 /*
152 * Mappings have to be page-aligned
153 */
154 offset = phys_addr & ~PAGE_MASK;
155 phys_addr &= PAGE_MASK;
156 size = PAGE_ALIGN(last_addr+1) - phys_addr;
157
158 /*
159 * Ok, go for it..
160 */
161 area = get_vm_area(size, VM_IOREMAP);
162 if (!area)
163 return NULL;
164 area->phys_addr = phys_addr;
165 addr = (void __iomem *) area->addr;
166 if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
167 phys_addr, prot)) {
168 remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
169 return NULL;
170 }
171
172 if (ioremap_change_attr(phys_addr, size, mode) < 0) {
173 vunmap(addr);
174 return NULL;
175 }
176
177 return (void __iomem *) (offset + (char __iomem *)addr);
178}
179
180/**
181 * ioremap_nocache - map bus memory into CPU space
182 * @offset: bus address of the memory
183 * @size: size of the resource to map
184 *
185 * ioremap_nocache performs a platform specific sequence of operations to
186 * make bus memory CPU accessible via the readb/readw/readl/writeb/
187 * writew/writel functions and the other mmio helpers. The returned
188 * address is not guaranteed to be usable directly as a virtual
189 * address.
190 *
191 * This version of ioremap ensures that the memory is marked uncachable
192 * on the CPU as well as honouring existing caching rules from things like
193 * the PCI bus. Note that there are other caches and buffers on many
194 * busses. In particular driver authors should read up on PCI writes
195 *
196 * It's useful if some control registers are in such an area and
197 * write combining or read caching is not desirable:
198 *
199 * Must be freed with iounmap.
200 */
201void __iomem *ioremap_nocache(unsigned long phys_addr, unsigned long size)
202{
203 return __ioremap(phys_addr, size, IOR_MODE_UNCACHED);
204}
205EXPORT_SYMBOL(ioremap_nocache);
206
207void __iomem *ioremap_cache(unsigned long phys_addr, unsigned long size)
208{
209 return __ioremap(phys_addr, size, IOR_MODE_CACHED);
210}
211EXPORT_SYMBOL(ioremap_cache);
212
213/**
214 * iounmap - Free a IO remapping
215 * @addr: virtual address from ioremap_*
216 *
217 * Caller must ensure there is only one unmapping for the same pointer.
218 */
219void iounmap(volatile void __iomem *addr)
220{
221 struct vm_struct *p, *o;
222
223 if ((void __force *)addr <= high_memory)
224 return;
225
226 /*
227 * __ioremap special-cases the PCI/ISA range by not instantiating a
228 * vm_area and by simply returning an address into the kernel mapping
229 * of ISA space. So handle that here.
230 */
231 if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
232 addr < phys_to_virt(ISA_END_ADDRESS))
233 return;
234
235 addr = (volatile void __iomem *)
236 (PAGE_MASK & (unsigned long __force)addr);
237
238 /* Use the vm area unlocked, assuming the caller
239 ensures there isn't another iounmap for the same address
240 in parallel. Reuse of the virtual address is prevented by
241 leaving it in the global lists until we're done with it.
242 cpa takes care of the direct mappings. */
243 read_lock(&vmlist_lock);
244 for (p = vmlist; p; p = p->next) {
245 if (p->addr == addr)
246 break;
247 }
248 read_unlock(&vmlist_lock);
249
250 if (!p) {
251 printk(KERN_ERR "iounmap: bad address %p\n", addr);
252 dump_stack();
253 return;
254 }
255
256 /* Reset the direct mapping. Can block */
257 ioremap_change_attr(p->phys_addr, p->size, IOR_MODE_CACHED);
258
259 /* Finally remove it */
260 o = remove_vm_area((void *)addr);
261 BUG_ON(p != o || o == NULL);
262 kfree(p);
263}
264EXPORT_SYMBOL(iounmap);
265
266#ifdef CONFIG_X86_32
267
268int __initdata early_ioremap_debug;
269
270static int __init early_ioremap_debug_setup(char *str)
271{
272 early_ioremap_debug = 1;
273
274 return 0;
275}
276early_param("early_ioremap_debug", early_ioremap_debug_setup);
277
278static __initdata int after_paging_init;
279static __initdata unsigned long bm_pte[1024]
280 __attribute__((aligned(PAGE_SIZE)));
281
282static inline unsigned long * __init early_ioremap_pgd(unsigned long addr)
283{
284 return (unsigned long *)swapper_pg_dir + ((addr >> 22) & 1023);
285}
286
287static inline unsigned long * __init early_ioremap_pte(unsigned long addr)
288{
289 return bm_pte + ((addr >> PAGE_SHIFT) & 1023);
290}
291
292void __init early_ioremap_init(void)
293{
294 unsigned long *pgd;
295
296 if (early_ioremap_debug)
297 printk(KERN_INFO "early_ioremap_init()\n");
298
299 pgd = early_ioremap_pgd(fix_to_virt(FIX_BTMAP_BEGIN));
300 *pgd = __pa(bm_pte) | _PAGE_TABLE;
301 memset(bm_pte, 0, sizeof(bm_pte));
302 /*
303 * The boot-ioremap range spans multiple pgds, for which
304 * we are not prepared:
305 */
306 if (pgd != early_ioremap_pgd(fix_to_virt(FIX_BTMAP_END))) {
307 WARN_ON(1);
308 printk(KERN_WARNING "pgd %p != %p\n",
309 pgd, early_ioremap_pgd(fix_to_virt(FIX_BTMAP_END)));
310 printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
311 fix_to_virt(FIX_BTMAP_BEGIN));
312 printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END): %08lx\n",
313 fix_to_virt(FIX_BTMAP_END));
314
315 printk(KERN_WARNING "FIX_BTMAP_END: %d\n", FIX_BTMAP_END);
316 printk(KERN_WARNING "FIX_BTMAP_BEGIN: %d\n",
317 FIX_BTMAP_BEGIN);
318 }
319}
320
321void __init early_ioremap_clear(void)
322{
323 unsigned long *pgd;
324
325 if (early_ioremap_debug)
326 printk(KERN_INFO "early_ioremap_clear()\n");
327
328 pgd = early_ioremap_pgd(fix_to_virt(FIX_BTMAP_BEGIN));
329 *pgd = 0;
330 paravirt_release_pt(__pa(pgd) >> PAGE_SHIFT);
331 __flush_tlb_all();
332}
333
334void __init early_ioremap_reset(void)
335{
336 enum fixed_addresses idx;
337 unsigned long *pte, phys, addr;
338
339 after_paging_init = 1;
340 for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) {
341 addr = fix_to_virt(idx);
342 pte = early_ioremap_pte(addr);
343		if (*pte & _PAGE_PRESENT) {
344 phys = *pte & PAGE_MASK;
345 set_fixmap(idx, phys);
346 }
347 }
348}
349
350static void __init __early_set_fixmap(enum fixed_addresses idx,
351 unsigned long phys, pgprot_t flags)
352{
353 unsigned long *pte, addr = __fix_to_virt(idx);
354
355 if (idx >= __end_of_fixed_addresses) {
356 BUG();
357 return;
358 }
359 pte = early_ioremap_pte(addr);
360 if (pgprot_val(flags))
361 *pte = (phys & PAGE_MASK) | pgprot_val(flags);
362 else
363 *pte = 0;
364 __flush_tlb_one(addr);
365}
366
367static inline void __init early_set_fixmap(enum fixed_addresses idx,
368 unsigned long phys)
369{
370 if (after_paging_init)
371 set_fixmap(idx, phys);
372 else
373 __early_set_fixmap(idx, phys, PAGE_KERNEL);
374}
375
376static inline void __init early_clear_fixmap(enum fixed_addresses idx)
377{
378 if (after_paging_init)
379 clear_fixmap(idx);
380 else
381 __early_set_fixmap(idx, 0, __pgprot(0));
382}
383
384
385int __initdata early_ioremap_nested;
386
387static int __init check_early_ioremap_leak(void)
388{
389 if (!early_ioremap_nested)
390 return 0;
391
392 printk(KERN_WARNING
393 "Debug warning: early ioremap leak of %d areas detected.\n",
394 early_ioremap_nested);
395 printk(KERN_WARNING
396 "please boot with early_ioremap_debug and report the dmesg.\n");
397 WARN_ON(1);
398
399 return 1;
400}
401late_initcall(check_early_ioremap_leak);
402
403void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
404{
405 unsigned long offset, last_addr;
406 unsigned int nrpages, nesting;
407 enum fixed_addresses idx0, idx;
408
409 WARN_ON(system_state != SYSTEM_BOOTING);
410
411 nesting = early_ioremap_nested;
412 if (early_ioremap_debug) {
413 printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ",
414 phys_addr, size, nesting);
415 dump_stack();
416 }
417
418 /* Don't allow wraparound or zero size */
419 last_addr = phys_addr + size - 1;
420 if (!size || last_addr < phys_addr) {
421 WARN_ON(1);
422 return NULL;
423 }
424
425 if (nesting >= FIX_BTMAPS_NESTING) {
426 WARN_ON(1);
427 return NULL;
428 }
429 early_ioremap_nested++;
430 /*
431 * Mappings have to be page-aligned
432 */
433 offset = phys_addr & ~PAGE_MASK;
434 phys_addr &= PAGE_MASK;
435 size = PAGE_ALIGN(last_addr) - phys_addr;
436
437 /*
438 * Mappings have to fit in the FIX_BTMAP area.
439 */
440 nrpages = size >> PAGE_SHIFT;
441 if (nrpages > NR_FIX_BTMAPS) {
442 WARN_ON(1);
443 return NULL;
444 }
445
446 /*
447 * Ok, go for it..
448 */
449 idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
450 idx = idx0;
451 while (nrpages > 0) {
452 early_set_fixmap(idx, phys_addr);
453 phys_addr += PAGE_SIZE;
454 --idx;
455 --nrpages;
456 }
457 if (early_ioremap_debug)
458 printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0));
459
460 return (void *) (offset + fix_to_virt(idx0));
461}
462
463void __init early_iounmap(void *addr, unsigned long size)
464{
465 unsigned long virt_addr;
466 unsigned long offset;
467 unsigned int nrpages;
468 enum fixed_addresses idx;
469 unsigned int nesting;
470
471 nesting = --early_ioremap_nested;
472 WARN_ON(nesting < 0);
473
474 if (early_ioremap_debug) {
475 printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
476 size, nesting);
477 dump_stack();
478 }
479
480 virt_addr = (unsigned long)addr;
481 if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) {
482 WARN_ON(1);
483 return;
484 }
485 offset = virt_addr & ~PAGE_MASK;
486 nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
487
488 idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
489 while (nrpages > 0) {
490 early_clear_fixmap(idx);
491 --idx;
492 --nrpages;
493 }
494}
495
496void __this_fixmap_does_not_exist(void)
497{
498 WARN_ON(1);
499}
500
501#endif /* CONFIG_X86_32 */
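The new arch/x86/mm/ioremap.c above unifies the 32-bit and 64-bit implementations behind ioremap_nocache(), ioremap_cache() and iounmap(). A short driver-style usage sketch; the physical base, length and register offset are made-up values, not anything this patch defines:

	#include <linux/init.h>
	#include <linux/io.h>

	#define DEMO_MMIO_PHYS	0xfeb00000UL	/* hypothetical device BAR */
	#define DEMO_MMIO_LEN	0x1000UL
	#define DEMO_REG_STATUS	0x04		/* hypothetical register offset */

	static int __init demo_mmio_sketch(void)
	{
		void __iomem *regs = ioremap_nocache(DEMO_MMIO_PHYS, DEMO_MMIO_LEN);
		u32 status;

		if (!regs)
			return -ENOMEM;

		status = readl(regs + DEMO_REG_STATUS);	/* uncached MMIO read */
		writel(status, regs + DEMO_REG_STATUS);	/* MMIO write back */

		iounmap(regs);				/* exactly one unmap per mapping */
		return 0;
	}

ioremap_cache() is the write-back variant for memory-like regions; both paths funnel through __ioremap() and ioremap_change_attr() above, so the kernel's direct mapping keeps consistent cache attributes.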
diff --git a/arch/x86/mm/ioremap_32.c b/arch/x86/mm/ioremap_32.c
deleted file mode 100644
index 0b278315d737..000000000000
--- a/arch/x86/mm/ioremap_32.c
+++ /dev/null
@@ -1,274 +0,0 @@
1/*
2 * arch/i386/mm/ioremap.c
3 *
4 * Re-map IO memory to kernel address space so that we can access it.
5 * This is needed for high PCI addresses that aren't mapped in the
6 * 640k-1MB IO memory area on PC's
7 *
8 * (C) Copyright 1995 1996 Linus Torvalds
9 */
10
11#include <linux/vmalloc.h>
12#include <linux/init.h>
13#include <linux/slab.h>
14#include <linux/module.h>
15#include <linux/io.h>
16#include <asm/fixmap.h>
17#include <asm/cacheflush.h>
18#include <asm/tlbflush.h>
19#include <asm/pgtable.h>
20
21#define ISA_START_ADDRESS 0xa0000
22#define ISA_END_ADDRESS 0x100000
23
24/*
25 * Generic mapping function (not visible outside):
26 */
27
28/*
29 * Remap an arbitrary physical address space into the kernel virtual
30 * address space. Needed when the kernel wants to access high addresses
31 * directly.
32 *
33 * NOTE! We need to allow non-page-aligned mappings too: we will obviously
34 * have to convert them into an offset in a page-aligned mapping, but the
35 * caller shouldn't need to know that small detail.
36 */
37void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
38{
39 void __iomem * addr;
40 struct vm_struct * area;
41 unsigned long offset, last_addr;
42 pgprot_t prot;
43
44 /* Don't allow wraparound or zero size */
45 last_addr = phys_addr + size - 1;
46 if (!size || last_addr < phys_addr)
47 return NULL;
48
49 /*
50 * Don't remap the low PCI/ISA area, it's always mapped..
51 */
52 if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
53 return (void __iomem *) phys_to_virt(phys_addr);
54
55 /*
56 * Don't allow anybody to remap normal RAM that we're using..
57 */
58 if (phys_addr <= virt_to_phys(high_memory - 1)) {
59 char *t_addr, *t_end;
60 struct page *page;
61
62 t_addr = __va(phys_addr);
63 t_end = t_addr + (size - 1);
64
65 for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
66 if(!PageReserved(page))
67 return NULL;
68 }
69
70 prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY
71 | _PAGE_ACCESSED | flags);
72
73 /*
74 * Mappings have to be page-aligned
75 */
76 offset = phys_addr & ~PAGE_MASK;
77 phys_addr &= PAGE_MASK;
78 size = PAGE_ALIGN(last_addr+1) - phys_addr;
79
80 /*
81 * Ok, go for it..
82 */
83 area = get_vm_area(size, VM_IOREMAP | (flags << 20));
84 if (!area)
85 return NULL;
86 area->phys_addr = phys_addr;
87 addr = (void __iomem *) area->addr;
88 if (ioremap_page_range((unsigned long) addr,
89 (unsigned long) addr + size, phys_addr, prot)) {
90 vunmap((void __force *) addr);
91 return NULL;
92 }
93 return (void __iomem *) (offset + (char __iomem *)addr);
94}
95EXPORT_SYMBOL(__ioremap);
96
97/**
98 * ioremap_nocache - map bus memory into CPU space
99 * @offset: bus address of the memory
100 * @size: size of the resource to map
101 *
102 * ioremap_nocache performs a platform specific sequence of operations to
103 * make bus memory CPU accessible via the readb/readw/readl/writeb/
104 * writew/writel functions and the other mmio helpers. The returned
105 * address is not guaranteed to be usable directly as a virtual
106 * address.
107 *
108 * This version of ioremap ensures that the memory is marked uncachable
109 * on the CPU as well as honouring existing caching rules from things like
110 * the PCI bus. Note that there are other caches and buffers on many
111 * busses. In particular driver authors should read up on PCI writes
112 *
113 * It's useful if some control registers are in such an area and
114 * write combining or read caching is not desirable:
115 *
116 * Must be freed with iounmap.
117 */
118
119void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
120{
121 unsigned long last_addr;
122 void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD);
123 if (!p)
124 return p;
125
126 /* Guaranteed to be > phys_addr, as per __ioremap() */
127 last_addr = phys_addr + size - 1;
128
129 if (last_addr < virt_to_phys(high_memory) - 1) {
130 struct page *ppage = virt_to_page(__va(phys_addr));
131 unsigned long npages;
132
133 phys_addr &= PAGE_MASK;
134
135 /* This might overflow and become zero.. */
136 last_addr = PAGE_ALIGN(last_addr);
137
138 /* .. but that's ok, because modulo-2**n arithmetic will make
139 * the page-aligned "last - first" come out right.
140 */
141 npages = (last_addr - phys_addr) >> PAGE_SHIFT;
142
143 if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) {
144 iounmap(p);
145 p = NULL;
146 }
147 global_flush_tlb();
148 }
149
150 return p;
151}
152EXPORT_SYMBOL(ioremap_nocache);
153
154/**
155 * iounmap - Free a IO remapping
156 * @addr: virtual address from ioremap_*
157 *
158 * Caller must ensure there is only one unmapping for the same pointer.
159 */
160void iounmap(volatile void __iomem *addr)
161{
162 struct vm_struct *p, *o;
163
164 if ((void __force *)addr <= high_memory)
165 return;
166
167 /*
168 * __ioremap special-cases the PCI/ISA range by not instantiating a
169 * vm_area and by simply returning an address into the kernel mapping
170 * of ISA space. So handle that here.
171 */
172 if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
173 addr < phys_to_virt(ISA_END_ADDRESS))
174 return;
175
176 addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
177
178 /* Use the vm area unlocked, assuming the caller
179 ensures there isn't another iounmap for the same address
180 in parallel. Reuse of the virtual address is prevented by
181 leaving it in the global lists until we're done with it.
182 cpa takes care of the direct mappings. */
183 read_lock(&vmlist_lock);
184 for (p = vmlist; p; p = p->next) {
185 if (p->addr == addr)
186 break;
187 }
188 read_unlock(&vmlist_lock);
189
190 if (!p) {
191 printk("iounmap: bad address %p\n", addr);
192 dump_stack();
193 return;
194 }
195
196 /* Reset the direct mapping. Can block */
197 if ((p->flags >> 20) && p->phys_addr < virt_to_phys(high_memory) - 1) {
198 change_page_attr(virt_to_page(__va(p->phys_addr)),
199 get_vm_area_size(p) >> PAGE_SHIFT,
200 PAGE_KERNEL);
201 global_flush_tlb();
202 }
203
204 /* Finally remove it */
205 o = remove_vm_area((void *)addr);
206 BUG_ON(p != o || o == NULL);
207 kfree(p);
208}
209EXPORT_SYMBOL(iounmap);
210
211void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
212{
213 unsigned long offset, last_addr;
214 unsigned int nrpages;
215 enum fixed_addresses idx;
216
217 /* Don't allow wraparound or zero size */
218 last_addr = phys_addr + size - 1;
219 if (!size || last_addr < phys_addr)
220 return NULL;
221
222 /*
223 * Don't remap the low PCI/ISA area, it's always mapped..
224 */
225 if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
226 return phys_to_virt(phys_addr);
227
228 /*
229 * Mappings have to be page-aligned
230 */
231 offset = phys_addr & ~PAGE_MASK;
232 phys_addr &= PAGE_MASK;
233 size = PAGE_ALIGN(last_addr) - phys_addr;
234
235 /*
236 * Mappings have to fit in the FIX_BTMAP area.
237 */
238 nrpages = size >> PAGE_SHIFT;
239 if (nrpages > NR_FIX_BTMAPS)
240 return NULL;
241
242 /*
243 * Ok, go for it..
244 */
245 idx = FIX_BTMAP_BEGIN;
246 while (nrpages > 0) {
247 set_fixmap(idx, phys_addr);
248 phys_addr += PAGE_SIZE;
249 --idx;
250 --nrpages;
251 }
252 return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN));
253}
254
255void __init bt_iounmap(void *addr, unsigned long size)
256{
257 unsigned long virt_addr;
258 unsigned long offset;
259 unsigned int nrpages;
260 enum fixed_addresses idx;
261
262 virt_addr = (unsigned long)addr;
263 if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
264 return;
265 offset = virt_addr & ~PAGE_MASK;
266 nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
267
268 idx = FIX_BTMAP_BEGIN;
269 while (nrpages > 0) {
270 clear_fixmap(idx);
271 --idx;
272 --nrpages;
273 }
274}
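The bt_ioremap()/bt_iounmap() pair deleted here has its replacement in the new ioremap.c above: early_ioremap()/early_iounmap(), which add nesting and a late_initcall leak check. A sketch of the boot-time pattern, assuming the prototypes are visible from the arch io headers and that the firmware table address is handed in from elsewhere (the parsing step is only a placeholder):

	#include <linux/init.h>
	#include <asm/io.h>	/* assumed home of the early_ioremap() prototypes in this tree */

	static void __init peek_firmware_table_sketch(unsigned long table_phys)
	{
		void *map = early_ioremap(table_phys, PAGE_SIZE);

		if (!map)
			return;

		/* ... parse the table through "map" here ... */

		early_iounmap(map, PAGE_SIZE);	/* must pair up, or check_early_ioremap_leak() warns */
	}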
diff --git a/arch/x86/mm/ioremap_64.c b/arch/x86/mm/ioremap_64.c
deleted file mode 100644
index 6cac90aa5032..000000000000
--- a/arch/x86/mm/ioremap_64.c
+++ /dev/null
@@ -1,210 +0,0 @@
1/*
2 * arch/x86_64/mm/ioremap.c
3 *
4 * Re-map IO memory to kernel address space so that we can access it.
5 * This is needed for high PCI addresses that aren't mapped in the
6 * 640k-1MB IO memory area on PC's
7 *
8 * (C) Copyright 1995 1996 Linus Torvalds
9 */
10
11#include <linux/vmalloc.h>
12#include <linux/init.h>
13#include <linux/slab.h>
14#include <linux/module.h>
15#include <linux/io.h>
16
17#include <asm/pgalloc.h>
18#include <asm/fixmap.h>
19#include <asm/tlbflush.h>
20#include <asm/cacheflush.h>
21#include <asm/proto.h>
22
23unsigned long __phys_addr(unsigned long x)
24{
25 if (x >= __START_KERNEL_map)
26 return x - __START_KERNEL_map + phys_base;
27 return x - PAGE_OFFSET;
28}
29EXPORT_SYMBOL(__phys_addr);
30
31#define ISA_START_ADDRESS 0xa0000
32#define ISA_END_ADDRESS 0x100000
33
34/*
35 * Fix up the linear direct mapping of the kernel to avoid cache attribute
36 * conflicts.
37 */
38static int
39ioremap_change_attr(unsigned long phys_addr, unsigned long size,
40 unsigned long flags)
41{
42 int err = 0;
43 if (phys_addr + size - 1 < (end_pfn_map << PAGE_SHIFT)) {
44 unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
45 unsigned long vaddr = (unsigned long) __va(phys_addr);
46
47 /*
 48	 * Must use an address here and not struct page because the phys addr
 49	 * can be in a hole between nodes and not have a memmap entry.
50 */
51 err = change_page_attr_addr(vaddr,npages,__pgprot(__PAGE_KERNEL|flags));
52 if (!err)
53 global_flush_tlb();
54 }
55 return err;
56}
57
58/*
59 * Generic mapping function
60 */
61
62/*
63 * Remap an arbitrary physical address space into the kernel virtual
64 * address space. Needed when the kernel wants to access high addresses
65 * directly.
66 *
67 * NOTE! We need to allow non-page-aligned mappings too: we will obviously
68 * have to convert them into an offset in a page-aligned mapping, but the
69 * caller shouldn't need to know that small detail.
70 */
71void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
72{
73 void * addr;
74 struct vm_struct * area;
75 unsigned long offset, last_addr;
76 pgprot_t pgprot;
77
78 /* Don't allow wraparound or zero size */
79 last_addr = phys_addr + size - 1;
80 if (!size || last_addr < phys_addr)
81 return NULL;
82
83 /*
84 * Don't remap the low PCI/ISA area, it's always mapped..
85 */
86 if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
87 return (__force void __iomem *)phys_to_virt(phys_addr);
88
89#ifdef CONFIG_FLATMEM
90 /*
91 * Don't allow anybody to remap normal RAM that we're using..
92 */
93 if (last_addr < virt_to_phys(high_memory)) {
94 char *t_addr, *t_end;
95 struct page *page;
96
97 t_addr = __va(phys_addr);
98 t_end = t_addr + (size - 1);
99
100 for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
101 if(!PageReserved(page))
102 return NULL;
103 }
104#endif
105
106 pgprot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_GLOBAL
107 | _PAGE_DIRTY | _PAGE_ACCESSED | flags);
108 /*
109 * Mappings have to be page-aligned
110 */
111 offset = phys_addr & ~PAGE_MASK;
112 phys_addr &= PAGE_MASK;
113 size = PAGE_ALIGN(last_addr+1) - phys_addr;
114
115 /*
116 * Ok, go for it..
117 */
118 area = get_vm_area(size, VM_IOREMAP | (flags << 20));
119 if (!area)
120 return NULL;
121 area->phys_addr = phys_addr;
122 addr = area->addr;
123 if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
124 phys_addr, pgprot)) {
125 remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
126 return NULL;
127 }
128 if (flags && ioremap_change_attr(phys_addr, size, flags) < 0) {
129 area->flags &= 0xffffff;
130 vunmap(addr);
131 return NULL;
132 }
133 return (__force void __iomem *) (offset + (char *)addr);
134}
135EXPORT_SYMBOL(__ioremap);
136
137/**
138 * ioremap_nocache - map bus memory into CPU space
139 * @offset: bus address of the memory
140 * @size: size of the resource to map
141 *
142 * ioremap_nocache performs a platform specific sequence of operations to
143 * make bus memory CPU accessible via the readb/readw/readl/writeb/
144 * writew/writel functions and the other mmio helpers. The returned
145 * address is not guaranteed to be usable directly as a virtual
146 * address.
147 *
148 * This version of ioremap ensures that the memory is marked uncachable
149 * on the CPU as well as honouring existing caching rules from things like
150 * the PCI bus. Note that there are other caches and buffers on many
151 * busses. In particular driver authors should read up on PCI writes
152 *
153 * It's useful if some control registers are in such an area and
154 * write combining or read caching is not desirable:
155 *
156 * Must be freed with iounmap.
157 */
158
159void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
160{
161 return __ioremap(phys_addr, size, _PAGE_PCD);
162}
163EXPORT_SYMBOL(ioremap_nocache);
164
165/**
166 * iounmap - Free a IO remapping
167 * @addr: virtual address from ioremap_*
168 *
169 * Caller must ensure there is only one unmapping for the same pointer.
170 */
171void iounmap(volatile void __iomem *addr)
172{
173 struct vm_struct *p, *o;
174
175 if (addr <= high_memory)
176 return;
177 if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
178 addr < phys_to_virt(ISA_END_ADDRESS))
179 return;
180
181 addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
182 /* Use the vm area unlocked, assuming the caller
183 ensures there isn't another iounmap for the same address
184 in parallel. Reuse of the virtual address is prevented by
185 leaving it in the global lists until we're done with it.
186 cpa takes care of the direct mappings. */
187 read_lock(&vmlist_lock);
188 for (p = vmlist; p; p = p->next) {
189 if (p->addr == addr)
190 break;
191 }
192 read_unlock(&vmlist_lock);
193
194 if (!p) {
195 printk("iounmap: bad address %p\n", addr);
196 dump_stack();
197 return;
198 }
199
200 /* Reset the direct mapping. Can block */
201 if (p->flags >> 20)
202 ioremap_change_attr(p->phys_addr, p->size, 0);
203
204 /* Finally remove it */
205 o = remove_vm_area((void *)addr);
206 BUG_ON(p != o || o == NULL);
207 kfree(p);
208}
209EXPORT_SYMBOL(iounmap);
210
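
For orientation (not part of the patch): the deleted 64-bit file above and its 32-bit counterpart expose the same driver-facing pattern. A minimal, hypothetical sketch of that pattern, assuming kernel context and a made-up MMIO window:

#include <linux/io.h>
#include <linux/errno.h>
#include <linux/types.h>

#define DEMO_MMIO_PHYS	0xfebf0000UL	/* hypothetical device register base */
#define DEMO_MMIO_LEN	0x100UL

static int demo_read_status(u32 *status)
{
	void __iomem *regs;

	regs = ioremap_nocache(DEMO_MMIO_PHYS, DEMO_MMIO_LEN);	/* uncached mapping */
	if (!regs)
		return -ENOMEM;

	*status = readl(regs);		/* MMIO read through the new mapping */

	iounmap(regs);			/* every ioremap_*() needs a matching iounmap() */
	return 0;
}
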
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index a96006f7ae0c..7a2ebce87df5 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -1,9 +1,9 @@
1/* 1/*
2 * AMD K8 NUMA support. 2 * AMD K8 NUMA support.
3 * Discover the memory map and associated nodes. 3 * Discover the memory map and associated nodes.
4 * 4 *
5 * This version reads it directly from the K8 northbridge. 5 * This version reads it directly from the K8 northbridge.
6 * 6 *
7 * Copyright 2002,2003 Andi Kleen, SuSE Labs. 7 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
8 */ 8 */
9#include <linux/kernel.h> 9#include <linux/kernel.h>
@@ -22,132 +22,135 @@
22 22
23static __init int find_northbridge(void) 23static __init int find_northbridge(void)
24{ 24{
25 int num; 25 int num;
26 26
27 for (num = 0; num < 32; num++) { 27 for (num = 0; num < 32; num++) {
28 u32 header; 28 u32 header;
29 29
30 header = read_pci_config(0, num, 0, 0x00); 30 header = read_pci_config(0, num, 0, 0x00);
31 if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16))) 31 if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)) &&
32 continue; 32 header != (PCI_VENDOR_ID_AMD | (0x1200<<16)) &&
33 33 header != (PCI_VENDOR_ID_AMD | (0x1300<<16)))
34 header = read_pci_config(0, num, 1, 0x00); 34 continue;
35 if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16))) 35
36 continue; 36 header = read_pci_config(0, num, 1, 0x00);
37 return num; 37 if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)) &&
38 } 38 header != (PCI_VENDOR_ID_AMD | (0x1201<<16)) &&
39 39 header != (PCI_VENDOR_ID_AMD | (0x1301<<16)))
40 return -1; 40 continue;
41 return num;
42 }
43
44 return -1;
41} 45}
42 46
43int __init k8_scan_nodes(unsigned long start, unsigned long end) 47int __init k8_scan_nodes(unsigned long start, unsigned long end)
44{ 48{
45 unsigned long prevbase; 49 unsigned long prevbase;
46 struct bootnode nodes[8]; 50 struct bootnode nodes[8];
47 int nodeid, i, j, nb; 51 int nodeid, i, nb;
48 unsigned char nodeids[8]; 52 unsigned char nodeids[8];
49 int found = 0; 53 int found = 0;
50 u32 reg; 54 u32 reg;
51 unsigned numnodes; 55 unsigned numnodes;
52 unsigned num_cores; 56 unsigned cores;
57 unsigned bits;
58 int j;
53 59
54 if (!early_pci_allowed()) 60 if (!early_pci_allowed())
55 return -1; 61 return -1;
56 62
57 nb = find_northbridge(); 63 nb = find_northbridge();
58 if (nb < 0) 64 if (nb < 0)
59 return nb; 65 return nb;
60 66
61 printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); 67 printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb);
62
63 num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
64 printk(KERN_INFO "CPU has %d num_cores\n", num_cores);
65 68
66 reg = read_pci_config(0, nb, 0, 0x60); 69 reg = read_pci_config(0, nb, 0, 0x60);
67 numnodes = ((reg >> 4) & 0xF) + 1; 70 numnodes = ((reg >> 4) & 0xF) + 1;
68 if (numnodes <= 1) 71 if (numnodes <= 1)
69 return -1; 72 return -1;
70 73
71 printk(KERN_INFO "Number of nodes %d\n", numnodes); 74 printk(KERN_INFO "Number of nodes %d\n", numnodes);
72 75
73 memset(&nodes,0,sizeof(nodes)); 76 memset(&nodes, 0, sizeof(nodes));
74 prevbase = 0; 77 prevbase = 0;
75 for (i = 0; i < 8; i++) { 78 for (i = 0; i < 8; i++) {
76 unsigned long base,limit; 79 unsigned long base, limit;
77 u32 nodeid; 80 u32 nodeid;
78 81
79 base = read_pci_config(0, nb, 1, 0x40 + i*8); 82 base = read_pci_config(0, nb, 1, 0x40 + i*8);
80 limit = read_pci_config(0, nb, 1, 0x44 + i*8); 83 limit = read_pci_config(0, nb, 1, 0x44 + i*8);
81 84
82 nodeid = limit & 7; 85 nodeid = limit & 7;
83 nodeids[i] = nodeid; 86 nodeids[i] = nodeid;
84 if ((base & 3) == 0) { 87 if ((base & 3) == 0) {
85 if (i < numnodes) 88 if (i < numnodes)
86 printk("Skipping disabled node %d\n", i); 89 printk("Skipping disabled node %d\n", i);
87 continue; 90 continue;
88 } 91 }
89 if (nodeid >= numnodes) { 92 if (nodeid >= numnodes) {
90 printk("Ignoring excess node %d (%lx:%lx)\n", nodeid, 93 printk("Ignoring excess node %d (%lx:%lx)\n", nodeid,
91 base, limit); 94 base, limit);
92 continue; 95 continue;
93 } 96 }
94 97
95 if (!limit) { 98 if (!limit) {
96 printk(KERN_INFO "Skipping node entry %d (base %lx)\n", i, 99 printk(KERN_INFO "Skipping node entry %d (base %lx)\n",
97 base); 100 i, base);
98 continue; 101 continue;
99 } 102 }
100 if ((base >> 8) & 3 || (limit >> 8) & 3) { 103 if ((base >> 8) & 3 || (limit >> 8) & 3) {
101 printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", 104 printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n",
102 nodeid, (base>>8)&3, (limit>>8) & 3); 105 nodeid, (base>>8)&3, (limit>>8) & 3);
103 return -1; 106 return -1;
104 } 107 }
105 if (node_isset(nodeid, node_possible_map)) { 108 if (node_isset(nodeid, node_possible_map)) {
106 printk(KERN_INFO "Node %d already present. Skipping\n", 109 printk(KERN_INFO "Node %d already present. Skipping\n",
107 nodeid); 110 nodeid);
108 continue; 111 continue;
109 } 112 }
110 113
111 limit >>= 16; 114 limit >>= 16;
112 limit <<= 24; 115 limit <<= 24;
113 limit |= (1<<24)-1; 116 limit |= (1<<24)-1;
114 limit++; 117 limit++;
115 118
116 if (limit > end_pfn << PAGE_SHIFT) 119 if (limit > end_pfn << PAGE_SHIFT)
117 limit = end_pfn << PAGE_SHIFT; 120 limit = end_pfn << PAGE_SHIFT;
118 if (limit <= base) 121 if (limit <= base)
119 continue; 122 continue;
120 123
121 base >>= 16; 124 base >>= 16;
122 base <<= 24; 125 base <<= 24;
123 126
124 if (base < start) 127 if (base < start)
125 base = start; 128 base = start;
126 if (limit > end) 129 if (limit > end)
127 limit = end; 130 limit = end;
128 if (limit == base) { 131 if (limit == base) {
129 printk(KERN_ERR "Empty node %d\n", nodeid); 132 printk(KERN_ERR "Empty node %d\n", nodeid);
130 continue; 133 continue;
131 } 134 }
132 if (limit < base) { 135 if (limit < base) {
133 printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n", 136 printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n",
134 nodeid, base, limit); 137 nodeid, base, limit);
135 continue; 138 continue;
136 } 139 }
137 140
138 /* Could sort here, but pun for now. Should not happen anyroads. */ 141 /* Could sort here, but pun for now. Should not happen anyroads. */
139 if (prevbase > base) { 142 if (prevbase > base) {
140 printk(KERN_ERR "Node map not sorted %lx,%lx\n", 143 printk(KERN_ERR "Node map not sorted %lx,%lx\n",
141 prevbase,base); 144 prevbase, base);
142 return -1; 145 return -1;
143 } 146 }
144 147
145 printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", 148 printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n",
146 nodeid, base, limit); 149 nodeid, base, limit);
147 150
148 found++; 151 found++;
149 152
150 nodes[nodeid].start = base; 153 nodes[nodeid].start = base;
151 nodes[nodeid].end = limit; 154 nodes[nodeid].end = limit;
152 e820_register_active_regions(nodeid, 155 e820_register_active_regions(nodeid,
153 nodes[nodeid].start >> PAGE_SHIFT, 156 nodes[nodeid].start >> PAGE_SHIFT,
@@ -156,27 +159,31 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
156 prevbase = base; 159 prevbase = base;
157 160
158 node_set(nodeid, node_possible_map); 161 node_set(nodeid, node_possible_map);
159 } 162 }
160 163
161 if (!found) 164 if (!found)
162 return -1; 165 return -1;
163 166
164 memnode_shift = compute_hash_shift(nodes, 8); 167 memnode_shift = compute_hash_shift(nodes, 8);
165 if (memnode_shift < 0) { 168 if (memnode_shift < 0) {
166 printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); 169 printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n");
167 return -1; 170 return -1;
168 } 171 }
169 printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); 172 printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift);
173
174 /* use the coreid bits from early_identify_cpu */
175 bits = boot_cpu_data.x86_coreid_bits;
176 cores = (1<<bits);
170 177
171 for (i = 0; i < 8; i++) { 178 for (i = 0; i < 8; i++) {
172 if (nodes[i].start != nodes[i].end) { 179 if (nodes[i].start != nodes[i].end) {
173 nodeid = nodeids[i]; 180 nodeid = nodeids[i];
174 for (j = 0; j < num_cores; j++) 181 for (j = 0; j < cores; j++)
175 apicid_to_node[(nodeid * num_cores) + j] = i; 182 apicid_to_node[(nodeid << bits) + j] = i;
176 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 183 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
177 } 184 }
178 } 185 }
179 186
180 numa_init_array(); 187 numa_init_array();
181 return 0; 188 return 0;
182} 189}
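
A brief illustration (not from the patch) of the apicid_to_node fill-in that the hunk above reworks: AMD K8 APIC IDs carry the core number in the low x86_coreid_bits bits, so each node owns a power-of-two block of APIC IDs starting at (nodeid << bits). Multiplying by the reported core count, as the removed code did, goes wrong whenever that count is not itself a power of two. The names below are illustrative; kernel context is assumed.

#include <linux/types.h>

static void fill_apicid_to_node(u32 nodeid, int node, unsigned int bits,
				s16 *apicid_to_node)
{
	unsigned int cores = 1U << bits;	/* APIC IDs reserved per node */
	unsigned int j;

	for (j = 0; j < cores; j++)
		apicid_to_node[(nodeid << bits) + j] = node;	/* base is a shift, not a multiply */
}
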
diff --git a/arch/x86/mm/mmap_32.c b/arch/x86/mm/mmap.c
index 552e08473755..56fe7124fbec 100644
--- a/arch/x86/mm/mmap_32.c
+++ b/arch/x86/mm/mmap.c
@@ -1,10 +1,13 @@
1/* 1/*
2 * linux/arch/i386/mm/mmap.c 2 * Flexible mmap layout support
3 * 3 *
4 * flexible mmap layout support 4 * Based on code by Ingo Molnar and Andi Kleen, copyrighted
5 * as follows:
5 * 6 *
6 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. 7 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
7 * All Rights Reserved. 8 * All Rights Reserved.
9 * Copyright 2005 Andi Kleen, SUSE Labs.
10 * Copyright 2007 Jiri Kosina, SUSE Labs.
8 * 11 *
9 * This program is free software; you can redistribute it and/or modify 12 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by 13 * it under the terms of the GNU General Public License as published by
@@ -19,14 +22,12 @@
19 * You should have received a copy of the GNU General Public License 22 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software 23 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 24 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 *
23 *
24 * Started by Ingo Molnar <mingo@elte.hu>
25 */ 25 */
26 26
27#include <linux/personality.h> 27#include <linux/personality.h>
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/random.h> 29#include <linux/random.h>
30#include <linux/limits.h>
30#include <linux/sched.h> 31#include <linux/sched.h>
31 32
32/* 33/*
@@ -37,20 +38,71 @@
37#define MIN_GAP (128*1024*1024) 38#define MIN_GAP (128*1024*1024)
38#define MAX_GAP (TASK_SIZE/6*5) 39#define MAX_GAP (TASK_SIZE/6*5)
39 40
40static inline unsigned long mmap_base(struct mm_struct *mm) 41/*
42 * True on X86_32 or when emulating IA32 on X86_64
43 */
44static int mmap_is_ia32(void)
41{ 45{
42 unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; 46#ifdef CONFIG_X86_32
43 unsigned long random_factor = 0; 47 return 1;
48#endif
49#ifdef CONFIG_IA32_EMULATION
50 if (test_thread_flag(TIF_IA32))
51 return 1;
52#endif
53 return 0;
54}
44 55
45 if (current->flags & PF_RANDOMIZE) 56static int mmap_is_legacy(void)
46 random_factor = get_random_int() % (1024*1024); 57{
58 if (current->personality & ADDR_COMPAT_LAYOUT)
59 return 1;
60
61 if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY)
62 return 1;
63
64 return sysctl_legacy_va_layout;
65}
66
67static unsigned long mmap_rnd(void)
68{
69 unsigned long rnd = 0;
70
71 /*
72 * 8 bits of randomness in 32bit mmaps, 20 address space bits
73 * 28 bits of randomness in 64bit mmaps, 40 address space bits
74 */
75 if (current->flags & PF_RANDOMIZE) {
76 if (mmap_is_ia32())
77 rnd = (long)get_random_int() % (1<<8);
78 else
79 rnd = (long)(get_random_int() % (1<<28));
80 }
81 return rnd << PAGE_SHIFT;
82}
83
84static unsigned long mmap_base(void)
85{
86 unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
47 87
48 if (gap < MIN_GAP) 88 if (gap < MIN_GAP)
49 gap = MIN_GAP; 89 gap = MIN_GAP;
50 else if (gap > MAX_GAP) 90 else if (gap > MAX_GAP)
51 gap = MAX_GAP; 91 gap = MAX_GAP;
52 92
53 return PAGE_ALIGN(TASK_SIZE - gap - random_factor); 93 return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd());
94}
95
96/*
97 * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64
98 * does, but not when emulating X86_32
99 */
100static unsigned long mmap_legacy_base(void)
101{
102 if (mmap_is_ia32())
103 return TASK_UNMAPPED_BASE;
104 else
105 return TASK_UNMAPPED_BASE + mmap_rnd();
54} 106}
55 107
56/* 108/*
@@ -59,18 +111,12 @@ static inline unsigned long mmap_base(struct mm_struct *mm)
59 */ 111 */
60void arch_pick_mmap_layout(struct mm_struct *mm) 112void arch_pick_mmap_layout(struct mm_struct *mm)
61{ 113{
62 /* 114 if (mmap_is_legacy()) {
63 * Fall back to the standard layout if the personality 115 mm->mmap_base = mmap_legacy_base();
64 * bit is set, or if the expected stack growth is unlimited:
65 */
66 if (sysctl_legacy_va_layout ||
67 (current->personality & ADDR_COMPAT_LAYOUT) ||
68 current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) {
69 mm->mmap_base = TASK_UNMAPPED_BASE;
70 mm->get_unmapped_area = arch_get_unmapped_area; 116 mm->get_unmapped_area = arch_get_unmapped_area;
71 mm->unmap_area = arch_unmap_area; 117 mm->unmap_area = arch_unmap_area;
72 } else { 118 } else {
73 mm->mmap_base = mmap_base(mm); 119 mm->mmap_base = mmap_base();
74 mm->get_unmapped_area = arch_get_unmapped_area_topdown; 120 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
75 mm->unmap_area = arch_unmap_area_topdown; 121 mm->unmap_area = arch_unmap_area_topdown;
76 } 122 }
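
To make the randomness comment in mmap_rnd() above concrete, here is a stand-alone user-space sketch (my own illustration, not kernel code) that works out the jitter span; a 64-bit host and 4 KiB pages are assumed:

#include <stdio.h>

int main(void)
{
	unsigned long page_shift = 12;				/* assume 4 KiB pages */
	unsigned long span32 = (1UL << 8)  << page_shift;	/* 8 random bits  */
	unsigned long span64 = (1UL << 28) << page_shift;	/* 28 random bits */

	printf("ia32 mmap base jitter:   %lu MiB\n", span32 >> 20);	/* 1    */
	printf("64-bit mmap base jitter: %lu GiB\n", span64 >> 30);	/* 1024 */
	return 0;
}

That is, 8 or 28 bits of randomness become roughly 20 or 40 randomised address bits once shifted up by PAGE_SHIFT, exactly as the comment states.
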
diff --git a/arch/x86/mm/mmap_64.c b/arch/x86/mm/mmap_64.c
deleted file mode 100644
index 80bba0dc000e..000000000000
--- a/arch/x86/mm/mmap_64.c
+++ /dev/null
@@ -1,29 +0,0 @@
1/* Copyright 2005 Andi Kleen, SuSE Labs.
2 * Licensed under GPL, v.2
3 */
4#include <linux/mm.h>
5#include <linux/sched.h>
6#include <linux/random.h>
7#include <asm/ia32.h>
8
9/* Notebook: move the mmap code from sys_x86_64.c over here. */
10
11void arch_pick_mmap_layout(struct mm_struct *mm)
12{
13#ifdef CONFIG_IA32_EMULATION
14 if (current_thread_info()->flags & _TIF_IA32)
15 return ia32_pick_mmap_layout(mm);
16#endif
17 mm->mmap_base = TASK_UNMAPPED_BASE;
18 if (current->flags & PF_RANDOMIZE) {
19 /* Add 28bit randomness which is about 40bits of address space
20 because mmap base has to be page aligned.
21 or ~1/128 of the total user VM
22 (total user address space is 47bits) */
23 unsigned rnd = get_random_int() & 0xfffffff;
24 mm->mmap_base += ((unsigned long)rnd) << PAGE_SHIFT;
25 }
26 mm->get_unmapped_area = arch_get_unmapped_area;
27 mm->unmap_area = arch_unmap_area;
28}
29
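
Note (not part of the patch) that the deleted 64-bit helper above and the new common mmap_rnd() draw the same 28 bits of entropy; only the spelling differs. A tiny stand-alone sketch of the equivalence:

#include <assert.h>

unsigned long rnd_28bits(unsigned int raw)
{
	unsigned long old_style = raw & 0xfffffff;	/* deleted mmap_64.c: mask     */
	unsigned long new_style = raw % (1U << 28);	/* new mmap_rnd(): modulo 2^28 */

	assert(old_style == new_style);			/* 0xfffffff == (1U << 28) - 1 */
	return new_style;
}
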
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 3d6926ba8995..dc3b1f7e1451 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Generic VM initialization for x86-64 NUMA setups. 2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs. 3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 */ 4 */
5#include <linux/kernel.h> 5#include <linux/kernel.h>
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/string.h> 7#include <linux/string.h>
@@ -11,35 +11,45 @@
11#include <linux/ctype.h> 11#include <linux/ctype.h>
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/nodemask.h> 13#include <linux/nodemask.h>
14#include <linux/sched.h>
14 15
15#include <asm/e820.h> 16#include <asm/e820.h>
16#include <asm/proto.h> 17#include <asm/proto.h>
17#include <asm/dma.h> 18#include <asm/dma.h>
18#include <asm/numa.h> 19#include <asm/numa.h>
19#include <asm/acpi.h> 20#include <asm/acpi.h>
21#include <asm/k8.h>
20 22
21#ifndef Dprintk 23#ifndef Dprintk
22#define Dprintk(x...) 24#define Dprintk(x...)
23#endif 25#endif
24 26
25struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 27struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
28EXPORT_SYMBOL(node_data);
29
26bootmem_data_t plat_node_bdata[MAX_NUMNODES]; 30bootmem_data_t plat_node_bdata[MAX_NUMNODES];
27 31
28struct memnode memnode; 32struct memnode memnode;
29 33
30unsigned char cpu_to_node[NR_CPUS] __read_mostly = { 34int x86_cpu_to_node_map_init[NR_CPUS] = {
31 [0 ... NR_CPUS-1] = NUMA_NO_NODE 35 [0 ... NR_CPUS-1] = NUMA_NO_NODE
32}; 36};
33unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { 37void *x86_cpu_to_node_map_early_ptr;
34 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE 38DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
39EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map);
40EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr);
41
42s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
43 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
35}; 44};
36cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; 45
46cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly;
47EXPORT_SYMBOL(node_to_cpumask_map);
37 48
38int numa_off __initdata; 49int numa_off __initdata;
39unsigned long __initdata nodemap_addr; 50unsigned long __initdata nodemap_addr;
40unsigned long __initdata nodemap_size; 51unsigned long __initdata nodemap_size;
41 52
42
43/* 53/*
44 * Given a shift value, try to populate memnodemap[] 54 * Given a shift value, try to populate memnodemap[]
45 * Returns : 55 * Returns :
@@ -47,14 +57,13 @@ unsigned long __initdata nodemap_size;
47 * 0 if memnodmap[] too small (of shift too small) 57 * 0 if memnodmap[] too small (of shift too small)
48 * -1 if node overlap or lost ram (shift too big) 58 * -1 if node overlap or lost ram (shift too big)
49 */ 59 */
50static int __init 60static int __init populate_memnodemap(const struct bootnode *nodes,
51populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift) 61 int numnodes, int shift)
52{ 62{
53 int i;
54 int res = -1;
55 unsigned long addr, end; 63 unsigned long addr, end;
64 int i, res = -1;
56 65
57 memset(memnodemap, 0xff, memnodemapsize); 66 memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
58 for (i = 0; i < numnodes; i++) { 67 for (i = 0; i < numnodes; i++) {
59 addr = nodes[i].start; 68 addr = nodes[i].start;
60 end = nodes[i].end; 69 end = nodes[i].end;
@@ -63,13 +72,13 @@ populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
63 if ((end >> shift) >= memnodemapsize) 72 if ((end >> shift) >= memnodemapsize)
64 return 0; 73 return 0;
65 do { 74 do {
66 if (memnodemap[addr >> shift] != 0xff) 75 if (memnodemap[addr >> shift] != NUMA_NO_NODE)
67 return -1; 76 return -1;
68 memnodemap[addr >> shift] = i; 77 memnodemap[addr >> shift] = i;
69 addr += (1UL << shift); 78 addr += (1UL << shift);
70 } while (addr < end); 79 } while (addr < end);
71 res = 1; 80 res = 1;
72 } 81 }
73 return res; 82 return res;
74} 83}
75 84
@@ -78,12 +87,12 @@ static int __init allocate_cachealigned_memnodemap(void)
78 unsigned long pad, pad_addr; 87 unsigned long pad, pad_addr;
79 88
80 memnodemap = memnode.embedded_map; 89 memnodemap = memnode.embedded_map;
81 if (memnodemapsize <= 48) 90 if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
82 return 0; 91 return 0;
83 92
84 pad = L1_CACHE_BYTES - 1; 93 pad = L1_CACHE_BYTES - 1;
85 pad_addr = 0x8000; 94 pad_addr = 0x8000;
86 nodemap_size = pad + memnodemapsize; 95 nodemap_size = pad + sizeof(s16) * memnodemapsize;
87 nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT, 96 nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT,
88 nodemap_size); 97 nodemap_size);
89 if (nodemap_addr == -1UL) { 98 if (nodemap_addr == -1UL) {
@@ -94,6 +103,7 @@ static int __init allocate_cachealigned_memnodemap(void)
94 } 103 }
95 pad_addr = (nodemap_addr + pad) & ~pad; 104 pad_addr = (nodemap_addr + pad) & ~pad;
96 memnodemap = phys_to_virt(pad_addr); 105 memnodemap = phys_to_virt(pad_addr);
106 reserve_early(nodemap_addr, nodemap_addr + nodemap_size);
97 107
98 printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", 108 printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
99 nodemap_addr, nodemap_addr + nodemap_size); 109 nodemap_addr, nodemap_addr + nodemap_size);
@@ -104,8 +114,8 @@ static int __init allocate_cachealigned_memnodemap(void)
104 * The LSB of all start and end addresses in the node map is the value of the 114 * The LSB of all start and end addresses in the node map is the value of the
105 * maximum possible shift. 115 * maximum possible shift.
106 */ 116 */
107static int __init 117static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
108extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes) 118 int numnodes)
109{ 119{
110 int i, nodes_used = 0; 120 int i, nodes_used = 0;
111 unsigned long start, end; 121 unsigned long start, end;
@@ -140,51 +150,50 @@ int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
140 shift); 150 shift);
141 151
142 if (populate_memnodemap(nodes, numnodes, shift) != 1) { 152 if (populate_memnodemap(nodes, numnodes, shift) != 1) {
143 printk(KERN_INFO 153 printk(KERN_INFO "Your memory is not aligned you need to "
144 "Your memory is not aligned you need to rebuild your kernel " 154 "rebuild your kernel with a bigger NODEMAPSIZE "
145 "with a bigger NODEMAPSIZE shift=%d\n", 155 "shift=%d\n", shift);
146 shift);
147 return -1; 156 return -1;
148 } 157 }
149 return shift; 158 return shift;
150} 159}
151 160
152#ifdef CONFIG_SPARSEMEM
153int early_pfn_to_nid(unsigned long pfn) 161int early_pfn_to_nid(unsigned long pfn)
154{ 162{
155 return phys_to_nid(pfn << PAGE_SHIFT); 163 return phys_to_nid(pfn << PAGE_SHIFT);
156} 164}
157#endif
158 165
159static void * __init 166static void * __init early_node_mem(int nodeid, unsigned long start,
160early_node_mem(int nodeid, unsigned long start, unsigned long end, 167 unsigned long end, unsigned long size)
161 unsigned long size)
162{ 168{
163 unsigned long mem = find_e820_area(start, end, size); 169 unsigned long mem = find_e820_area(start, end, size);
164 void *ptr; 170 void *ptr;
171
165 if (mem != -1L) 172 if (mem != -1L)
166 return __va(mem); 173 return __va(mem);
167 ptr = __alloc_bootmem_nopanic(size, 174 ptr = __alloc_bootmem_nopanic(size,
168 SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)); 175 SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
169 if (ptr == NULL) { 176 if (ptr == NULL) {
170 printk(KERN_ERR "Cannot find %lu bytes in node %d\n", 177 printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
171 size, nodeid); 178 size, nodeid);
172 return NULL; 179 return NULL;
173 } 180 }
174 return ptr; 181 return ptr;
175} 182}
176 183
177/* Initialize bootmem allocator for a node */ 184/* Initialize bootmem allocator for a node */
178void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) 185void __init setup_node_bootmem(int nodeid, unsigned long start,
179{ 186 unsigned long end)
180 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; 187{
181 unsigned long nodedata_phys; 188 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size;
189 unsigned long bootmap_start, nodedata_phys;
182 void *bootmap; 190 void *bootmap;
183 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); 191 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
184 192
185 start = round_up(start, ZONE_ALIGN); 193 start = round_up(start, ZONE_ALIGN);
186 194
187 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end); 195 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
196 start, end);
188 197
189 start_pfn = start >> PAGE_SHIFT; 198 start_pfn = start >> PAGE_SHIFT;
190 end_pfn = end >> PAGE_SHIFT; 199 end_pfn = end >> PAGE_SHIFT;
@@ -200,75 +209,55 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
200 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; 209 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
201 210
202 /* Find a place for the bootmem map */ 211 /* Find a place for the bootmem map */
203 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); 212 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
204 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); 213 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
205 bootmap = early_node_mem(nodeid, bootmap_start, end, 214 bootmap = early_node_mem(nodeid, bootmap_start, end,
206 bootmap_pages<<PAGE_SHIFT); 215 bootmap_pages<<PAGE_SHIFT);
207 if (bootmap == NULL) { 216 if (bootmap == NULL) {
208 if (nodedata_phys < start || nodedata_phys >= end) 217 if (nodedata_phys < start || nodedata_phys >= end)
209 free_bootmem((unsigned long)node_data[nodeid],pgdat_size); 218 free_bootmem((unsigned long)node_data[nodeid],
219 pgdat_size);
210 node_data[nodeid] = NULL; 220 node_data[nodeid] = NULL;
211 return; 221 return;
212 } 222 }
213 bootmap_start = __pa(bootmap); 223 bootmap_start = __pa(bootmap);
214 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); 224 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
215 225
216 bootmap_size = init_bootmem_node(NODE_DATA(nodeid), 226 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
217 bootmap_start >> PAGE_SHIFT, 227 bootmap_start >> PAGE_SHIFT,
218 start_pfn, end_pfn); 228 start_pfn, end_pfn);
219 229
220 free_bootmem_with_active_regions(nodeid, end); 230 free_bootmem_with_active_regions(nodeid, end);
221 231
222 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); 232 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
223 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT); 233 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
234 bootmap_pages<<PAGE_SHIFT);
224#ifdef CONFIG_ACPI_NUMA 235#ifdef CONFIG_ACPI_NUMA
225 srat_reserve_add_area(nodeid); 236 srat_reserve_add_area(nodeid);
226#endif 237#endif
227 node_set_online(nodeid); 238 node_set_online(nodeid);
228} 239}
229
230/* Initialize final allocator for a zone */
231void __init setup_node_zones(int nodeid)
232{
233 unsigned long start_pfn, end_pfn, memmapsize, limit;
234
235 start_pfn = node_start_pfn(nodeid);
236 end_pfn = node_end_pfn(nodeid);
237
238 Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n",
239 nodeid, start_pfn, end_pfn);
240
241 /* Try to allocate mem_map at end to not fill up precious <4GB
242 memory. */
243 memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
244 limit = end_pfn << PAGE_SHIFT;
245#ifdef CONFIG_FLAT_NODE_MEM_MAP
246 NODE_DATA(nodeid)->node_mem_map =
247 __alloc_bootmem_core(NODE_DATA(nodeid)->bdata,
248 memmapsize, SMP_CACHE_BYTES,
249 round_down(limit - memmapsize, PAGE_SIZE),
250 limit);
251#endif
252}
253 240
241/*
242 * There are unfortunately some poorly designed mainboards around that
243 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
244 * mapping. To avoid this fill in the mapping for all possible CPUs,
245 * as the number of CPUs is not known yet. We round robin the existing
246 * nodes.
247 */
254void __init numa_init_array(void) 248void __init numa_init_array(void)
255{ 249{
256 int rr, i; 250 int rr, i;
257 /* There are unfortunately some poorly designed mainboards around 251
258 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
259 mapping. To avoid this fill in the mapping for all possible
260 CPUs, as the number of CPUs is not known yet.
261 We round robin the existing nodes. */
262 rr = first_node(node_online_map); 252 rr = first_node(node_online_map);
263 for (i = 0; i < NR_CPUS; i++) { 253 for (i = 0; i < NR_CPUS; i++) {
264 if (cpu_to_node(i) != NUMA_NO_NODE) 254 if (early_cpu_to_node(i) != NUMA_NO_NODE)
265 continue; 255 continue;
266 numa_set_node(i, rr); 256 numa_set_node(i, rr);
267 rr = next_node(rr, node_online_map); 257 rr = next_node(rr, node_online_map);
268 if (rr == MAX_NUMNODES) 258 if (rr == MAX_NUMNODES)
269 rr = first_node(node_online_map); 259 rr = first_node(node_online_map);
270 } 260 }
271
272} 261}
273 262
274#ifdef CONFIG_NUMA_EMU 263#ifdef CONFIG_NUMA_EMU
@@ -276,15 +265,17 @@ void __init numa_init_array(void)
276char *cmdline __initdata; 265char *cmdline __initdata;
277 266
278/* 267/*
279 * Setups up nid to range from addr to addr + size. If the end boundary is 268 * Setups up nid to range from addr to addr + size. If the end
280 * greater than max_addr, then max_addr is used instead. The return value is 0 269 * boundary is greater than max_addr, then max_addr is used instead.
281 * if there is additional memory left for allocation past addr and -1 otherwise. 270 * The return value is 0 if there is additional memory left for
282 * addr is adjusted to be at the end of the node. 271 * allocation past addr and -1 otherwise. addr is adjusted to be at
272 * the end of the node.
283 */ 273 */
284static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, 274static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
285 u64 size, u64 max_addr) 275 u64 size, u64 max_addr)
286{ 276{
287 int ret = 0; 277 int ret = 0;
278
288 nodes[nid].start = *addr; 279 nodes[nid].start = *addr;
289 *addr += size; 280 *addr += size;
290 if (*addr >= max_addr) { 281 if (*addr >= max_addr) {
@@ -335,6 +326,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
335 326
336 for (i = node_start; i < num_nodes + node_start; i++) { 327 for (i = node_start; i < num_nodes + node_start; i++) {
337 u64 end = *addr + size; 328 u64 end = *addr + size;
329
338 if (i < big) 330 if (i < big)
339 end += FAKE_NODE_MIN_SIZE; 331 end += FAKE_NODE_MIN_SIZE;
340 /* 332 /*
@@ -380,14 +372,9 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
380static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) 372static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
381{ 373{
382 struct bootnode nodes[MAX_NUMNODES]; 374 struct bootnode nodes[MAX_NUMNODES];
383 u64 addr = start_pfn << PAGE_SHIFT; 375 u64 size, addr = start_pfn << PAGE_SHIFT;
384 u64 max_addr = end_pfn << PAGE_SHIFT; 376 u64 max_addr = end_pfn << PAGE_SHIFT;
385 int num_nodes = 0; 377 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
386 int coeff_flag;
387 int coeff = -1;
388 int num = 0;
389 u64 size;
390 int i;
391 378
392 memset(&nodes, 0, sizeof(nodes)); 379 memset(&nodes, 0, sizeof(nodes));
393 /* 380 /*
@@ -395,8 +382,9 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
395 * system RAM into N fake nodes. 382 * system RAM into N fake nodes.
396 */ 383 */
397 if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { 384 if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
398 num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, 385 long n = simple_strtol(cmdline, NULL, 0);
399 simple_strtol(cmdline, NULL, 0)); 386
387 num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
400 if (num_nodes < 0) 388 if (num_nodes < 0)
401 return num_nodes; 389 return num_nodes;
402 goto out; 390 goto out;
@@ -483,46 +471,47 @@ out:
483 for_each_node_mask(i, node_possible_map) { 471 for_each_node_mask(i, node_possible_map) {
484 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, 472 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
485 nodes[i].end >> PAGE_SHIFT); 473 nodes[i].end >> PAGE_SHIFT);
486 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 474 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
487 } 475 }
488 acpi_fake_nodes(nodes, num_nodes); 476 acpi_fake_nodes(nodes, num_nodes);
489 numa_init_array(); 477 numa_init_array();
490 return 0; 478 return 0;
491} 479}
492#endif /* CONFIG_NUMA_EMU */ 480#endif /* CONFIG_NUMA_EMU */
493 481
494void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) 482void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
495{ 483{
496 int i; 484 int i;
497 485
498 nodes_clear(node_possible_map); 486 nodes_clear(node_possible_map);
499 487
500#ifdef CONFIG_NUMA_EMU 488#ifdef CONFIG_NUMA_EMU
501 if (cmdline && !numa_emulation(start_pfn, end_pfn)) 489 if (cmdline && !numa_emulation(start_pfn, end_pfn))
502 return; 490 return;
503 nodes_clear(node_possible_map); 491 nodes_clear(node_possible_map);
504#endif 492#endif
505 493
506#ifdef CONFIG_ACPI_NUMA 494#ifdef CONFIG_ACPI_NUMA
507 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, 495 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
508 end_pfn << PAGE_SHIFT)) 496 end_pfn << PAGE_SHIFT))
509 return; 497 return;
510 nodes_clear(node_possible_map); 498 nodes_clear(node_possible_map);
511#endif 499#endif
512 500
513#ifdef CONFIG_K8_NUMA 501#ifdef CONFIG_K8_NUMA
514 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT)) 502 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT,
503 end_pfn<<PAGE_SHIFT))
515 return; 504 return;
516 nodes_clear(node_possible_map); 505 nodes_clear(node_possible_map);
517#endif 506#endif
518 printk(KERN_INFO "%s\n", 507 printk(KERN_INFO "%s\n",
519 numa_off ? "NUMA turned off" : "No NUMA configuration found"); 508 numa_off ? "NUMA turned off" : "No NUMA configuration found");
520 509
521 printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 510 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
522 start_pfn << PAGE_SHIFT, 511 start_pfn << PAGE_SHIFT,
523 end_pfn << PAGE_SHIFT); 512 end_pfn << PAGE_SHIFT);
524 /* setup dummy node covering all memory */ 513 /* setup dummy node covering all memory */
525 memnode_shift = 63; 514 memnode_shift = 63;
526 memnodemap = memnode.embedded_map; 515 memnodemap = memnode.embedded_map;
527 memnodemap[0] = 0; 516 memnodemap[0] = 0;
528 nodes_clear(node_online_map); 517 nodes_clear(node_online_map);
@@ -530,36 +519,48 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
530 node_set(0, node_possible_map); 519 node_set(0, node_possible_map);
531 for (i = 0; i < NR_CPUS; i++) 520 for (i = 0; i < NR_CPUS; i++)
532 numa_set_node(i, 0); 521 numa_set_node(i, 0);
533 node_to_cpumask[0] = cpumask_of_cpu(0); 522 /* cpumask_of_cpu() may not be available during early startup */
523 memset(&node_to_cpumask_map[0], 0, sizeof(node_to_cpumask_map[0]));
524 cpu_set(0, node_to_cpumask_map[0]);
534 e820_register_active_regions(0, start_pfn, end_pfn); 525 e820_register_active_regions(0, start_pfn, end_pfn);
535 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); 526 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
536} 527}
537 528
538__cpuinit void numa_add_cpu(int cpu) 529__cpuinit void numa_add_cpu(int cpu)
539{ 530{
540 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]); 531 set_bit(cpu,
541} 532 (unsigned long *)&node_to_cpumask_map[early_cpu_to_node(cpu)]);
533}
542 534
543void __cpuinit numa_set_node(int cpu, int node) 535void __cpuinit numa_set_node(int cpu, int node)
544{ 536{
537 int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr;
538
545 cpu_pda(cpu)->nodenumber = node; 539 cpu_pda(cpu)->nodenumber = node;
546 cpu_to_node(cpu) = node; 540
541 if(cpu_to_node_map)
542 cpu_to_node_map[cpu] = node;
543 else if(per_cpu_offset(cpu))
544 per_cpu(x86_cpu_to_node_map, cpu) = node;
545 else
546 Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu);
547} 547}
548 548
549unsigned long __init numa_free_all_bootmem(void) 549unsigned long __init numa_free_all_bootmem(void)
550{ 550{
551 int i;
552 unsigned long pages = 0; 551 unsigned long pages = 0;
553 for_each_online_node(i) { 552 int i;
553
554 for_each_online_node(i)
554 pages += free_all_bootmem_node(NODE_DATA(i)); 555 pages += free_all_bootmem_node(NODE_DATA(i));
555 } 556
556 return pages; 557 return pages;
557} 558}
558 559
559void __init paging_init(void) 560void __init paging_init(void)
560{ 561{
561 int i;
562 unsigned long max_zone_pfns[MAX_NR_ZONES]; 562 unsigned long max_zone_pfns[MAX_NR_ZONES];
563
563 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 564 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
564 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; 565 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
565 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 566 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
@@ -568,32 +569,27 @@ void __init paging_init(void)
568 sparse_memory_present_with_active_regions(MAX_NUMNODES); 569 sparse_memory_present_with_active_regions(MAX_NUMNODES);
569 sparse_init(); 570 sparse_init();
570 571
571 for_each_online_node(i) {
572 setup_node_zones(i);
573 }
574
575 free_area_init_nodes(max_zone_pfns); 572 free_area_init_nodes(max_zone_pfns);
576} 573}
577 574
578static __init int numa_setup(char *opt) 575static __init int numa_setup(char *opt)
579{ 576{
580 if (!opt) 577 if (!opt)
581 return -EINVAL; 578 return -EINVAL;
582 if (!strncmp(opt,"off",3)) 579 if (!strncmp(opt, "off", 3))
583 numa_off = 1; 580 numa_off = 1;
584#ifdef CONFIG_NUMA_EMU 581#ifdef CONFIG_NUMA_EMU
585 if (!strncmp(opt, "fake=", 5)) 582 if (!strncmp(opt, "fake=", 5))
586 cmdline = opt + 5; 583 cmdline = opt + 5;
587#endif 584#endif
588#ifdef CONFIG_ACPI_NUMA 585#ifdef CONFIG_ACPI_NUMA
589 if (!strncmp(opt,"noacpi",6)) 586 if (!strncmp(opt, "noacpi", 6))
590 acpi_numa = -1; 587 acpi_numa = -1;
591 if (!strncmp(opt,"hotadd=", 7)) 588 if (!strncmp(opt, "hotadd=", 7))
592 hotadd_percent = simple_strtoul(opt+7, NULL, 10); 589 hotadd_percent = simple_strtoul(opt+7, NULL, 10);
593#endif 590#endif
594 return 0; 591 return 0;
595} 592}
596
597early_param("numa", numa_setup); 593early_param("numa", numa_setup);
598 594
599/* 595/*
@@ -611,38 +607,16 @@ early_param("numa", numa_setup);
611void __init init_cpu_to_node(void) 607void __init init_cpu_to_node(void)
612{ 608{
613 int i; 609 int i;
614 for (i = 0; i < NR_CPUS; i++) { 610
615 u8 apicid = x86_cpu_to_apicid_init[i]; 611 for (i = 0; i < NR_CPUS; i++) {
612 u16 apicid = x86_cpu_to_apicid_init[i];
613
616 if (apicid == BAD_APICID) 614 if (apicid == BAD_APICID)
617 continue; 615 continue;
618 if (apicid_to_node[apicid] == NUMA_NO_NODE) 616 if (apicid_to_node[apicid] == NUMA_NO_NODE)
619 continue; 617 continue;
620 numa_set_node(i,apicid_to_node[apicid]); 618 numa_set_node(i, apicid_to_node[apicid]);
621 } 619 }
622} 620}
623 621
624EXPORT_SYMBOL(cpu_to_node);
625EXPORT_SYMBOL(node_to_cpumask);
626EXPORT_SYMBOL(memnode);
627EXPORT_SYMBOL(node_data);
628
629#ifdef CONFIG_DISCONTIGMEM
630/*
631 * Functions to convert PFNs from/to per node page addresses.
632 * These are out of line because they are quite big.
633 * They could be all tuned by pre caching more state.
634 * Should do that.
635 */
636 622
637int pfn_valid(unsigned long pfn)
638{
639 unsigned nid;
640 if (pfn >= num_physpages)
641 return 0;
642 nid = pfn_to_nid(pfn);
643 if (nid == 0xff)
644 return 0;
645 return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid);
646}
647EXPORT_SYMBOL(pfn_valid);
648#endif
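
For readers following the memnodemap changes above: once compute_hash_shift()/populate_memnodemap() have run, translating a physical address to its node is a single table lookup. A sketch of that lookup (mirroring the existing phys_to_nid() helper used by early_pfn_to_nid() above; kernel context and the now-s16 map assumed):

#include <linux/types.h>

static inline int node_of_phys(unsigned long phys_addr,
			       const s16 *memnodemap, int memnode_shift)
{
	/* every 2^memnode_shift-byte chunk of RAM belongs to exactly one node */
	return memnodemap[phys_addr >> memnode_shift];
}
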
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
new file mode 100644
index 000000000000..06353d43f72e
--- /dev/null
+++ b/arch/x86/mm/pageattr-test.c
@@ -0,0 +1,224 @@
1/*
2 * self test for change_page_attr.
3 *
4 * Clears the global bit on random pages in the direct mapping, then reverts
5 * and compares page tables forwards and afterwards.
6 */
7#include <linux/bootmem.h>
8#include <linux/random.h>
9#include <linux/kernel.h>
10#include <linux/init.h>
11#include <linux/mm.h>
12
13#include <asm/cacheflush.h>
14#include <asm/pgtable.h>
15#include <asm/kdebug.h>
16
17enum {
18 NTEST = 4000,
19#ifdef CONFIG_X86_64
20 LPS = (1 << PMD_SHIFT),
21#elif defined(CONFIG_X86_PAE)
22 LPS = (1 << PMD_SHIFT),
23#else
24 LPS = (1 << 22),
25#endif
26 GPS = (1<<30)
27};
28
29struct split_state {
30 long lpg, gpg, spg, exec;
31 long min_exec, max_exec;
32};
33
34static __init int print_split(struct split_state *s)
35{
36 long i, expected, missed = 0;
37 int printed = 0;
38 int err = 0;
39
40 s->lpg = s->gpg = s->spg = s->exec = 0;
41 s->min_exec = ~0UL;
42 s->max_exec = 0;
43 for (i = 0; i < max_pfn_mapped; ) {
44 unsigned long addr = (unsigned long)__va(i << PAGE_SHIFT);
45 int level;
46 pte_t *pte;
47
48 pte = lookup_address(addr, &level);
49 if (!pte) {
50 if (!printed) {
51 dump_pagetable(addr);
52 printk(KERN_INFO "CPA %lx no pte level %d\n",
53 addr, level);
54 printed = 1;
55 }
56 missed++;
57 i++;
58 continue;
59 }
60
61 if (level == PG_LEVEL_1G && sizeof(long) == 8) {
62 s->gpg++;
63 i += GPS/PAGE_SIZE;
64 } else if (level == PG_LEVEL_2M) {
65 if (!(pte_val(*pte) & _PAGE_PSE)) {
66 printk(KERN_ERR
67 "%lx level %d but not PSE %Lx\n",
68 addr, level, (u64)pte_val(*pte));
69 err = 1;
70 }
71 s->lpg++;
72 i += LPS/PAGE_SIZE;
73 } else {
74 s->spg++;
75 i++;
76 }
77 if (!(pte_val(*pte) & _PAGE_NX)) {
78 s->exec++;
79 if (addr < s->min_exec)
80 s->min_exec = addr;
81 if (addr > s->max_exec)
82 s->max_exec = addr;
83 }
84 }
85 printk(KERN_INFO
86 "CPA mapping 4k %lu large %lu gb %lu x %lu[%lx-%lx] miss %lu\n",
87 s->spg, s->lpg, s->gpg, s->exec,
88 s->min_exec != ~0UL ? s->min_exec : 0, s->max_exec, missed);
89
90 expected = (s->gpg*GPS + s->lpg*LPS)/PAGE_SIZE + s->spg + missed;
91 if (expected != i) {
92 printk(KERN_ERR "CPA max_pfn_mapped %lu but expected %lu\n",
93 max_pfn_mapped, expected);
94 return 1;
95 }
96 return err;
97}
98
99static unsigned long __initdata addr[NTEST];
100static unsigned int __initdata len[NTEST];
101
102/* Change the global bit on random pages in the direct mapping */
103static __init int exercise_pageattr(void)
104{
105 struct split_state sa, sb, sc;
106 unsigned long *bm;
107 pte_t *pte, pte0;
108 int failed = 0;
109 int level;
110 int i, k;
111 int err;
112
113 printk(KERN_INFO "CPA exercising pageattr\n");
114
115 bm = vmalloc((max_pfn_mapped + 7) / 8);
116 if (!bm) {
117 printk(KERN_ERR "CPA Cannot vmalloc bitmap\n");
118 return -ENOMEM;
119 }
120 memset(bm, 0, (max_pfn_mapped + 7) / 8);
121
122 failed += print_split(&sa);
123 srandom32(100);
124
125 for (i = 0; i < NTEST; i++) {
126 unsigned long pfn = random32() % max_pfn_mapped;
127
128 addr[i] = (unsigned long)__va(pfn << PAGE_SHIFT);
129 len[i] = random32() % 100;
130 len[i] = min_t(unsigned long, len[i], max_pfn_mapped - pfn - 1);
131
132 if (len[i] == 0)
133 len[i] = 1;
134
135 pte = NULL;
136 pte0 = pfn_pte(0, __pgprot(0)); /* shut gcc up */
137
138 for (k = 0; k < len[i]; k++) {
139 pte = lookup_address(addr[i] + k*PAGE_SIZE, &level);
140 if (!pte || pgprot_val(pte_pgprot(*pte)) == 0) {
141 addr[i] = 0;
142 break;
143 }
144 if (k == 0) {
145 pte0 = *pte;
146 } else {
147 if (pgprot_val(pte_pgprot(*pte)) !=
148 pgprot_val(pte_pgprot(pte0))) {
149 len[i] = k;
150 break;
151 }
152 }
153 if (test_bit(pfn + k, bm)) {
154 len[i] = k;
155 break;
156 }
157 __set_bit(pfn + k, bm);
158 }
159 if (!addr[i] || !pte || !k) {
160 addr[i] = 0;
161 continue;
162 }
163
164 err = change_page_attr_clear(addr[i], len[i],
165 __pgprot(_PAGE_GLOBAL));
166 if (err < 0) {
167 printk(KERN_ERR "CPA %d failed %d\n", i, err);
168 failed++;
169 }
170
171 pte = lookup_address(addr[i], &level);
172 if (!pte || pte_global(*pte) || pte_huge(*pte)) {
173 printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i],
174 pte ? (u64)pte_val(*pte) : 0ULL);
175 failed++;
176 }
177 if (level != PG_LEVEL_4K) {
178 printk(KERN_ERR "CPA %lx: unexpected level %d\n",
179 addr[i], level);
180 failed++;
181 }
182
183 }
184 vfree(bm);
185
186 failed += print_split(&sb);
187
188 printk(KERN_INFO "CPA reverting everything\n");
189 for (i = 0; i < NTEST; i++) {
190 if (!addr[i])
191 continue;
192 pte = lookup_address(addr[i], &level);
193 if (!pte) {
194 printk(KERN_ERR "CPA lookup of %lx failed\n", addr[i]);
195 failed++;
196 continue;
197 }
198 err = change_page_attr_set(addr[i], len[i],
199 __pgprot(_PAGE_GLOBAL));
200 if (err < 0) {
201 printk(KERN_ERR "CPA reverting failed: %d\n", err);
202 failed++;
203 }
204 pte = lookup_address(addr[i], &level);
205 if (!pte || !pte_global(*pte)) {
206 printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n",
207 addr[i], pte ? (u64)pte_val(*pte) : 0ULL);
208 failed++;
209 }
210
211 }
212
213 failed += print_split(&sc);
214
215 if (failed) {
216 printk(KERN_ERR "CPA selftests NOT PASSED. Please report.\n");
217 WARN_ON(1);
218 } else {
219 printk(KERN_INFO "CPA selftests PASSED\n");
220 }
221
222 return 0;
223}
224module_init(exercise_pageattr);
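
As a usage sketch (hypothetical, not from the patch): the set_memory_*() helpers introduced by the pageattr.c rewrite below, and indirectly exercised by the selftest above via change_page_attr_set/clear, operate on page-aligned kernel virtual ranges, for example to make a buffer uncached while a device scans it and then restore write-back caching:

static int demo_make_buffer_uncached(void *buf, int numpages)
{
	int err;

	err = set_memory_uc((unsigned long)buf, numpages);	/* sets _PAGE_PCD | _PAGE_PWT */
	if (err)
		return err;

	/* ... hand the (hypothetical) buffer to the device here ... */

	return set_memory_wb((unsigned long)buf, numpages);	/* clear PCD|PWT again */
}
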
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
new file mode 100644
index 000000000000..1cc6607eacb0
--- /dev/null
+++ b/arch/x86/mm/pageattr.c
@@ -0,0 +1,564 @@
1/*
2 * Copyright 2002 Andi Kleen, SuSE Labs.
3 * Thanks to Ben LaHaise for precious feedback.
4 */
5#include <linux/highmem.h>
6#include <linux/bootmem.h>
7#include <linux/module.h>
8#include <linux/sched.h>
9#include <linux/slab.h>
10#include <linux/mm.h>
11
12#include <asm/e820.h>
13#include <asm/processor.h>
14#include <asm/tlbflush.h>
15#include <asm/sections.h>
16#include <asm/uaccess.h>
17#include <asm/pgalloc.h>
18
19static inline int
20within(unsigned long addr, unsigned long start, unsigned long end)
21{
22 return addr >= start && addr < end;
23}
24
25/*
26 * Flushing functions
27 */
28
29/**
30 * clflush_cache_range - flush a cache range with clflush
31 * @addr: virtual start address
32 * @size: number of bytes to flush
33 *
34 * clflush is an unordered instruction which needs fencing with mfence
35 * to avoid ordering issues.
36 */
37void clflush_cache_range(void *vaddr, unsigned int size)
38{
39 void *vend = vaddr + size - 1;
40
41 mb();
42
43 for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
44 clflush(vaddr);
45 /*
46 * Flush any possible final partial cacheline:
47 */
48 clflush(vend);
49
50 mb();
51}
52
53static void __cpa_flush_all(void *arg)
54{
55 /*
56 * Flush all to work around Errata in early athlons regarding
57 * large page flushing.
58 */
59 __flush_tlb_all();
60
61 if (boot_cpu_data.x86_model >= 4)
62 wbinvd();
63}
64
65static void cpa_flush_all(void)
66{
67 BUG_ON(irqs_disabled());
68
69 on_each_cpu(__cpa_flush_all, NULL, 1, 1);
70}
71
72static void __cpa_flush_range(void *arg)
73{
74 /*
75 * We could optimize that further and do individual per page
76 * tlb invalidates for a low number of pages. Caveat: we must
77 * flush the high aliases on 64bit as well.
78 */
79 __flush_tlb_all();
80}
81
82static void cpa_flush_range(unsigned long start, int numpages)
83{
84 unsigned int i, level;
85 unsigned long addr;
86
87 BUG_ON(irqs_disabled());
88 WARN_ON(PAGE_ALIGN(start) != start);
89
90 on_each_cpu(__cpa_flush_range, NULL, 1, 1);
91
92 /*
93 * We only need to flush on one CPU,
94 * clflush is a MESI-coherent instruction that
95 * will cause all other CPUs to flush the same
96 * cachelines:
97 */
98 for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
99 pte_t *pte = lookup_address(addr, &level);
100
101 /*
102 * Only flush present addresses:
103 */
104 if (pte && pte_present(*pte))
105 clflush_cache_range((void *) addr, PAGE_SIZE);
106 }
107}
108
109/*
110 * Certain areas of memory on x86 require very specific protection flags,
111 * for example the BIOS area or kernel text. Callers don't always get this
112 * right (again, ioremap() on BIOS memory is not uncommon) so this function
113 * checks and fixes these known static required protection bits.
114 */
115static inline pgprot_t static_protections(pgprot_t prot, unsigned long address)
116{
117 pgprot_t forbidden = __pgprot(0);
118
119 /*
120 * The BIOS area between 640k and 1Mb needs to be executable for
121 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
122 */
123 if (within(__pa(address), BIOS_BEGIN, BIOS_END))
124 pgprot_val(forbidden) |= _PAGE_NX;
125
126 /*
127 * The kernel text needs to be executable for obvious reasons
128 * Does not cover __inittext since that is gone later on
129 */
130 if (within(address, (unsigned long)_text, (unsigned long)_etext))
131 pgprot_val(forbidden) |= _PAGE_NX;
132
133#ifdef CONFIG_DEBUG_RODATA
134 /* The .rodata section needs to be read-only */
135 if (within(address, (unsigned long)__start_rodata,
136 (unsigned long)__end_rodata))
137 pgprot_val(forbidden) |= _PAGE_RW;
138#endif
139
140 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
141
142 return prot;
143}
144
145pte_t *lookup_address(unsigned long address, int *level)
146{
147 pgd_t *pgd = pgd_offset_k(address);
148 pud_t *pud;
149 pmd_t *pmd;
150
151 *level = PG_LEVEL_NONE;
152
153 if (pgd_none(*pgd))
154 return NULL;
155 pud = pud_offset(pgd, address);
156 if (pud_none(*pud))
157 return NULL;
158 pmd = pmd_offset(pud, address);
159 if (pmd_none(*pmd))
160 return NULL;
161
162 *level = PG_LEVEL_2M;
163 if (pmd_large(*pmd))
164 return (pte_t *)pmd;
165
166 *level = PG_LEVEL_4K;
167 return pte_offset_kernel(pmd, address);
168}
169
170static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
171{
172 /* change init_mm */
173 set_pte_atomic(kpte, pte);
174#ifdef CONFIG_X86_32
175 if (!SHARED_KERNEL_PMD) {
176 struct page *page;
177
178 list_for_each_entry(page, &pgd_list, lru) {
179 pgd_t *pgd;
180 pud_t *pud;
181 pmd_t *pmd;
182
183 pgd = (pgd_t *)page_address(page) + pgd_index(address);
184 pud = pud_offset(pgd, address);
185 pmd = pmd_offset(pud, address);
186 set_pte_atomic((pte_t *)pmd, pte);
187 }
188 }
189#endif
190}
191
192static int split_large_page(pte_t *kpte, unsigned long address)
193{
194 pgprot_t ref_prot = pte_pgprot(pte_clrhuge(*kpte));
195 gfp_t gfp_flags = GFP_KERNEL;
196 unsigned long flags;
197 unsigned long addr;
198 pte_t *pbase, *tmp;
199 struct page *base;
200 unsigned int i, level;
201
202#ifdef CONFIG_DEBUG_PAGEALLOC
203 gfp_flags = __GFP_HIGH | __GFP_NOFAIL | __GFP_NOWARN;
204 gfp_flags = GFP_ATOMIC | __GFP_NOWARN;
205#endif
206 base = alloc_pages(gfp_flags, 0);
207 if (!base)
208 return -ENOMEM;
209
210 spin_lock_irqsave(&pgd_lock, flags);
211 /*
212 * Check for races, another CPU might have split this page
213 * up for us already:
214 */
215 tmp = lookup_address(address, &level);
216 if (tmp != kpte) {
217 WARN_ON_ONCE(1);
218 goto out_unlock;
219 }
220
221 address = __pa(address);
222 addr = address & LARGE_PAGE_MASK;
223 pbase = (pte_t *)page_address(base);
224#ifdef CONFIG_X86_32
225 paravirt_alloc_pt(&init_mm, page_to_pfn(base));
226#endif
227
228 pgprot_val(ref_prot) &= ~_PAGE_NX;
229 for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE)
230 set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, ref_prot));
231
232 /*
233 * Install the new, split up pagetable. Important detail here:
234 *
235 * On Intel the NX bit of all levels must be cleared to make a
236 * page executable. See section 4.13.2 of Intel 64 and IA-32
237 * Architectures Software Developer's Manual).
238 */
239 ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte)));
240 __set_pmd_pte(kpte, address, mk_pte(base, ref_prot));
241 base = NULL;
242
243out_unlock:
244 spin_unlock_irqrestore(&pgd_lock, flags);
245
246 if (base)
247 __free_pages(base, 0);
248
249 return 0;
250}
251
252static int
253__change_page_attr(unsigned long address, unsigned long pfn,
254 pgprot_t mask_set, pgprot_t mask_clr)
255{
256 struct page *kpte_page;
257 int level, err = 0;
258 pte_t *kpte;
259
260#ifdef CONFIG_X86_32
261 BUG_ON(pfn > max_low_pfn);
262#endif
263
264repeat:
265 kpte = lookup_address(address, &level);
266 if (!kpte)
267 return -EINVAL;
268
269 kpte_page = virt_to_page(kpte);
270 BUG_ON(PageLRU(kpte_page));
271 BUG_ON(PageCompound(kpte_page));
272
273 if (level == PG_LEVEL_4K) {
274 pgprot_t new_prot = pte_pgprot(*kpte);
275 pte_t new_pte, old_pte = *kpte;
276
277 pgprot_val(new_prot) &= ~pgprot_val(mask_clr);
278 pgprot_val(new_prot) |= pgprot_val(mask_set);
279
280 new_prot = static_protections(new_prot, address);
281
282 new_pte = pfn_pte(pfn, canon_pgprot(new_prot));
283 BUG_ON(pte_pfn(new_pte) != pte_pfn(old_pte));
284
285 set_pte_atomic(kpte, new_pte);
286 } else {
287 err = split_large_page(kpte, address);
288 if (!err)
289 goto repeat;
290 }
291 return err;
292}
293
294/**
295 * change_page_attr_addr - Change page table attributes in linear mapping
296 * @address: Virtual address in linear mapping.
297 * @prot: New page table attribute (PAGE_*)
298 *
299 * Change page attributes of a page in the direct mapping. This is a variant
300 * of change_page_attr() that also works on memory holes that do not have
301 * a mem_map entry (pfn_valid() is false).
302 *
303 * See change_page_attr() documentation for more details.
304 *
305 * Modules and drivers should use the set_memory_* APIs instead.
306 */
307
308#define HIGH_MAP_START __START_KERNEL_map
309#define HIGH_MAP_END (__START_KERNEL_map + KERNEL_TEXT_SIZE)
310
311static int
312change_page_attr_addr(unsigned long address, pgprot_t mask_set,
313 pgprot_t mask_clr)
314{
315 unsigned long phys_addr = __pa(address);
316 unsigned long pfn = phys_addr >> PAGE_SHIFT;
317 int err;
318
319#ifdef CONFIG_X86_64
320 /*
321 * If we are inside the high mapped kernel range, then we
322 * fixup the low mapping first. __va() returns the virtual
323 * address in the linear mapping:
324 */
325 if (within(address, HIGH_MAP_START, HIGH_MAP_END))
326 address = (unsigned long) __va(phys_addr);
327#endif
328
329 err = __change_page_attr(address, pfn, mask_set, mask_clr);
330 if (err)
331 return err;
332
333#ifdef CONFIG_X86_64
334 /*
335 * If the physical address is inside the kernel map, we need
336 * to touch the high mapped kernel as well:
337 */
338 if (within(phys_addr, 0, KERNEL_TEXT_SIZE)) {
339 /*
340 * Calc the high mapping address. See __phys_addr()
341 * for the non-obvious details.
342 */
343 address = phys_addr + HIGH_MAP_START - phys_base;
344 /* Make sure the kernel mappings stay executable */
345 pgprot_val(mask_clr) |= _PAGE_NX;
346
347 /*
348 * Our high aliases are imprecise, because we check
349 * everything between 0 and KERNEL_TEXT_SIZE, so do
350 * not propagate lookup failures back to users:
351 */
352 __change_page_attr(address, pfn, mask_set, mask_clr);
353 }
354#endif
355 return err;
356}
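/*
 * Illustrative sketch only (not part of the patch): how the high kernel-text
 * alias address is derived above on x86_64. "example_pa" is a hypothetical
 * physical address inside the kernel image; phys_base is the kernel's
 * physical load address.
 */
static unsigned long high_alias_sketch(unsigned long example_pa)
{
	/*
	 * The linear mapping of the page is at __va(example_pa); the high
	 * mapping of the same page is at __START_KERNEL_map plus the page's
	 * offset from the kernel load address:
	 */
	return example_pa + HIGH_MAP_START - phys_base;
}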
357
358static int __change_page_attr_set_clr(unsigned long addr, int numpages,
359 pgprot_t mask_set, pgprot_t mask_clr)
360{
361 unsigned int i;
362 int ret;
363
364 for (i = 0; i < numpages ; i++, addr += PAGE_SIZE) {
365 ret = change_page_attr_addr(addr, mask_set, mask_clr);
366 if (ret)
367 return ret;
368 }
369
370 return 0;
371}
372
373static int change_page_attr_set_clr(unsigned long addr, int numpages,
374 pgprot_t mask_set, pgprot_t mask_clr)
375{
376 int ret = __change_page_attr_set_clr(addr, numpages, mask_set,
377 mask_clr);
378
379 /*
380 * On success we use clflush, when the CPU supports it, to
381 * avoid the wbinvd. If the CPU does not support it, and in the
382 * error case, we fall back to cpa_flush_all() (which uses
383 * wbinvd):
384 */
385 if (!ret && cpu_has_clflush)
386 cpa_flush_range(addr, numpages);
387 else
388 cpa_flush_all();
389
390 return ret;
391}
392
393static inline int change_page_attr_set(unsigned long addr, int numpages,
394 pgprot_t mask)
395{
396 return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
397}
398
399static inline int change_page_attr_clear(unsigned long addr, int numpages,
400 pgprot_t mask)
401{
402 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
403
404}
405
406int set_memory_uc(unsigned long addr, int numpages)
407{
408 return change_page_attr_set(addr, numpages,
409 __pgprot(_PAGE_PCD | _PAGE_PWT));
410}
411EXPORT_SYMBOL(set_memory_uc);
412
413int set_memory_wb(unsigned long addr, int numpages)
414{
415 return change_page_attr_clear(addr, numpages,
416 __pgprot(_PAGE_PCD | _PAGE_PWT));
417}
418EXPORT_SYMBOL(set_memory_wb);
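/*
 * Illustrative usage sketch only (hypothetical driver code, not part of the
 * patch): a buffer in the kernel linear mapping is switched to uncached for
 * device use and restored to write-back before it is freed.
 */
static int example_uncached_buffer(void)
{
	unsigned long buf = __get_free_pages(GFP_KERNEL, 2);	/* 4 pages */
	int ret;

	if (!buf)
		return -ENOMEM;

	ret = set_memory_uc(buf, 4);
	if (ret)
		goto out_free;

	/* ... hand the buffer to the (hypothetical) device here ... */

	ret = set_memory_wb(buf, 4);	/* restore caching before freeing */
out_free:
	free_pages(buf, 2);
	return ret;
}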
419
420int set_memory_x(unsigned long addr, int numpages)
421{
422 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX));
423}
424EXPORT_SYMBOL(set_memory_x);
425
426int set_memory_nx(unsigned long addr, int numpages)
427{
428 return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX));
429}
430EXPORT_SYMBOL(set_memory_nx);
431
432int set_memory_ro(unsigned long addr, int numpages)
433{
434 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW));
435}
436
437int set_memory_rw(unsigned long addr, int numpages)
438{
439 return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW));
440}
441
442int set_memory_np(unsigned long addr, int numpages)
443{
444 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT));
445}
446
447int set_pages_uc(struct page *page, int numpages)
448{
449 unsigned long addr = (unsigned long)page_address(page);
450
451 return set_memory_uc(addr, numpages);
452}
453EXPORT_SYMBOL(set_pages_uc);
454
455int set_pages_wb(struct page *page, int numpages)
456{
457 unsigned long addr = (unsigned long)page_address(page);
458
459 return set_memory_wb(addr, numpages);
460}
461EXPORT_SYMBOL(set_pages_wb);
462
463int set_pages_x(struct page *page, int numpages)
464{
465 unsigned long addr = (unsigned long)page_address(page);
466
467 return set_memory_x(addr, numpages);
468}
469EXPORT_SYMBOL(set_pages_x);
470
471int set_pages_nx(struct page *page, int numpages)
472{
473 unsigned long addr = (unsigned long)page_address(page);
474
475 return set_memory_nx(addr, numpages);
476}
477EXPORT_SYMBOL(set_pages_nx);
478
479int set_pages_ro(struct page *page, int numpages)
480{
481 unsigned long addr = (unsigned long)page_address(page);
482
483 return set_memory_ro(addr, numpages);
484}
485
486int set_pages_rw(struct page *page, int numpages)
487{
488 unsigned long addr = (unsigned long)page_address(page);
489
490 return set_memory_rw(addr, numpages);
491}
492
493
494#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_CPA_DEBUG)
495static inline int __change_page_attr_set(unsigned long addr, int numpages,
496 pgprot_t mask)
497{
498 return __change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
499}
500
501static inline int __change_page_attr_clear(unsigned long addr, int numpages,
502 pgprot_t mask)
503{
504 return __change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
505}
506#endif
507
508#ifdef CONFIG_DEBUG_PAGEALLOC
509
510static int __set_pages_p(struct page *page, int numpages)
511{
512 unsigned long addr = (unsigned long)page_address(page);
513
514 return __change_page_attr_set(addr, numpages,
515 __pgprot(_PAGE_PRESENT | _PAGE_RW));
516}
517
518static int __set_pages_np(struct page *page, int numpages)
519{
520 unsigned long addr = (unsigned long)page_address(page);
521
522 return __change_page_attr_clear(addr, numpages,
523 __pgprot(_PAGE_PRESENT));
524}
525
526void kernel_map_pages(struct page *page, int numpages, int enable)
527{
528 if (PageHighMem(page))
529 return;
530 if (!enable) {
531 debug_check_no_locks_freed(page_address(page),
532 numpages * PAGE_SIZE);
533 }
534
535 /*
536 * If the page allocator is not up yet, do not call c_p_a():
537 */
538 if (!debug_pagealloc_enabled)
539 return;
540
541 /*
542 * The return value is ignored - the calls cannot fail,
543 * large pages are disabled at boot time:
544 */
545 if (enable)
546 __set_pages_p(page, numpages);
547 else
548 __set_pages_np(page, numpages);
549
550 /*
551 * We should perform an IPI and flush all tlbs,
552 * but that can deadlock, so we flush only the current CPU:
553 */
554 __flush_tlb_all();
555}
556#endif
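/*
 * Illustrative sketch only: how the page allocator is expected to drive
 * kernel_map_pages() under CONFIG_DEBUG_PAGEALLOC. The real call sites
 * live in mm/page_alloc.c; this stand-in is hypothetical.
 */
static void debug_pagealloc_hooks_sketch(struct page *page, int order)
{
	/* on free: unmap the pages so any stale user faults immediately */
	kernel_map_pages(page, 1 << order, 0);

	/* on allocation: map the pages back before handing them out */
	kernel_map_pages(page, 1 << order, 1);
}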
557
558/*
559 * The testcases use internal knowledge of the implementation that shouldn't
560 * be exposed to the rest of the kernel. Include these directly here.
561 */
562#ifdef CONFIG_CPA_DEBUG
563#include "pageattr-test.c"
564#endif
diff --git a/arch/x86/mm/pageattr_32.c b/arch/x86/mm/pageattr_32.c
deleted file mode 100644
index 260073c07600..000000000000
--- a/arch/x86/mm/pageattr_32.c
+++ /dev/null
@@ -1,278 +0,0 @@
1/*
2 * Copyright 2002 Andi Kleen, SuSE Labs.
3 * Thanks to Ben LaHaise for precious feedback.
4 */
5
6#include <linux/mm.h>
7#include <linux/sched.h>
8#include <linux/highmem.h>
9#include <linux/module.h>
10#include <linux/slab.h>
11#include <asm/uaccess.h>
12#include <asm/processor.h>
13#include <asm/tlbflush.h>
14#include <asm/pgalloc.h>
15#include <asm/sections.h>
16
17static DEFINE_SPINLOCK(cpa_lock);
18static struct list_head df_list = LIST_HEAD_INIT(df_list);
19
20
21pte_t *lookup_address(unsigned long address)
22{
23 pgd_t *pgd = pgd_offset_k(address);
24 pud_t *pud;
25 pmd_t *pmd;
26 if (pgd_none(*pgd))
27 return NULL;
28 pud = pud_offset(pgd, address);
29 if (pud_none(*pud))
30 return NULL;
31 pmd = pmd_offset(pud, address);
32 if (pmd_none(*pmd))
33 return NULL;
34 if (pmd_large(*pmd))
35 return (pte_t *)pmd;
36 return pte_offset_kernel(pmd, address);
37}
38
39static struct page *split_large_page(unsigned long address, pgprot_t prot,
40 pgprot_t ref_prot)
41{
42 int i;
43 unsigned long addr;
44 struct page *base;
45 pte_t *pbase;
46
47 spin_unlock_irq(&cpa_lock);
48 base = alloc_pages(GFP_KERNEL, 0);
49 spin_lock_irq(&cpa_lock);
50 if (!base)
51 return NULL;
52
53 /*
54 * page_private is used to track the number of entries in
55 * the page table page that have non-standard attributes.
56 */
57 SetPagePrivate(base);
58 page_private(base) = 0;
59
60 address = __pa(address);
61 addr = address & LARGE_PAGE_MASK;
62 pbase = (pte_t *)page_address(base);
63 paravirt_alloc_pt(&init_mm, page_to_pfn(base));
64 for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
65 set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT,
66 addr == address ? prot : ref_prot));
67 }
68 return base;
69}
70
71static void cache_flush_page(struct page *p)
72{
73 void *adr = page_address(p);
74 int i;
75 for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
76 clflush(adr+i);
77}
78
79static void flush_kernel_map(void *arg)
80{
81 struct list_head *lh = (struct list_head *)arg;
82 struct page *p;
83
84 /* High level code is not ready for clflush yet */
85 if (0 && cpu_has_clflush) {
86 list_for_each_entry (p, lh, lru)
87 cache_flush_page(p);
88 } else if (boot_cpu_data.x86_model >= 4)
89 wbinvd();
90
91 /* Flush all to work around errata in early Athlons regarding
92 * large page flushing.
93 */
94 __flush_tlb_all();
95}
96
97static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
98{
99 struct page *page;
100 unsigned long flags;
101
102 set_pte_atomic(kpte, pte); /* change init_mm */
103 if (SHARED_KERNEL_PMD)
104 return;
105
106 spin_lock_irqsave(&pgd_lock, flags);
107 for (page = pgd_list; page; page = (struct page *)page->index) {
108 pgd_t *pgd;
109 pud_t *pud;
110 pmd_t *pmd;
111 pgd = (pgd_t *)page_address(page) + pgd_index(address);
112 pud = pud_offset(pgd, address);
113 pmd = pmd_offset(pud, address);
114 set_pte_atomic((pte_t *)pmd, pte);
115 }
116 spin_unlock_irqrestore(&pgd_lock, flags);
117}
118
119/*
120 * No more special protections in this 2/4MB area - revert to a
121 * large page again.
122 */
123static inline void revert_page(struct page *kpte_page, unsigned long address)
124{
125 pgprot_t ref_prot;
126 pte_t *linear;
127
128 ref_prot =
129 ((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
130 ? PAGE_KERNEL_LARGE_EXEC : PAGE_KERNEL_LARGE;
131
132 linear = (pte_t *)
133 pmd_offset(pud_offset(pgd_offset_k(address), address), address);
134 set_pmd_pte(linear, address,
135 pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT,
136 ref_prot));
137}
138
139static inline void save_page(struct page *kpte_page)
140{
141 if (!test_and_set_bit(PG_arch_1, &kpte_page->flags))
142 list_add(&kpte_page->lru, &df_list);
143}
144
145static int
146__change_page_attr(struct page *page, pgprot_t prot)
147{
148 pte_t *kpte;
149 unsigned long address;
150 struct page *kpte_page;
151
152 BUG_ON(PageHighMem(page));
153 address = (unsigned long)page_address(page);
154
155 kpte = lookup_address(address);
156 if (!kpte)
157 return -EINVAL;
158 kpte_page = virt_to_page(kpte);
159 BUG_ON(PageLRU(kpte_page));
160 BUG_ON(PageCompound(kpte_page));
161
162 if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) {
163 if (!pte_huge(*kpte)) {
164 set_pte_atomic(kpte, mk_pte(page, prot));
165 } else {
166 pgprot_t ref_prot;
167 struct page *split;
168
169 ref_prot =
170 ((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
171 ? PAGE_KERNEL_EXEC : PAGE_KERNEL;
172 split = split_large_page(address, prot, ref_prot);
173 if (!split)
174 return -ENOMEM;
175 set_pmd_pte(kpte,address,mk_pte(split, ref_prot));
176 kpte_page = split;
177 }
178 page_private(kpte_page)++;
179 } else if (!pte_huge(*kpte)) {
180 set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL));
181 BUG_ON(page_private(kpte_page) == 0);
182 page_private(kpte_page)--;
183 } else
184 BUG();
185
186 /*
187 * If the pte was reserved, it means it was created at boot
188 * time (not via split_large_page) and in turn we must not
189 * replace it with a largepage.
190 */
191
192 save_page(kpte_page);
193 if (!PageReserved(kpte_page)) {
194 if (cpu_has_pse && (page_private(kpte_page) == 0)) {
195 paravirt_release_pt(page_to_pfn(kpte_page));
196 revert_page(kpte_page, address);
197 }
198 }
199 return 0;
200}
201
202static inline void flush_map(struct list_head *l)
203{
204 on_each_cpu(flush_kernel_map, l, 1, 1);
205}
206
207/*
208 * Change the page attributes of an page in the linear mapping.
209 *
210 * This should be used when a page is mapped with a different caching policy
211 * than write-back somewhere - some CPUs do not like it when mappings with
212 * different caching policies exist. This changes the page attributes of the
213 * in kernel linear mapping too.
214 *
215 * The caller needs to ensure that there are no conflicting mappings elsewhere.
216 * This function only deals with the kernel linear map.
217 *
218 * Caller must call global_flush_tlb() after this.
219 */
220int change_page_attr(struct page *page, int numpages, pgprot_t prot)
221{
222 int err = 0;
223 int i;
224 unsigned long flags;
225
226 spin_lock_irqsave(&cpa_lock, flags);
227 for (i = 0; i < numpages; i++, page++) {
228 err = __change_page_attr(page, prot);
229 if (err)
230 break;
231 }
232 spin_unlock_irqrestore(&cpa_lock, flags);
233 return err;
234}
235
236void global_flush_tlb(void)
237{
238 struct list_head l;
239 struct page *pg, *next;
240
241 BUG_ON(irqs_disabled());
242
243 spin_lock_irq(&cpa_lock);
244 list_replace_init(&df_list, &l);
245 spin_unlock_irq(&cpa_lock);
246 flush_map(&l);
247 list_for_each_entry_safe(pg, next, &l, lru) {
248 list_del(&pg->lru);
249 clear_bit(PG_arch_1, &pg->flags);
250 if (PageReserved(pg) || !cpu_has_pse || page_private(pg) != 0)
251 continue;
252 ClearPagePrivate(pg);
253 __free_page(pg);
254 }
255}
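/*
 * Illustrative sketch only: the calling convention of this old 32-bit API.
 * Attribute changes were batched and became globally visible only after
 * global_flush_tlb(); the page argument is hypothetical.
 */
static void old_cpa_usage_sketch(struct page *page)
{
	change_page_attr(page, 1, PAGE_KERNEL_NOCACHE);
	global_flush_tlb();		/* mandatory follow-up flush */

	change_page_attr(page, 1, PAGE_KERNEL);
	global_flush_tlb();
}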
256
257#ifdef CONFIG_DEBUG_PAGEALLOC
258void kernel_map_pages(struct page *page, int numpages, int enable)
259{
260 if (PageHighMem(page))
261 return;
262 if (!enable)
263 debug_check_no_locks_freed(page_address(page),
264 numpages * PAGE_SIZE);
265
266 /* the return value is ignored - the calls cannot fail,
267 * large pages are disabled at boot time.
268 */
269 change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0));
270 /* we should perform an IPI and flush all tlbs,
271 * but that can deadlock, so we flush only the current CPU.
272 */
273 __flush_tlb_all();
274}
275#endif
276
277EXPORT_SYMBOL(change_page_attr);
278EXPORT_SYMBOL(global_flush_tlb);
diff --git a/arch/x86/mm/pageattr_64.c b/arch/x86/mm/pageattr_64.c
deleted file mode 100644
index c40afbaaf93d..000000000000
--- a/arch/x86/mm/pageattr_64.c
+++ /dev/null
@@ -1,255 +0,0 @@
1/*
2 * Copyright 2002 Andi Kleen, SuSE Labs.
3 * Thanks to Ben LaHaise for precious feedback.
4 */
5
6#include <linux/mm.h>
7#include <linux/sched.h>
8#include <linux/highmem.h>
9#include <linux/module.h>
10#include <linux/slab.h>
11#include <asm/uaccess.h>
12#include <asm/processor.h>
13#include <asm/tlbflush.h>
14#include <asm/io.h>
15
16pte_t *lookup_address(unsigned long address)
17{
18 pgd_t *pgd = pgd_offset_k(address);
19 pud_t *pud;
20 pmd_t *pmd;
21 pte_t *pte;
22 if (pgd_none(*pgd))
23 return NULL;
24 pud = pud_offset(pgd, address);
25 if (!pud_present(*pud))
26 return NULL;
27 pmd = pmd_offset(pud, address);
28 if (!pmd_present(*pmd))
29 return NULL;
30 if (pmd_large(*pmd))
31 return (pte_t *)pmd;
32 pte = pte_offset_kernel(pmd, address);
33 if (pte && !pte_present(*pte))
34 pte = NULL;
35 return pte;
36}
37
38static struct page *split_large_page(unsigned long address, pgprot_t prot,
39 pgprot_t ref_prot)
40{
41 int i;
42 unsigned long addr;
43 struct page *base = alloc_pages(GFP_KERNEL, 0);
44 pte_t *pbase;
45 if (!base)
46 return NULL;
47 /*
48 * page_private is used to track the number of entries in
49 * the page table page that have non-standard attributes.
50 */
51 SetPagePrivate(base);
52 page_private(base) = 0;
53
54 address = __pa(address);
55 addr = address & LARGE_PAGE_MASK;
56 pbase = (pte_t *)page_address(base);
57 for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
58 pbase[i] = pfn_pte(addr >> PAGE_SHIFT,
59 addr == address ? prot : ref_prot);
60 }
61 return base;
62}
63
64void clflush_cache_range(void *adr, int size)
65{
66 int i;
67 for (i = 0; i < size; i += boot_cpu_data.x86_clflush_size)
68 clflush(adr+i);
69}
70
71static void flush_kernel_map(void *arg)
72{
73 struct list_head *l = (struct list_head *)arg;
74 struct page *pg;
75
76 /* When clflush is available always use it because it is
77 much cheaper than WBINVD. */
78 /* clflush is still broken. Disable for now. */
79 if (1 || !cpu_has_clflush)
80 asm volatile("wbinvd" ::: "memory");
81 else list_for_each_entry(pg, l, lru) {
82 void *adr = page_address(pg);
83 clflush_cache_range(adr, PAGE_SIZE);
84 }
85 __flush_tlb_all();
86}
87
88static inline void flush_map(struct list_head *l)
89{
90 on_each_cpu(flush_kernel_map, l, 1, 1);
91}
92
93static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */
94
95static inline void save_page(struct page *fpage)
96{
97 if (!test_and_set_bit(PG_arch_1, &fpage->flags))
98 list_add(&fpage->lru, &deferred_pages);
99}
100
101/*
102 * No more special protections in this 2/4MB area - revert to a
103 * large page again.
104 */
105static void revert_page(unsigned long address, pgprot_t ref_prot)
106{
107 pgd_t *pgd;
108 pud_t *pud;
109 pmd_t *pmd;
110 pte_t large_pte;
111 unsigned long pfn;
112
113 pgd = pgd_offset_k(address);
114 BUG_ON(pgd_none(*pgd));
115 pud = pud_offset(pgd,address);
116 BUG_ON(pud_none(*pud));
117 pmd = pmd_offset(pud, address);
118 BUG_ON(pmd_val(*pmd) & _PAGE_PSE);
119 pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT;
120 large_pte = pfn_pte(pfn, ref_prot);
121 large_pte = pte_mkhuge(large_pte);
122 set_pte((pte_t *)pmd, large_pte);
123}
124
125static int
126__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
127 pgprot_t ref_prot)
128{
129 pte_t *kpte;
130 struct page *kpte_page;
131 pgprot_t ref_prot2;
132
133 kpte = lookup_address(address);
134 if (!kpte) return 0;
135 kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
136 BUG_ON(PageLRU(kpte_page));
137 BUG_ON(PageCompound(kpte_page));
138 if (pgprot_val(prot) != pgprot_val(ref_prot)) {
139 if (!pte_huge(*kpte)) {
140 set_pte(kpte, pfn_pte(pfn, prot));
141 } else {
142 /*
143 * split_large_page will take the reference for this
144 * change_page_attr on the split page.
145 */
146 struct page *split;
147 ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
148 split = split_large_page(address, prot, ref_prot2);
149 if (!split)
150 return -ENOMEM;
151 pgprot_val(ref_prot2) &= ~_PAGE_NX;
152 set_pte(kpte, mk_pte(split, ref_prot2));
153 kpte_page = split;
154 }
155 page_private(kpte_page)++;
156 } else if (!pte_huge(*kpte)) {
157 set_pte(kpte, pfn_pte(pfn, ref_prot));
158 BUG_ON(page_private(kpte_page) == 0);
159 page_private(kpte_page)--;
160 } else
161 BUG();
162
163 /* on x86-64 the direct mapping set at boot is not using 4k pages */
164 BUG_ON(PageReserved(kpte_page));
165
166 save_page(kpte_page);
167 if (page_private(kpte_page) == 0)
168 revert_page(address, ref_prot);
169 return 0;
170}
171
172/*
173 * Change the page attributes of an page in the linear mapping.
174 *
175 * This should be used when a page is mapped with a different caching policy
176 * than write-back somewhere - some CPUs do not like it when mappings with
177 * different caching policies exist. This changes the page attributes of the
178 * in kernel linear mapping too.
179 *
180 * The caller needs to ensure that there are no conflicting mappings elsewhere.
181 * This function only deals with the kernel linear map.
182 *
183 * Caller must call global_flush_tlb() after this.
184 */
185int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
186{
187 int err = 0, kernel_map = 0;
188 int i;
189
190 if (address >= __START_KERNEL_map
191 && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) {
192 address = (unsigned long)__va(__pa(address));
193 kernel_map = 1;
194 }
195
196 down_write(&init_mm.mmap_sem);
197 for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
198 unsigned long pfn = __pa(address) >> PAGE_SHIFT;
199
200 if (!kernel_map || pte_present(pfn_pte(0, prot))) {
201 err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
202 if (err)
203 break;
204 }
205 /* Handle kernel mapping too which aliases part of the
206 * lowmem */
207 if (__pa(address) < KERNEL_TEXT_SIZE) {
208 unsigned long addr2;
209 pgprot_t prot2;
210 addr2 = __START_KERNEL_map + __pa(address);
211 /* Make sure the kernel mappings stay executable */
212 prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot)));
213 err = __change_page_attr(addr2, pfn, prot2,
214 PAGE_KERNEL_EXEC);
215 }
216 }
217 up_write(&init_mm.mmap_sem);
218 return err;
219}
220
221/* Don't call this for MMIO areas that may not have a mem_map entry */
222int change_page_attr(struct page *page, int numpages, pgprot_t prot)
223{
224 unsigned long addr = (unsigned long)page_address(page);
225 return change_page_attr_addr(addr, numpages, prot);
226}
227
228void global_flush_tlb(void)
229{
230 struct page *pg, *next;
231 struct list_head l;
232
233 /*
234 * Take the semaphore for writing, to exclude two contexts
235 * doing a list_replace_init() call in parallel and to
236 * exclude new additions to the deferred_pages list:
237 */
238 down_write(&init_mm.mmap_sem);
239 list_replace_init(&deferred_pages, &l);
240 up_write(&init_mm.mmap_sem);
241
242 flush_map(&l);
243
244 list_for_each_entry_safe(pg, next, &l, lru) {
245 list_del(&pg->lru);
246 clear_bit(PG_arch_1, &pg->flags);
247 if (page_private(pg) != 0)
248 continue;
249 ClearPagePrivate(pg);
250 __free_page(pg);
251 }
252}
253
254EXPORT_SYMBOL(change_page_attr);
255EXPORT_SYMBOL(global_flush_tlb);
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index be61a1d845a4..2ae5999a795a 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -195,11 +195,6 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
195 return pte; 195 return pte;
196} 196}
197 197
198void pmd_ctor(struct kmem_cache *cache, void *pmd)
199{
200 memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
201}
202
203/* 198/*
204 * List of all pgd's needed for non-PAE so it can invalidate entries 199 * List of all pgd's needed for non-PAE so it can invalidate entries
205 * in both cached and uncached pgd's; not needed for PAE since the 200 * in both cached and uncached pgd's; not needed for PAE since the
@@ -210,27 +205,18 @@ void pmd_ctor(struct kmem_cache *cache, void *pmd)
210 * vmalloc faults work because attached pagetables are never freed. 205 * vmalloc faults work because attached pagetables are never freed.
211 * -- wli 206 * -- wli
212 */ 207 */
213DEFINE_SPINLOCK(pgd_lock);
214struct page *pgd_list;
215
216static inline void pgd_list_add(pgd_t *pgd) 208static inline void pgd_list_add(pgd_t *pgd)
217{ 209{
218 struct page *page = virt_to_page(pgd); 210 struct page *page = virt_to_page(pgd);
219 page->index = (unsigned long)pgd_list; 211
220 if (pgd_list) 212 list_add(&page->lru, &pgd_list);
221 set_page_private(pgd_list, (unsigned long)&page->index);
222 pgd_list = page;
223 set_page_private(page, (unsigned long)&pgd_list);
224} 213}
225 214
226static inline void pgd_list_del(pgd_t *pgd) 215static inline void pgd_list_del(pgd_t *pgd)
227{ 216{
228 struct page *next, **pprev, *page = virt_to_page(pgd); 217 struct page *page = virt_to_page(pgd);
229 next = (struct page *)page->index; 218
230 pprev = (struct page **)page_private(page); 219 list_del(&page->lru);
231 *pprev = next;
232 if (next)
233 set_page_private(next, (unsigned long)pprev);
234} 220}
235 221
236 222
@@ -285,7 +271,6 @@ static void pgd_dtor(void *pgd)
285 if (SHARED_KERNEL_PMD) 271 if (SHARED_KERNEL_PMD)
286 return; 272 return;
287 273
288 paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
289 spin_lock_irqsave(&pgd_lock, flags); 274 spin_lock_irqsave(&pgd_lock, flags);
290 pgd_list_del(pgd); 275 pgd_list_del(pgd);
291 spin_unlock_irqrestore(&pgd_lock, flags); 276 spin_unlock_irqrestore(&pgd_lock, flags);
@@ -294,77 +279,96 @@ static void pgd_dtor(void *pgd)
294#define UNSHARED_PTRS_PER_PGD \ 279#define UNSHARED_PTRS_PER_PGD \
295 (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD) 280 (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
296 281
297/* If we allocate a pmd for part of the kernel address space, then 282#ifdef CONFIG_X86_PAE
298 make sure its initialized with the appropriate kernel mappings. 283/*
299 Otherwise use a cached zeroed pmd. */ 284 * Mop up any pmd pages which may still be attached to the pgd.
300static pmd_t *pmd_cache_alloc(int idx) 285 * Normally they will be freed by munmap/exit_mmap, but any pmd we
286 * preallocate which never got a corresponding vma will need to be
287 * freed manually.
288 */
289static void pgd_mop_up_pmds(pgd_t *pgdp)
301{ 290{
302 pmd_t *pmd; 291 int i;
303 292
304 if (idx >= USER_PTRS_PER_PGD) { 293 for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
305 pmd = (pmd_t *)__get_free_page(GFP_KERNEL); 294 pgd_t pgd = pgdp[i];
306 295
307 if (pmd) 296 if (pgd_val(pgd) != 0) {
308 memcpy(pmd, 297 pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
309 (void *)pgd_page_vaddr(swapper_pg_dir[idx]), 298
299 pgdp[i] = native_make_pgd(0);
300
301 paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
302 pmd_free(pmd);
303 }
304 }
305}
306
307/*
308 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
309 * updating the top-level pagetable entries to guarantee the
310 * processor notices the update. Since this is expensive, and
311 * all 4 top-level entries are used almost immediately in a
312 * new process's life, we just pre-populate them here.
313 *
314 * Also, if we're in a paravirt environment where the kernel pmd is
315 * not shared between pagetables (!SHARED_KERNEL_PMD), we allocate
316 * and initialize the kernel pmds here.
317 */
318static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
319{
320 pud_t *pud;
321 unsigned long addr;
322 int i;
323
324 pud = pud_offset(pgd, 0);
325 for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
326 i++, pud++, addr += PUD_SIZE) {
327 pmd_t *pmd = pmd_alloc_one(mm, addr);
328
329 if (!pmd) {
330 pgd_mop_up_pmds(pgd);
331 return 0;
332 }
333
334 if (i >= USER_PTRS_PER_PGD)
335 memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
310 sizeof(pmd_t) * PTRS_PER_PMD); 336 sizeof(pmd_t) * PTRS_PER_PMD);
311 } else
312 pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
313 337
314 return pmd; 338 pud_populate(mm, pud, pmd);
339 }
340
341 return 1;
342}
343#else /* !CONFIG_X86_PAE */
344/* No need to prepopulate any pagetable entries in non-PAE modes. */
345static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
346{
347 return 1;
315} 348}
316 349
317static void pmd_cache_free(pmd_t *pmd, int idx) 350static void pgd_mop_up_pmds(pgd_t *pgd)
318{ 351{
319 if (idx >= USER_PTRS_PER_PGD)
320 free_page((unsigned long)pmd);
321 else
322 kmem_cache_free(pmd_cache, pmd);
323} 352}
353#endif /* CONFIG_X86_PAE */
324 354
325pgd_t *pgd_alloc(struct mm_struct *mm) 355pgd_t *pgd_alloc(struct mm_struct *mm)
326{ 356{
327 int i;
328 pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor); 357 pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);
329 358
330 if (PTRS_PER_PMD == 1 || !pgd) 359 mm->pgd = pgd; /* so that alloc_pd can use it */
331 return pgd;
332 360
333 for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) { 361 if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
334 pmd_t *pmd = pmd_cache_alloc(i); 362 quicklist_free(0, pgd_dtor, pgd);
335 363 pgd = NULL;
336 if (!pmd)
337 goto out_oom;
338
339 paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
340 set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
341 } 364 }
342 return pgd;
343 365
344out_oom: 366 return pgd;
345 for (i--; i >= 0; i--) {
346 pgd_t pgdent = pgd[i];
347 void* pmd = (void *)__va(pgd_val(pgdent)-1);
348 paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
349 pmd_cache_free(pmd, i);
350 }
351 quicklist_free(0, pgd_dtor, pgd);
352 return NULL;
353} 367}
354 368
355void pgd_free(pgd_t *pgd) 369void pgd_free(pgd_t *pgd)
356{ 370{
357 int i; 371 pgd_mop_up_pmds(pgd);
358
359 /* in the PAE case user pgd entries are overwritten before usage */
360 if (PTRS_PER_PMD > 1)
361 for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
362 pgd_t pgdent = pgd[i];
363 void* pmd = (void *)__va(pgd_val(pgdent)-1);
364 paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
365 pmd_cache_free(pmd, i);
366 }
367 /* in the non-PAE case, free_pgtables() clears user pgd entries */
368 quicklist_free(0, pgd_dtor, pgd); 372 quicklist_free(0, pgd_dtor, pgd);
369} 373}
370 374
@@ -372,4 +376,3 @@ void check_pgt_cache(void)
372{ 376{
373 quicklist_trim(0, pgd_dtor, 25, 16); 377 quicklist_trim(0, pgd_dtor, 25, 16);
374} 378}
375
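/*
 * Illustrative sketch only (not part of the patch): the pgd life cycle on
 * PAE after the pgtable_32.c change above. pgd_alloc()/pgd_free() are the
 * real functions from the hunk; the caller shown here is hypothetical.
 */
static void pgd_lifecycle_sketch(struct mm_struct *mm)
{
	pgd_t *pgd = pgd_alloc(mm);	/* prepopulates the kernel pmds on PAE */

	if (!pgd)
		return;			/* pmd prepopulation failed */

	/* ... the mm uses the pagetable ... */

	pgd_free(pgd);			/* pgd_mop_up_pmds() frees leftover pmds */
}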
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index ea85172fc0cc..65416f843e59 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -130,6 +130,9 @@ void __init
130acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) 130acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
131{ 131{
132 int pxm, node; 132 int pxm, node;
133 int apic_id;
134
135 apic_id = pa->apic_id;
133 if (srat_disabled()) 136 if (srat_disabled())
134 return; 137 return;
135 if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) { 138 if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
@@ -145,68 +148,12 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
145 bad_srat(); 148 bad_srat();
146 return; 149 return;
147 } 150 }
148 apicid_to_node[pa->apic_id] = node; 151 apicid_to_node[apic_id] = node;
149 acpi_numa = 1; 152 acpi_numa = 1;
150 printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", 153 printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
151 pxm, pa->apic_id, node); 154 pxm, apic_id, node);
152}
153
154#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
155/*
156 * Protect against too large hotadd areas that would fill up memory.
157 */
158static int hotadd_enough_memory(struct bootnode *nd)
159{
160 static unsigned long allocated;
161 static unsigned long last_area_end;
162 unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT;
163 long mem = pages * sizeof(struct page);
164 unsigned long addr;
165 unsigned long allowed;
166 unsigned long oldpages = pages;
167
168 if (mem < 0)
169 return 0;
170 allowed = (end_pfn - absent_pages_in_range(0, end_pfn)) * PAGE_SIZE;
171 allowed = (allowed / 100) * hotadd_percent;
172 if (allocated + mem > allowed) {
173 unsigned long range;
174 /* Give them at least part of their hotadd memory up to hotadd_percent.
175 It would be better to spread the limit out
176 over multiple hotplug areas, but that is too complicated
177 right now */
178 if (allocated >= allowed)
179 return 0;
180 range = allowed - allocated;
181 pages = (range / PAGE_SIZE);
182 mem = pages * sizeof(struct page);
183 nd->end = nd->start + range;
184 }
185 /* Not completely foolproof, but a good sanity check */
186 addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem);
187 if (addr == -1UL)
188 return 0;
189 if (pages != oldpages)
190 printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n",
191 pages << PAGE_SHIFT);
192 last_area_end = addr + mem;
193 allocated += mem;
194 return 1;
195}
196
197static int update_end_of_memory(unsigned long end)
198{
199 found_add_area = 1;
200 if ((end >> PAGE_SHIFT) > end_pfn)
201 end_pfn = end >> PAGE_SHIFT;
202 return 1;
203} 155}
204 156
205static inline int save_add_info(void)
206{
207 return hotadd_percent > 0;
208}
209#else
210int update_end_of_memory(unsigned long end) {return -1;} 157int update_end_of_memory(unsigned long end) {return -1;}
211static int hotadd_enough_memory(struct bootnode *nd) {return 1;} 158static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
212#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 159#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
@@ -214,10 +161,9 @@ static inline int save_add_info(void) {return 1;}
214#else 161#else
215static inline int save_add_info(void) {return 0;} 162static inline int save_add_info(void) {return 0;}
216#endif 163#endif
217#endif
218/* 164/*
219 * Update nodes_add and decide if to include add area in the zone. 165 * Update nodes_add and decide if to include add area in the zone.
220 * Both SPARSE and RESERVE need nodes_add infomation. 166 * Both SPARSE and RESERVE need nodes_add information.
221 * This code supports one contiguous hot add area per node. 167 * This code supports one contiguous hot add area per node.
222 */ 168 */
223static int reserve_hotadd(int node, unsigned long start, unsigned long end) 169static int reserve_hotadd(int node, unsigned long start, unsigned long end)
@@ -377,7 +323,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
377 return 1; 323 return 1;
378} 324}
379 325
380static void unparse_node(int node) 326static void __init unparse_node(int node)
381{ 327{
382 int i; 328 int i;
383 node_clear(node, nodes_parsed); 329 node_clear(node, nodes_parsed);
@@ -400,7 +346,12 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
400 /* First clean up the node list */ 346 /* First clean up the node list */
401 for (i = 0; i < MAX_NUMNODES; i++) { 347 for (i = 0; i < MAX_NUMNODES; i++) {
402 cutoff_node(i, start, end); 348 cutoff_node(i, start, end);
403 if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) { 349 /*
350 * don't confuse VM with a node that doesn't have the
351 * minimum memory.
352 */
353 if (nodes[i].end &&
354 (nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
404 unparse_node(i); 355 unparse_node(i);
405 node_set_offline(i); 356 node_set_offline(i);
406 } 357 }
@@ -431,9 +382,11 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
431 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 382 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
432 383
433 for (i = 0; i < NR_CPUS; i++) { 384 for (i = 0; i < NR_CPUS; i++) {
434 if (cpu_to_node(i) == NUMA_NO_NODE) 385 int node = early_cpu_to_node(i);
386
387 if (node == NUMA_NO_NODE)
435 continue; 388 continue;
436 if (!node_isset(cpu_to_node(i), node_possible_map)) 389 if (!node_isset(node, node_possible_map))
437 numa_set_node(i, NUMA_NO_NODE); 390 numa_set_node(i, NUMA_NO_NODE);
438 } 391 }
439 numa_init_array(); 392 numa_init_array();
@@ -441,6 +394,12 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
441} 394}
442 395
443#ifdef CONFIG_NUMA_EMU 396#ifdef CONFIG_NUMA_EMU
397static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
398 [0 ... MAX_NUMNODES-1] = PXM_INVAL
399};
400static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
401 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
402};
444static int __init find_node_by_addr(unsigned long addr) 403static int __init find_node_by_addr(unsigned long addr)
445{ 404{
446 int ret = NUMA_NO_NODE; 405 int ret = NUMA_NO_NODE;
@@ -457,7 +416,7 @@ static int __init find_node_by_addr(unsigned long addr)
457 break; 416 break;
458 } 417 }
459 } 418 }
460 return i; 419 return ret;
461} 420}
462 421
463/* 422/*
@@ -471,12 +430,6 @@ static int __init find_node_by_addr(unsigned long addr)
471void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) 430void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
472{ 431{
473 int i, j; 432 int i, j;
474 int fake_node_to_pxm_map[MAX_NUMNODES] = {
475 [0 ... MAX_NUMNODES-1] = PXM_INVAL
476 };
477 unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] = {
478 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
479 };
480 433
481 printk(KERN_INFO "Faking PXM affinity for fake nodes on real " 434 printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
482 "topology.\n"); 435 "topology.\n");
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 0ed046a187f7..e2095cba409f 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -32,7 +32,7 @@ static int backtrace_stack(void *data, char *name)
32 return 0; 32 return 0;
33} 33}
34 34
35static void backtrace_address(void *data, unsigned long addr) 35static void backtrace_address(void *data, unsigned long addr, int reliable)
36{ 36{
37 unsigned int *depth = data; 37 unsigned int *depth = data;
38 38
@@ -48,7 +48,7 @@ static struct stacktrace_ops backtrace_ops = {
48}; 48};
49 49
50struct frame_head { 50struct frame_head {
51 struct frame_head *ebp; 51 struct frame_head *bp;
52 unsigned long ret; 52 unsigned long ret;
53} __attribute__((packed)); 53} __attribute__((packed));
54 54
@@ -67,21 +67,21 @@ dump_user_backtrace(struct frame_head * head)
67 67
68 /* frame pointers should strictly progress back up the stack 68 /* frame pointers should strictly progress back up the stack
69 * (towards higher addresses) */ 69 * (towards higher addresses) */
70 if (head >= bufhead[0].ebp) 70 if (head >= bufhead[0].bp)
71 return NULL; 71 return NULL;
72 72
73 return bufhead[0].ebp; 73 return bufhead[0].bp;
74} 74}
75 75
76void 76void
77x86_backtrace(struct pt_regs * const regs, unsigned int depth) 77x86_backtrace(struct pt_regs * const regs, unsigned int depth)
78{ 78{
79 struct frame_head *head = (struct frame_head *)frame_pointer(regs); 79 struct frame_head *head = (struct frame_head *)frame_pointer(regs);
80 unsigned long stack = stack_pointer(regs); 80 unsigned long stack = kernel_trap_sp(regs);
81 81
82 if (!user_mode_vm(regs)) { 82 if (!user_mode_vm(regs)) {
83 if (depth) 83 if (depth)
84 dump_trace(NULL, regs, (unsigned long *)stack, 84 dump_trace(NULL, regs, (unsigned long *)stack, 0,
85 &backtrace_ops, &depth); 85 &backtrace_ops, &depth);
86 return; 86 return;
87 } 87 }
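/*
 * Illustrative sketch only: the user-space frame walk that
 * dump_user_backtrace() performs with the renamed "bp" field above. The
 * copy_from_user_inatomic() plumbing is omitted; "head" is assumed to be a
 * validated pointer, and all names here are hypothetical.
 */
struct frame_sketch {
	struct frame_sketch *bp;	/* saved frame pointer */
	unsigned long ret;		/* return address */
};

static void user_backtrace_sketch(struct frame_sketch *head, unsigned int depth)
{
	while (depth-- && head) {
		/* report head->ret, then follow the saved frame pointer,
		 * which must strictly move up (towards higher addresses): */
		if (head >= head->bp)
			break;
		head = head->bp;
	}
}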
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index c8ab79ef4276..1f11cf0a307f 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -18,11 +18,11 @@
18#include <asm/nmi.h> 18#include <asm/nmi.h>
19#include <asm/msr.h> 19#include <asm/msr.h>
20#include <asm/apic.h> 20#include <asm/apic.h>
21 21
22#include "op_counter.h" 22#include "op_counter.h"
23#include "op_x86_model.h" 23#include "op_x86_model.h"
24 24
25static struct op_x86_model_spec const * model; 25static struct op_x86_model_spec const *model;
26static struct op_msrs cpu_msrs[NR_CPUS]; 26static struct op_msrs cpu_msrs[NR_CPUS];
27static unsigned long saved_lvtpc[NR_CPUS]; 27static unsigned long saved_lvtpc[NR_CPUS];
28 28
@@ -41,7 +41,6 @@ static int nmi_suspend(struct sys_device *dev, pm_message_t state)
41 return 0; 41 return 0;
42} 42}
43 43
44
45static int nmi_resume(struct sys_device *dev) 44static int nmi_resume(struct sys_device *dev)
46{ 45{
47 if (nmi_enabled == 1) 46 if (nmi_enabled == 1)
@@ -49,29 +48,27 @@ static int nmi_resume(struct sys_device *dev)
49 return 0; 48 return 0;
50} 49}
51 50
52
53static struct sysdev_class oprofile_sysclass = { 51static struct sysdev_class oprofile_sysclass = {
54 .name = "oprofile", 52 .name = "oprofile",
55 .resume = nmi_resume, 53 .resume = nmi_resume,
56 .suspend = nmi_suspend, 54 .suspend = nmi_suspend,
57}; 55};
58 56
59
60static struct sys_device device_oprofile = { 57static struct sys_device device_oprofile = {
61 .id = 0, 58 .id = 0,
62 .cls = &oprofile_sysclass, 59 .cls = &oprofile_sysclass,
63}; 60};
64 61
65
66static int __init init_sysfs(void) 62static int __init init_sysfs(void)
67{ 63{
68 int error; 64 int error;
69 if (!(error = sysdev_class_register(&oprofile_sysclass))) 65
66 error = sysdev_class_register(&oprofile_sysclass);
67 if (!error)
70 error = sysdev_register(&device_oprofile); 68 error = sysdev_register(&device_oprofile);
71 return error; 69 return error;
72} 70}
73 71
74
75static void exit_sysfs(void) 72static void exit_sysfs(void)
76{ 73{
77 sysdev_unregister(&device_oprofile); 74 sysdev_unregister(&device_oprofile);
@@ -90,7 +87,7 @@ static int profile_exceptions_notify(struct notifier_block *self,
90 int ret = NOTIFY_DONE; 87 int ret = NOTIFY_DONE;
91 int cpu = smp_processor_id(); 88 int cpu = smp_processor_id();
92 89
93 switch(val) { 90 switch (val) {
94 case DIE_NMI: 91 case DIE_NMI:
95 if (model->check_ctrs(args->regs, &cpu_msrs[cpu])) 92 if (model->check_ctrs(args->regs, &cpu_msrs[cpu]))
96 ret = NOTIFY_STOP; 93 ret = NOTIFY_STOP;
@@ -101,24 +98,24 @@ static int profile_exceptions_notify(struct notifier_block *self,
101 return ret; 98 return ret;
102} 99}
103 100
104static void nmi_cpu_save_registers(struct op_msrs * msrs) 101static void nmi_cpu_save_registers(struct op_msrs *msrs)
105{ 102{
106 unsigned int const nr_ctrs = model->num_counters; 103 unsigned int const nr_ctrs = model->num_counters;
107 unsigned int const nr_ctrls = model->num_controls; 104 unsigned int const nr_ctrls = model->num_controls;
108 struct op_msr * counters = msrs->counters; 105 struct op_msr *counters = msrs->counters;
109 struct op_msr * controls = msrs->controls; 106 struct op_msr *controls = msrs->controls;
110 unsigned int i; 107 unsigned int i;
111 108
112 for (i = 0; i < nr_ctrs; ++i) { 109 for (i = 0; i < nr_ctrs; ++i) {
113 if (counters[i].addr){ 110 if (counters[i].addr) {
114 rdmsr(counters[i].addr, 111 rdmsr(counters[i].addr,
115 counters[i].saved.low, 112 counters[i].saved.low,
116 counters[i].saved.high); 113 counters[i].saved.high);
117 } 114 }
118 } 115 }
119 116
120 for (i = 0; i < nr_ctrls; ++i) { 117 for (i = 0; i < nr_ctrls; ++i) {
121 if (controls[i].addr){ 118 if (controls[i].addr) {
122 rdmsr(controls[i].addr, 119 rdmsr(controls[i].addr,
123 controls[i].saved.low, 120 controls[i].saved.low,
124 controls[i].saved.high); 121 controls[i].saved.high);
@@ -126,15 +123,13 @@ static void nmi_cpu_save_registers(struct op_msrs * msrs)
126 } 123 }
127} 124}
128 125
129 126static void nmi_save_registers(void *dummy)
130static void nmi_save_registers(void * dummy)
131{ 127{
132 int cpu = smp_processor_id(); 128 int cpu = smp_processor_id();
133 struct op_msrs * msrs = &cpu_msrs[cpu]; 129 struct op_msrs *msrs = &cpu_msrs[cpu];
134 nmi_cpu_save_registers(msrs); 130 nmi_cpu_save_registers(msrs);
135} 131}
136 132
137
138static void free_msrs(void) 133static void free_msrs(void)
139{ 134{
140 int i; 135 int i;
@@ -146,7 +141,6 @@ static void free_msrs(void)
146 } 141 }
147} 142}
148 143
149
150static int allocate_msrs(void) 144static int allocate_msrs(void)
151{ 145{
152 int success = 1; 146 int success = 1;
@@ -173,11 +167,10 @@ static int allocate_msrs(void)
173 return success; 167 return success;
174} 168}
175 169
176 170static void nmi_cpu_setup(void *dummy)
177static void nmi_cpu_setup(void * dummy)
178{ 171{
179 int cpu = smp_processor_id(); 172 int cpu = smp_processor_id();
180 struct op_msrs * msrs = &cpu_msrs[cpu]; 173 struct op_msrs *msrs = &cpu_msrs[cpu];
181 spin_lock(&oprofilefs_lock); 174 spin_lock(&oprofilefs_lock);
182 model->setup_ctrs(msrs); 175 model->setup_ctrs(msrs);
183 spin_unlock(&oprofilefs_lock); 176 spin_unlock(&oprofilefs_lock);
@@ -193,13 +186,14 @@ static struct notifier_block profile_exceptions_nb = {
193 186
194static int nmi_setup(void) 187static int nmi_setup(void)
195{ 188{
196 int err=0; 189 int err = 0;
197 int cpu; 190 int cpu;
198 191
199 if (!allocate_msrs()) 192 if (!allocate_msrs())
200 return -ENOMEM; 193 return -ENOMEM;
201 194
202 if ((err = register_die_notifier(&profile_exceptions_nb))){ 195 err = register_die_notifier(&profile_exceptions_nb);
196 if (err) {
203 free_msrs(); 197 free_msrs();
204 return err; 198 return err;
205 } 199 }
@@ -210,7 +204,7 @@ static int nmi_setup(void)
210 204
211 /* Assume saved/restored counters are the same on all CPUs */ 205 /* Assume saved/restored counters are the same on all CPUs */
212 model->fill_in_addresses(&cpu_msrs[0]); 206 model->fill_in_addresses(&cpu_msrs[0]);
213 for_each_possible_cpu (cpu) { 207 for_each_possible_cpu(cpu) {
214 if (cpu != 0) { 208 if (cpu != 0) {
215 memcpy(cpu_msrs[cpu].counters, cpu_msrs[0].counters, 209 memcpy(cpu_msrs[cpu].counters, cpu_msrs[0].counters,
216 sizeof(struct op_msr) * model->num_counters); 210 sizeof(struct op_msr) * model->num_counters);
@@ -226,39 +220,37 @@ static int nmi_setup(void)
226 return 0; 220 return 0;
227} 221}
228 222
229 223static void nmi_restore_registers(struct op_msrs *msrs)
230static void nmi_restore_registers(struct op_msrs * msrs)
231{ 224{
232 unsigned int const nr_ctrs = model->num_counters; 225 unsigned int const nr_ctrs = model->num_counters;
233 unsigned int const nr_ctrls = model->num_controls; 226 unsigned int const nr_ctrls = model->num_controls;
234 struct op_msr * counters = msrs->counters; 227 struct op_msr *counters = msrs->counters;
235 struct op_msr * controls = msrs->controls; 228 struct op_msr *controls = msrs->controls;
236 unsigned int i; 229 unsigned int i;
237 230
238 for (i = 0; i < nr_ctrls; ++i) { 231 for (i = 0; i < nr_ctrls; ++i) {
239 if (controls[i].addr){ 232 if (controls[i].addr) {
240 wrmsr(controls[i].addr, 233 wrmsr(controls[i].addr,
241 controls[i].saved.low, 234 controls[i].saved.low,
242 controls[i].saved.high); 235 controls[i].saved.high);
243 } 236 }
244 } 237 }
245 238
246 for (i = 0; i < nr_ctrs; ++i) { 239 for (i = 0; i < nr_ctrs; ++i) {
247 if (counters[i].addr){ 240 if (counters[i].addr) {
248 wrmsr(counters[i].addr, 241 wrmsr(counters[i].addr,
249 counters[i].saved.low, 242 counters[i].saved.low,
250 counters[i].saved.high); 243 counters[i].saved.high);
251 } 244 }
252 } 245 }
253} 246}
254
255 247
256static void nmi_cpu_shutdown(void * dummy) 248static void nmi_cpu_shutdown(void *dummy)
257{ 249{
258 unsigned int v; 250 unsigned int v;
259 int cpu = smp_processor_id(); 251 int cpu = smp_processor_id();
260 struct op_msrs * msrs = &cpu_msrs[cpu]; 252 struct op_msrs *msrs = &cpu_msrs[cpu];
261 253
262 /* restoring APIC_LVTPC can trigger an apic error because the delivery 254 /* restoring APIC_LVTPC can trigger an apic error because the delivery
263 * mode and vector nr combination can be illegal. That's by design: on 255 * mode and vector nr combination can be illegal. That's by design: on
264 * power on apic lvt contain a zero vector nr which are legal only for 256 * power on apic lvt contain a zero vector nr which are legal only for
@@ -271,7 +263,6 @@ static void nmi_cpu_shutdown(void * dummy)
271 nmi_restore_registers(msrs); 263 nmi_restore_registers(msrs);
272} 264}
273 265
274
275static void nmi_shutdown(void) 266static void nmi_shutdown(void)
276{ 267{
277 nmi_enabled = 0; 268 nmi_enabled = 0;
@@ -281,45 +272,40 @@ static void nmi_shutdown(void)
281 free_msrs(); 272 free_msrs();
282} 273}
283 274
284 275static void nmi_cpu_start(void *dummy)
285static void nmi_cpu_start(void * dummy)
286{ 276{
287 struct op_msrs const * msrs = &cpu_msrs[smp_processor_id()]; 277 struct op_msrs const *msrs = &cpu_msrs[smp_processor_id()];
288 model->start(msrs); 278 model->start(msrs);
289} 279}
290
291 280
292static int nmi_start(void) 281static int nmi_start(void)
293{ 282{
294 on_each_cpu(nmi_cpu_start, NULL, 0, 1); 283 on_each_cpu(nmi_cpu_start, NULL, 0, 1);
295 return 0; 284 return 0;
296} 285}
297 286
298 287static void nmi_cpu_stop(void *dummy)
299static void nmi_cpu_stop(void * dummy)
300{ 288{
301 struct op_msrs const * msrs = &cpu_msrs[smp_processor_id()]; 289 struct op_msrs const *msrs = &cpu_msrs[smp_processor_id()];
302 model->stop(msrs); 290 model->stop(msrs);
303} 291}
304 292
305
306static void nmi_stop(void) 293static void nmi_stop(void)
307{ 294{
308 on_each_cpu(nmi_cpu_stop, NULL, 0, 1); 295 on_each_cpu(nmi_cpu_stop, NULL, 0, 1);
309} 296}
310 297
311
312struct op_counter_config counter_config[OP_MAX_COUNTER]; 298struct op_counter_config counter_config[OP_MAX_COUNTER];
313 299
314static int nmi_create_files(struct super_block * sb, struct dentry * root) 300static int nmi_create_files(struct super_block *sb, struct dentry *root)
315{ 301{
316 unsigned int i; 302 unsigned int i;
317 303
318 for (i = 0; i < model->num_counters; ++i) { 304 for (i = 0; i < model->num_counters; ++i) {
319 struct dentry * dir; 305 struct dentry *dir;
320 char buf[4]; 306 char buf[4];
321 307
322 /* quick little hack to _not_ expose a counter if it is not 308 /* quick little hack to _not_ expose a counter if it is not
323 * available for use. This should protect userspace app. 309 * available for use. This should protect userspace app.
324 * NOTE: assumes 1:1 mapping here (that counters are organized 310 * NOTE: assumes 1:1 mapping here (that counters are organized
325 * sequentially in their struct assignment). 311 * sequentially in their struct assignment).
@@ -329,21 +315,21 @@ static int nmi_create_files(struct super_block * sb, struct dentry * root)
329 315
330 snprintf(buf, sizeof(buf), "%d", i); 316 snprintf(buf, sizeof(buf), "%d", i);
331 dir = oprofilefs_mkdir(sb, root, buf); 317 dir = oprofilefs_mkdir(sb, root, buf);
332 oprofilefs_create_ulong(sb, dir, "enabled", &counter_config[i].enabled); 318 oprofilefs_create_ulong(sb, dir, "enabled", &counter_config[i].enabled);
333 oprofilefs_create_ulong(sb, dir, "event", &counter_config[i].event); 319 oprofilefs_create_ulong(sb, dir, "event", &counter_config[i].event);
334 oprofilefs_create_ulong(sb, dir, "count", &counter_config[i].count); 320 oprofilefs_create_ulong(sb, dir, "count", &counter_config[i].count);
335 oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask); 321 oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask);
336 oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel); 322 oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel);
337 oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user); 323 oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user);
338 } 324 }
339 325
340 return 0; 326 return 0;
341} 327}
342 328
343static int p4force; 329static int p4force;
344module_param(p4force, int, 0); 330module_param(p4force, int, 0);
345 331
346static int __init p4_init(char ** cpu_type) 332static int __init p4_init(char **cpu_type)
347{ 333{
348 __u8 cpu_model = boot_cpu_data.x86_model; 334 __u8 cpu_model = boot_cpu_data.x86_model;
349 335
@@ -356,15 +342,15 @@ static int __init p4_init(char ** cpu_type)
356 return 1; 342 return 1;
357#else 343#else
358 switch (smp_num_siblings) { 344 switch (smp_num_siblings) {
359 case 1: 345 case 1:
360 *cpu_type = "i386/p4"; 346 *cpu_type = "i386/p4";
361 model = &op_p4_spec; 347 model = &op_p4_spec;
362 return 1; 348 return 1;
363 349
364 case 2: 350 case 2:
365 *cpu_type = "i386/p4-ht"; 351 *cpu_type = "i386/p4-ht";
366 model = &op_p4_ht2_spec; 352 model = &op_p4_ht2_spec;
367 return 1; 353 return 1;
368 } 354 }
369#endif 355#endif
370 356
@@ -373,8 +359,7 @@ static int __init p4_init(char ** cpu_type)
373 return 0; 359 return 0;
374} 360}
375 361
376 362static int __init ppro_init(char **cpu_type)
377static int __init ppro_init(char ** cpu_type)
378{ 363{
379 __u8 cpu_model = boot_cpu_data.x86_model; 364 __u8 cpu_model = boot_cpu_data.x86_model;
380 365
@@ -409,52 +394,52 @@ int __init op_nmi_init(struct oprofile_operations *ops)
409 394
410 if (!cpu_has_apic) 395 if (!cpu_has_apic)
411 return -ENODEV; 396 return -ENODEV;
412 397
413 switch (vendor) { 398 switch (vendor) {
414 case X86_VENDOR_AMD: 399 case X86_VENDOR_AMD:
415 /* Needs to be at least an Athlon (or hammer in 32bit mode) */ 400 /* Needs to be at least an Athlon (or hammer in 32bit mode) */
416 401
417 switch (family) { 402 switch (family) {
418 default: 403 default:
404 return -ENODEV;
405 case 6:
406 model = &op_athlon_spec;
407 cpu_type = "i386/athlon";
408 break;
409 case 0xf:
410 model = &op_athlon_spec;
411 /* Actually it could be i386/hammer too, but give
412 user space a consistent name. */
413 cpu_type = "x86-64/hammer";
414 break;
415 case 0x10:
416 model = &op_athlon_spec;
417 cpu_type = "x86-64/family10";
418 break;
419 }
420 break;
421
422 case X86_VENDOR_INTEL:
423 switch (family) {
424 /* Pentium IV */
425 case 0xf:
426 if (!p4_init(&cpu_type))
419 return -ENODEV; 427 return -ENODEV;
420 case 6:
421 model = &op_athlon_spec;
422 cpu_type = "i386/athlon";
423 break;
424 case 0xf:
425 model = &op_athlon_spec;
426 /* Actually it could be i386/hammer too, but give
427 user space an consistent name. */
428 cpu_type = "x86-64/hammer";
429 break;
430 case 0x10:
431 model = &op_athlon_spec;
432 cpu_type = "x86-64/family10";
433 break;
434 }
435 break; 428 break;
436 429
437 case X86_VENDOR_INTEL: 430 /* A P6-class processor */
438 switch (family) { 431 case 6:
439 /* Pentium IV */ 432 if (!ppro_init(&cpu_type))
440 case 0xf: 433 return -ENODEV;
441 if (!p4_init(&cpu_type))
442 return -ENODEV;
443 break;
444
445 /* A P6-class processor */
446 case 6:
447 if (!ppro_init(&cpu_type))
448 return -ENODEV;
449 break;
450
451 default:
452 return -ENODEV;
453 }
454 break; 434 break;
455 435
456 default: 436 default:
457 return -ENODEV; 437 return -ENODEV;
438 }
439 break;
440
441 default:
442 return -ENODEV;
458 } 443 }
459 444
460 init_sysfs(); 445 init_sysfs();
@@ -469,7 +454,6 @@ int __init op_nmi_init(struct oprofile_operations *ops)
469 return 0; 454 return 0;
470} 455}
471 456
472
473void op_nmi_exit(void) 457void op_nmi_exit(void)
474{ 458{
475 if (using_nmi) 459 if (using_nmi)
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 862746390666..52deabc72a6f 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -109,6 +109,19 @@ static void __devinit pcibios_fixup_ghosts(struct pci_bus *b)
109 } 109 }
110} 110}
111 111
112static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev)
113{
114 struct resource *rom_r = &dev->resource[PCI_ROM_RESOURCE];
115
116 if (rom_r->parent)
117 return;
118 if (rom_r->start)
119 /* we deal with BIOS assigned ROM later */
120 return;
121 if (!(pci_probe & PCI_ASSIGN_ROMS))
122 rom_r->start = rom_r->end = rom_r->flags = 0;
123}
124
112/* 125/*
113 * Called after each bus is probed, but before its children 126 * Called after each bus is probed, but before its children
114 * are examined. 127 * are examined.
@@ -116,8 +129,12 @@ static void __devinit pcibios_fixup_ghosts(struct pci_bus *b)
116 129
117void __devinit pcibios_fixup_bus(struct pci_bus *b) 130void __devinit pcibios_fixup_bus(struct pci_bus *b)
118{ 131{
132 struct pci_dev *dev;
133
119 pcibios_fixup_ghosts(b); 134 pcibios_fixup_ghosts(b);
120 pci_read_bridge_bases(b); 135 pci_read_bridge_bases(b);
136 list_for_each_entry(dev, &b->devices, bus_list)
137 pcibios_fixup_device_resources(dev);
121} 138}
122 139
123/* 140/*
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 6cff66dd0c91..cb63007e20b2 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -19,7 +19,7 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d)
19 19
20 printk(KERN_WARNING "PCI: Searching for i450NX host bridges on %s\n", pci_name(d)); 20 printk(KERN_WARNING "PCI: Searching for i450NX host bridges on %s\n", pci_name(d));
21 reg = 0xd0; 21 reg = 0xd0;
22 for(pxb=0; pxb<2; pxb++) { 22 for(pxb = 0; pxb < 2; pxb++) {
23 pci_read_config_byte(d, reg++, &busno); 23 pci_read_config_byte(d, reg++, &busno);
24 pci_read_config_byte(d, reg++, &suba); 24 pci_read_config_byte(d, reg++, &suba);
25 pci_read_config_byte(d, reg++, &subb); 25 pci_read_config_byte(d, reg++, &subb);
@@ -56,7 +56,7 @@ static void __devinit pci_fixup_umc_ide(struct pci_dev *d)
56 int i; 56 int i;
57 57
58 printk(KERN_WARNING "PCI: Fixing base address flags for device %s\n", pci_name(d)); 58 printk(KERN_WARNING "PCI: Fixing base address flags for device %s\n", pci_name(d));
59 for(i=0; i<4; i++) 59 for(i = 0; i < 4; i++)
60 d->resource[i].flags |= PCI_BASE_ADDRESS_SPACE_IO; 60 d->resource[i].flags |= PCI_BASE_ADDRESS_SPACE_IO;
61} 61}
62DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_UMC, PCI_DEVICE_ID_UMC_UM8886BF, pci_fixup_umc_ide); 62DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_UMC, PCI_DEVICE_ID_UMC_UM8886BF, pci_fixup_umc_ide);
@@ -127,7 +127,7 @@ static void pci_fixup_via_northbridge_bug(struct pci_dev *d)
127 NB latency to zero */ 127 NB latency to zero */
128 pci_write_config_byte(d, PCI_LATENCY_TIMER, 0); 128 pci_write_config_byte(d, PCI_LATENCY_TIMER, 0);
129 129
130 where = 0x95; /* the memory write queue timer register is 130 where = 0x95; /* the memory write queue timer register is
131 different for the KT266x's: 0x95 not 0x55 */ 131 different for the KT266x's: 0x95 not 0x55 */
132 } else if (d->device == PCI_DEVICE_ID_VIA_8363_0 && 132 } else if (d->device == PCI_DEVICE_ID_VIA_8363_0 &&
133 (d->revision == VIA_8363_KL133_REVISION_ID || 133 (d->revision == VIA_8363_KL133_REVISION_ID ||
@@ -230,7 +230,7 @@ static int quirk_pcie_aspm_write(struct pci_bus *bus, unsigned int devfn, int wh
230 230
231 if ((offset) && (where == offset)) 231 if ((offset) && (where == offset))
232 value = value & 0xfffffffc; 232 value = value & 0xfffffffc;
233 233
234 return raw_pci_ops->write(0, bus->number, devfn, where, size, value); 234 return raw_pci_ops->write(0, bus->number, devfn, where, size, value);
235} 235}
236 236
@@ -271,8 +271,8 @@ static void pcie_rootport_aspm_quirk(struct pci_dev *pdev)
271 * after hot-remove, the pbus->devices is empty and this code 271 * after hot-remove, the pbus->devices is empty and this code
272 * will set the offsets to zero and the bus ops to parent's bus 272 * will set the offsets to zero and the bus ops to parent's bus
273 * ops, which is unmodified. 273 * ops, which is unmodified.
274 */ 274 */
275 for (i= GET_INDEX(pdev->device, 0); i <= GET_INDEX(pdev->device, 7); ++i) 275 for (i = GET_INDEX(pdev->device, 0); i <= GET_INDEX(pdev->device, 7); ++i)
276 quirk_aspm_offset[i] = 0; 276 quirk_aspm_offset[i] = 0;
277 277
278 pbus->ops = pbus->parent->ops; 278 pbus->ops = pbus->parent->ops;
@@ -286,17 +286,17 @@ static void pcie_rootport_aspm_quirk(struct pci_dev *pdev)
286 list_for_each_entry(dev, &pbus->devices, bus_list) { 286 list_for_each_entry(dev, &pbus->devices, bus_list) {
287 /* There are 0 to 8 devices attached to this bus */ 287 /* There are 0 to 8 devices attached to this bus */
288 cap_base = pci_find_capability(dev, PCI_CAP_ID_EXP); 288 cap_base = pci_find_capability(dev, PCI_CAP_ID_EXP);
289 quirk_aspm_offset[GET_INDEX(pdev->device, dev->devfn)]= cap_base + 0x10; 289 quirk_aspm_offset[GET_INDEX(pdev->device, dev->devfn)] = cap_base + 0x10;
290 } 290 }
291 pbus->ops = &quirk_pcie_aspm_ops; 291 pbus->ops = &quirk_pcie_aspm_ops;
292 } 292 }
293} 293}
294DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PA, pcie_rootport_aspm_quirk ); 294DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PA, pcie_rootport_aspm_quirk);
295DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PA1, pcie_rootport_aspm_quirk ); 295DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PA1, pcie_rootport_aspm_quirk);
296DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PB, pcie_rootport_aspm_quirk ); 296DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PB, pcie_rootport_aspm_quirk);
297DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PB1, pcie_rootport_aspm_quirk ); 297DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PB1, pcie_rootport_aspm_quirk);
298DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC, pcie_rootport_aspm_quirk ); 298DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC, pcie_rootport_aspm_quirk);
299DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC1, pcie_rootport_aspm_quirk ); 299DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC1, pcie_rootport_aspm_quirk);
300 300
301/* 301/*
302 * Fixup to mark boot BIOS video selected by BIOS before it changes 302 * Fixup to mark boot BIOS video selected by BIOS before it changes
@@ -336,8 +336,8 @@ static void __devinit pci_fixup_video(struct pci_dev *pdev)
336 * PCI header type NORMAL. 336 * PCI header type NORMAL.
337 */ 337 */
338 if (bridge 338 if (bridge
339 &&((bridge->hdr_type == PCI_HEADER_TYPE_BRIDGE) 339 && ((bridge->hdr_type == PCI_HEADER_TYPE_BRIDGE)
340 ||(bridge->hdr_type == PCI_HEADER_TYPE_CARDBUS))) { 340 || (bridge->hdr_type == PCI_HEADER_TYPE_CARDBUS))) {
341 pci_read_config_word(bridge, PCI_BRIDGE_CONTROL, 341 pci_read_config_word(bridge, PCI_BRIDGE_CONTROL,
342 &config); 342 &config);
343 if (!(config & PCI_BRIDGE_CTL_VGA)) 343 if (!(config & PCI_BRIDGE_CTL_VGA))
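A note on the pcie_rootport_aspm_quirk() hunks above: both the clearing loop after hot-remove and the list_for_each_entry() fill loop rely on GET_INDEX() giving each Intel MCH root port an 8-slot window in quirk_aspm_offset[], one slot per child devfn. The macro itself is defined earlier in fixup.c and is not part of this diff; the sketch below is a stand-alone model of that assumed indexing, with a placeholder device-ID constant rather than the real value from pci_ids.h.

#include <stdio.h>

/* Placeholder for PCI_DEVICE_ID_INTEL_MCH_PA; the real value lives in pci_ids.h. */
#define MCH_PA_ID	0x2584

/*
 * Assumed shape of GET_INDEX(): 8 quirk_aspm_offset[] slots per root
 * port, selected by the port's device ID and the low 3 bits of the
 * child device's devfn.
 */
#define GET_INDEX(dev_id, devfn)	((((dev_id) - MCH_PA_ID) << 3) + ((devfn) & 7))

int main(void)
{
	unsigned int port = MCH_PA_ID + 1;	/* e.g. the PA1 root port */
	unsigned int devfn;

	/* Mirrors the loop that zeroes a port's 8 slots after hot-remove. */
	for (devfn = 0; devfn <= 7; devfn++)
		printf("slot %u\n", GET_INDEX(port, devfn));
	return 0;
}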
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 88d8f5c0ecb5..ed07ce6c171b 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -200,6 +200,7 @@ static int pirq_ali_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
200{ 200{
201 static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 }; 201 static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };
202 202
203 WARN_ON_ONCE(pirq >= 16);
203 return irqmap[read_config_nybble(router, 0x48, pirq-1)]; 204 return irqmap[read_config_nybble(router, 0x48, pirq-1)];
204} 205}
205 206
@@ -207,7 +208,8 @@ static int pirq_ali_set(struct pci_dev *router, struct pci_dev *dev, int pirq, i
207{ 208{
208 static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 }; 209 static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
209 unsigned int val = irqmap[irq]; 210 unsigned int val = irqmap[irq];
210 211
212 WARN_ON_ONCE(pirq >= 16);
211 if (val) { 213 if (val) {
212 write_config_nybble(router, 0x48, pirq-1, val); 214 write_config_nybble(router, 0x48, pirq-1, val);
213 return 1; 215 return 1;
@@ -257,12 +259,16 @@ static int pirq_via_set(struct pci_dev *router, struct pci_dev *dev, int pirq, i
257static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq) 259static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
258{ 260{
259 static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 }; 261 static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
262
263 WARN_ON_ONCE(pirq >= 5);
260 return read_config_nybble(router, 0x55, pirqmap[pirq-1]); 264 return read_config_nybble(router, 0x55, pirqmap[pirq-1]);
261} 265}
262 266
263static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) 267static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
264{ 268{
265 static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 }; 269 static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
270
271 WARN_ON_ONCE(pirq >= 5);
266 write_config_nybble(router, 0x55, pirqmap[pirq-1], irq); 272 write_config_nybble(router, 0x55, pirqmap[pirq-1], irq);
267 return 1; 273 return 1;
268} 274}
@@ -275,12 +281,16 @@ static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq
275static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq) 281static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
276{ 282{
277 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 }; 283 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
284
285 WARN_ON_ONCE(pirq >= 4);
278 return read_config_nybble(router,0x43, pirqmap[pirq-1]); 286 return read_config_nybble(router,0x43, pirqmap[pirq-1]);
279} 287}
280 288
281static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) 289static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
282{ 290{
283 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 }; 291 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
292
293 WARN_ON_ONCE(pirq >= 4);
284 write_config_nybble(router, 0x43, pirqmap[pirq-1], irq); 294 write_config_nybble(router, 0x43, pirqmap[pirq-1], irq);
285 return 1; 295 return 1;
286} 296}
@@ -419,6 +429,7 @@ static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, i
419 429
420static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq) 430static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
421{ 431{
432 WARN_ON_ONCE(pirq >= 9);
422 if (pirq > 8) { 433 if (pirq > 8) {
423 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); 434 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
424 return 0; 435 return 0;
@@ -428,6 +439,7 @@ static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
428 439
429static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) 440static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
430{ 441{
442 WARN_ON_ONCE(pirq >= 9);
431 if (pirq > 8) { 443 if (pirq > 8) {
432 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); 444 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
433 return 0; 445 return 0;
@@ -449,14 +461,14 @@ static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq,
449 */ 461 */
450static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq) 462static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
451{ 463{
452 outb_p(pirq, 0xc00); 464 outb(pirq, 0xc00);
453 return inb(0xc01) & 0xf; 465 return inb(0xc01) & 0xf;
454} 466}
455 467
456static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) 468static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
457{ 469{
458 outb_p(pirq, 0xc00); 470 outb(pirq, 0xc00);
459 outb_p(irq, 0xc01); 471 outb(irq, 0xc01);
460 return 1; 472 return 1;
461} 473}
462 474
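The pirq get/set handlers above, together with the new WARN_ON_ONCE() bounds checks, all index 4-bit "nybble" fields in the interrupt router's configuration space through read_config_nybble()/write_config_nybble(), which are defined earlier in irq.c and not shown in these hunks. A rough user-space model of the addressing those helpers are assumed to use (one config byte holds two nybbles; the index parity selects the high or low half):

#include <stdio.h>

/* Toy 256-byte "config space" standing in for pci_read/write_config_byte(). */
static unsigned char config_space[256];

static unsigned int read_config_nybble(unsigned int reg, unsigned int index)
{
	unsigned char b = config_space[reg + (index >> 1)];

	/* Odd indices are assumed to live in the high nibble. */
	return (index & 1) ? (b >> 4) : (b & 0x0f);
}

static void write_config_nybble(unsigned int reg, unsigned int index,
				unsigned int val)
{
	unsigned char *b = &config_space[reg + (index >> 1)];

	if (index & 1)
		*b = (*b & 0x0f) | ((val & 0x0f) << 4);
	else
		*b = (*b & 0xf0) | (val & 0x0f);
}

int main(void)
{
	write_config_nybble(0x48, 3, 0xa);
	printf("0x%x\n", read_config_nybble(0x48, 3));	/* prints 0xa */
	return 0;
}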
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 998fd3ec0d68..efcf620d1439 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -19,7 +19,7 @@ unsigned long saved_context_esp, saved_context_ebp;
19unsigned long saved_context_esi, saved_context_edi; 19unsigned long saved_context_esi, saved_context_edi;
20unsigned long saved_context_eflags; 20unsigned long saved_context_eflags;
21 21
22void __save_processor_state(struct saved_context *ctxt) 22static void __save_processor_state(struct saved_context *ctxt)
23{ 23{
24 mtrr_save_fixed_ranges(NULL); 24 mtrr_save_fixed_ranges(NULL);
25 kernel_fpu_begin(); 25 kernel_fpu_begin();
@@ -74,19 +74,19 @@ static void fix_processor_context(void)
74 /* 74 /*
75 * Now maybe reload the debug registers 75 * Now maybe reload the debug registers
76 */ 76 */
77 if (current->thread.debugreg[7]){ 77 if (current->thread.debugreg7) {
78 set_debugreg(current->thread.debugreg[0], 0); 78 set_debugreg(current->thread.debugreg0, 0);
79 set_debugreg(current->thread.debugreg[1], 1); 79 set_debugreg(current->thread.debugreg1, 1);
80 set_debugreg(current->thread.debugreg[2], 2); 80 set_debugreg(current->thread.debugreg2, 2);
81 set_debugreg(current->thread.debugreg[3], 3); 81 set_debugreg(current->thread.debugreg3, 3);
82 /* no 4 and 5 */ 82 /* no 4 and 5 */
83 set_debugreg(current->thread.debugreg[6], 6); 83 set_debugreg(current->thread.debugreg6, 6);
84 set_debugreg(current->thread.debugreg[7], 7); 84 set_debugreg(current->thread.debugreg7, 7);
85 } 85 }
86 86
87} 87}
88 88
89void __restore_processor_state(struct saved_context *ctxt) 89static void __restore_processor_state(struct saved_context *ctxt)
90{ 90{
91 /* 91 /*
92 * control registers 92 * control registers
diff --git a/arch/x86/vdso/.gitignore b/arch/x86/vdso/.gitignore
index f8b69d84238e..60274d5746e1 100644
--- a/arch/x86/vdso/.gitignore
+++ b/arch/x86/vdso/.gitignore
@@ -1 +1,6 @@
1vdso.lds 1vdso.lds
2vdso-syms.lds
3vdso32-syms.lds
4vdso32-syscall-syms.lds
5vdso32-sysenter-syms.lds
6vdso32-int80-syms.lds
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index e7bff0fbac23..d28dda574700 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -1,39 +1,37 @@
1# 1#
2# x86-64 vDSO. 2# Building vDSO images for x86.
3# 3#
4 4
5VDSO64-$(CONFIG_X86_64) := y
6VDSO32-$(CONFIG_X86_32) := y
7VDSO32-$(CONFIG_COMPAT) := y
8
9vdso-install-$(VDSO64-y) += vdso.so
10vdso-install-$(VDSO32-y) += $(vdso32-y:=.so)
11
12
5# files to link into the vdso 13# files to link into the vdso
6# vdso-start.o has to be first 14vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vvar.o
7vobjs-y := vdso-start.o vdso-note.o vclock_gettime.o vgetcpu.o vvar.o
8 15
9# files to link into kernel 16# files to link into kernel
10obj-y := vma.o vdso.o vdso-syms.o 17obj-$(VDSO64-y) += vma.o vdso.o
18obj-$(VDSO32-y) += vdso32.o vdso32-setup.o
11 19
12vobjs := $(foreach F,$(vobjs-y),$(obj)/$F) 20vobjs := $(foreach F,$(vobjs-y),$(obj)/$F)
13 21
14$(obj)/vdso.o: $(obj)/vdso.so 22$(obj)/vdso.o: $(obj)/vdso.so
15 23
16targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y) vdso-syms.o 24targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y)
17
18# The DSO images are built using a special linker script.
19quiet_cmd_syscall = SYSCALL $@
20 cmd_syscall = $(CC) -m elf_x86_64 -nostdlib $(SYSCFLAGS_$(@F)) \
21 -Wl,-T,$(filter-out FORCE,$^) -o $@
22 25
23export CPPFLAGS_vdso.lds += -P -C 26export CPPFLAGS_vdso.lds += -P -C
24 27
25vdso-flags = -fPIC -shared -Wl,-soname=linux-vdso.so.1 \ 28VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -Wl,-soname=linux-vdso.so.1 \
26 $(call ld-option, -Wl$(comma)--hash-style=sysv) \ 29 -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
27 -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
28SYSCFLAGS_vdso.so = $(vdso-flags)
29SYSCFLAGS_vdso.so.dbg = $(vdso-flags)
30 30
31$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so 31$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so
32 32
33$(obj)/vdso.so: $(src)/vdso.lds $(vobjs) FORCE
34
35$(obj)/vdso.so.dbg: $(src)/vdso.lds $(vobjs) FORCE 33$(obj)/vdso.so.dbg: $(src)/vdso.lds $(vobjs) FORCE
36 $(call if_changed,syscall) 34 $(call if_changed,vdso)
37 35
38$(obj)/%.so: OBJCOPYFLAGS := -S 36$(obj)/%.so: OBJCOPYFLAGS := -S
39$(obj)/%.so: $(obj)/%.so.dbg FORCE 37$(obj)/%.so: $(obj)/%.so.dbg FORCE
@@ -41,24 +39,96 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE
41 39
42CFL := $(PROFILING) -mcmodel=small -fPIC -g0 -O2 -fasynchronous-unwind-tables -m64 40CFL := $(PROFILING) -mcmodel=small -fPIC -g0 -O2 -fasynchronous-unwind-tables -m64
43 41
44$(obj)/vclock_gettime.o: KBUILD_CFLAGS = $(CFL) 42$(vobjs): KBUILD_CFLAGS = $(CFL)
45$(obj)/vgetcpu.o: KBUILD_CFLAGS = $(CFL) 43
44targets += vdso-syms.lds
45obj-$(VDSO64-y) += vdso-syms.lds
46
47#
48# Match symbols in the DSO that look like VDSO*; produce a file of constants.
49#
50sed-vdsosym := -e 's/^00*/0/' \
51 -e 's/^\([0-9a-fA-F]*\) . \(VDSO[a-zA-Z0-9_]*\)$$/\2 = 0x\1;/p'
52quiet_cmd_vdsosym = VDSOSYM $@
53 cmd_vdsosym = $(NM) $< | sed -n $(sed-vdsosym) | LC_ALL=C sort > $@
54
55$(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE
56 $(call if_changed,vdsosym)
57
58#
59# Build multiple 32-bit vDSO images to choose from at boot time.
60#
61obj-$(VDSO32-y) += vdso32-syms.lds
62vdso32.so-$(CONFIG_X86_32) += int80
63vdso32.so-$(CONFIG_COMPAT) += syscall
64vdso32.so-$(VDSO32-y) += sysenter
65
66CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds)
67VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -Wl,-soname=linux-gate.so.1
68
69# This makes sure the $(obj) subdirectory exists even though vdso32/
70# is not a kbuild sub-make subdirectory.
71override obj-dirs = $(dir $(obj)) $(obj)/vdso32/
46 72
47# We also create a special relocatable object that should mirror the symbol 73targets += vdso32/vdso32.lds
48# table and layout of the linked DSO. With ld -R we can then refer to 74targets += $(vdso32.so-y:%=vdso32-%.so.dbg) $(vdso32.so-y:%=vdso32-%.so)
49# these symbols in the kernel code rather than hand-coded addresses. 75targets += vdso32/note.o $(vdso32.so-y:%=vdso32/%.o)
50extra-y += vdso-syms.o
51$(obj)/built-in.o: $(obj)/vdso-syms.o
52$(obj)/built-in.o: ld_flags += -R $(obj)/vdso-syms.o
53 76
54SYSCFLAGS_vdso-syms.o = -r -d 77extra-y += $(vdso32.so-y:%=vdso32-%.so)
55$(obj)/vdso-syms.o: $(src)/vdso.lds $(vobjs) FORCE
56 $(call if_changed,syscall)
57 78
79$(obj)/vdso32.o: $(vdso32.so-y:%=$(obj)/vdso32-%.so)
80
81KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS))
82$(vdso32.so-y:%=$(obj)/vdso32-%.so.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
83$(vdso32.so-y:%=$(obj)/vdso32-%.so.dbg): asflags-$(CONFIG_X86_64) += -m32
84
85$(vdso32.so-y:%=$(obj)/vdso32-%.so.dbg): $(obj)/vdso32-%.so.dbg: FORCE \
86 $(obj)/vdso32/vdso32.lds \
87 $(obj)/vdso32/note.o \
88 $(obj)/vdso32/%.o
89 $(call if_changed,vdso)
90
91# Make vdso32-*-syms.lds from each image, and then make sure they match.
92# The only difference should be that some do not define VDSO32_SYSENTER_RETURN.
93
94targets += vdso32-syms.lds $(vdso32.so-y:%=vdso32-%-syms.lds)
95
96quiet_cmd_vdso32sym = VDSOSYM $@
97define cmd_vdso32sym
98 if LC_ALL=C sort -u $(filter-out FORCE,$^) > $(@D)/.tmp_$(@F) && \
99 $(foreach H,$(filter-out FORCE,$^),\
100 if grep -q VDSO32_SYSENTER_RETURN $H; \
101 then diff -u $(@D)/.tmp_$(@F) $H; \
102 else sed /VDSO32_SYSENTER_RETURN/d $(@D)/.tmp_$(@F) | \
103 diff -u - $H; fi &&) : ;\
104 then mv -f $(@D)/.tmp_$(@F) $@; \
105 else rm -f $(@D)/.tmp_$(@F); exit 1; \
106 fi
107endef
108
109$(obj)/vdso32-syms.lds: $(vdso32.so-y:%=$(obj)/vdso32-%-syms.lds) FORCE
110 $(call if_changed,vdso32sym)
111
112#
113# The DSO images are built using a special linker script.
114#
115quiet_cmd_vdso = VDSO $@
116 cmd_vdso = $(CC) -nostdlib -o $@ \
117 $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
118 -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^)
119
120VDSO_LDFLAGS = -fPIC -shared $(call ld-option, -Wl$(comma)--hash-style=sysv)
121
122#
123# Install the unstripped copy of vdso*.so listed in $(vdso-install-y).
124#
58quiet_cmd_vdso_install = INSTALL $@ 125quiet_cmd_vdso_install = INSTALL $@
59 cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@ 126 cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@
60vdso.so: 127$(vdso-install-y): %.so: $(obj)/%.so.dbg FORCE
61 @mkdir -p $(MODLIB)/vdso 128 @mkdir -p $(MODLIB)/vdso
62 $(call cmd,vdso_install) 129 $(call cmd,vdso_install)
63 130
64vdso_install: vdso.so 131PHONY += vdso_install $(vdso-install-y)
132vdso_install: $(vdso-install-y)
133
134clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80*
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 5b54cdfb2b07..23476c2ebfc4 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -19,7 +19,6 @@
19#include <asm/hpet.h> 19#include <asm/hpet.h>
20#include <asm/unistd.h> 20#include <asm/unistd.h>
21#include <asm/io.h> 21#include <asm/io.h>
22#include <asm/vgtod.h>
23#include "vextern.h" 22#include "vextern.h"
24 23
25#define gtod vdso_vsyscall_gtod_data 24#define gtod vdso_vsyscall_gtod_data
diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/vdso/vdso-layout.lds.S
new file mode 100644
index 000000000000..634a2cf62046
--- /dev/null
+++ b/arch/x86/vdso/vdso-layout.lds.S
@@ -0,0 +1,64 @@
1/*
2 * Linker script for vDSO. This is an ELF shared object prelinked to
3 * its virtual address, and with only one read-only segment.
4 * This script controls its layout.
5 */
6
7SECTIONS
8{
9 . = VDSO_PRELINK + SIZEOF_HEADERS;
10
11 .hash : { *(.hash) } :text
12 .gnu.hash : { *(.gnu.hash) }
13 .dynsym : { *(.dynsym) }
14 .dynstr : { *(.dynstr) }
15 .gnu.version : { *(.gnu.version) }
16 .gnu.version_d : { *(.gnu.version_d) }
17 .gnu.version_r : { *(.gnu.version_r) }
18
19 .note : { *(.note.*) } :text :note
20
21 .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
22 .eh_frame : { KEEP (*(.eh_frame)) } :text
23
24 .dynamic : { *(.dynamic) } :text :dynamic
25
26 .rodata : { *(.rodata*) } :text
27 .data : {
28 *(.data*)
29 *(.sdata*)
30 *(.got.plt) *(.got)
31 *(.gnu.linkonce.d.*)
32 *(.bss*)
33 *(.dynbss*)
34 *(.gnu.linkonce.b.*)
35 }
36
37 .altinstructions : { *(.altinstructions) }
38 .altinstr_replacement : { *(.altinstr_replacement) }
39
40 /*
41 * Align the actual code well away from the non-instruction data.
42 * This is the best thing for the I-cache.
43 */
44 . = ALIGN(0x100);
45
46 .text : { *(.text*) } :text =0x90909090
47}
48
49/*
50 * Very old versions of ld do not recognize this name token; use the constant.
51 */
52#define PT_GNU_EH_FRAME 0x6474e550
53
54/*
55 * We must supply the ELF program headers explicitly to get just one
56 * PT_LOAD segment, and set the flags explicitly to make segments read-only.
57 */
58PHDRS
59{
60 text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */
61 dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
62 note PT_NOTE FLAGS(4); /* PF_R */
63 eh_frame_hdr PT_GNU_EH_FRAME;
64}
diff --git a/arch/x86/vdso/vdso-start.S b/arch/x86/vdso/vdso-start.S
deleted file mode 100644
index 2dc2cdb84d67..000000000000
--- a/arch/x86/vdso/vdso-start.S
+++ /dev/null
@@ -1,2 +0,0 @@
1 .globl vdso_kernel_start
2vdso_kernel_start:
diff --git a/arch/x86/vdso/vdso.lds.S b/arch/x86/vdso/vdso.lds.S
index 667d3245d972..4e5dd3b4de7f 100644
--- a/arch/x86/vdso/vdso.lds.S
+++ b/arch/x86/vdso/vdso.lds.S
@@ -1,79 +1,37 @@
1/* 1/*
2 * Linker script for vsyscall DSO. The vsyscall page is an ELF shared 2 * Linker script for 64-bit vDSO.
3 * object prelinked to its virtual address, and with only one read-only 3 * We #include the file to define the layout details.
4 * segment (that fits in one page). This script controls its layout. 4 * Here we only choose the prelinked virtual address.
5 *
6 * This file defines the version script giving the user-exported symbols in
7 * the DSO. We can define local symbols here called VDSO* to make their
8 * values visible using the asm-x86/vdso.h macros from the kernel proper.
5 */ 9 */
6#include <asm/asm-offsets.h>
7#include "voffset.h"
8 10
9#define VDSO_PRELINK 0xffffffffff700000 11#define VDSO_PRELINK 0xffffffffff700000
10 12#include "vdso-layout.lds.S"
11SECTIONS
12{
13 . = VDSO_PRELINK + SIZEOF_HEADERS;
14
15 .hash : { *(.hash) } :text
16 .gnu.hash : { *(.gnu.hash) }
17 .dynsym : { *(.dynsym) }
18 .dynstr : { *(.dynstr) }
19 .gnu.version : { *(.gnu.version) }
20 .gnu.version_d : { *(.gnu.version_d) }
21 .gnu.version_r : { *(.gnu.version_r) }
22
23 /* This linker script is used both with -r and with -shared.
24 For the layouts to match, we need to skip more than enough
25 space for the dynamic symbol table et al. If this amount
26 is insufficient, ld -shared will barf. Just increase it here. */
27 . = VDSO_PRELINK + VDSO_TEXT_OFFSET;
28
29 .text : { *(.text*) } :text
30 .rodata : { *(.rodata*) } :text
31 .data : {
32 *(.data*)
33 *(.sdata*)
34 *(.bss*)
35 *(.dynbss*)
36 } :text
37
38 .altinstructions : { *(.altinstructions) } :text
39 .altinstr_replacement : { *(.altinstr_replacement) } :text
40
41 .note : { *(.note.*) } :text :note
42 .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
43 .eh_frame : { KEEP (*(.eh_frame)) } :text
44 .dynamic : { *(.dynamic) } :text :dynamic
45 .useless : {
46 *(.got.plt) *(.got)
47 *(.gnu.linkonce.d.*)
48 *(.gnu.linkonce.b.*)
49 } :text
50}
51 13
52/* 14/*
53 * We must supply the ELF program headers explicitly to get just one 15 * This controls what userland symbols we export from the vDSO.
54 * PT_LOAD segment, and set the flags explicitly to make segments read-only.
55 */ 16 */
56PHDRS 17VERSION {
57{ 18 LINUX_2.6 {
58 text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ 19 global:
59 dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ 20 clock_gettime;
60 note PT_NOTE FLAGS(4); /* PF_R */ 21 __vdso_clock_gettime;
61 eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */ 22 gettimeofday;
23 __vdso_gettimeofday;
24 getcpu;
25 __vdso_getcpu;
26 local: *;
27 };
62} 28}
63 29
30VDSO64_PRELINK = VDSO_PRELINK;
31
64/* 32/*
65 * This controls what symbols we export from the DSO. 33 * Define VDSO64_x for each VEXTERN(x), for use via VDSO64_SYMBOL.
66 */ 34 */
67VERSION 35#define VEXTERN(x) VDSO64_ ## x = vdso_ ## x;
68{ 36#include "vextern.h"
69 LINUX_2.6 { 37#undef VEXTERN
70 global:
71 clock_gettime;
72 __vdso_clock_gettime;
73 gettimeofday;
74 __vdso_gettimeofday;
75 getcpu;
76 __vdso_getcpu;
77 local: *;
78 };
79}
diff --git a/arch/x86/kernel/sysenter_32.c b/arch/x86/vdso/vdso32-setup.c
index 5a2d951e2608..348f1341e1c8 100644
--- a/arch/x86/kernel/sysenter_32.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -23,6 +23,8 @@
23#include <asm/unistd.h> 23#include <asm/unistd.h>
24#include <asm/elf.h> 24#include <asm/elf.h>
25#include <asm/tlbflush.h> 25#include <asm/tlbflush.h>
26#include <asm/vdso.h>
27#include <asm/proto.h>
26 28
27enum { 29enum {
28 VDSO_DISABLED = 0, 30 VDSO_DISABLED = 0,
@@ -36,14 +38,24 @@ enum {
36#define VDSO_DEFAULT VDSO_ENABLED 38#define VDSO_DEFAULT VDSO_ENABLED
37#endif 39#endif
38 40
41#ifdef CONFIG_X86_64
42#define vdso_enabled sysctl_vsyscall32
43#define arch_setup_additional_pages syscall32_setup_pages
44#endif
45
46/*
47 * This is the difference between the prelinked addresses in the vDSO images
48 * and the VDSO_HIGH_BASE address where CONFIG_COMPAT_VDSO places the vDSO
49 * in the user address space.
50 */
51#define VDSO_ADDR_ADJUST (VDSO_HIGH_BASE - (unsigned long)VDSO32_PRELINK)
52
39/* 53/*
40 * Should the kernel map a VDSO page into processes and pass its 54 * Should the kernel map a VDSO page into processes and pass its
41 * address down to glibc upon exec()? 55 * address down to glibc upon exec()?
42 */ 56 */
43unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT; 57unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT;
44 58
45EXPORT_SYMBOL_GPL(vdso_enabled);
46
47static int __init vdso_setup(char *s) 59static int __init vdso_setup(char *s)
48{ 60{
49 vdso_enabled = simple_strtoul(s, NULL, 0); 61 vdso_enabled = simple_strtoul(s, NULL, 0);
@@ -51,9 +63,18 @@ static int __init vdso_setup(char *s)
51 return 1; 63 return 1;
52} 64}
53 65
54__setup("vdso=", vdso_setup); 66/*
67 * For consistency, the argument vdso32=[012] affects the 32-bit vDSO
68 * behavior on both 64-bit and 32-bit kernels.
69 * On 32-bit kernels, vdso=[012] means the same thing.
70 */
71__setup("vdso32=", vdso_setup);
72
73#ifdef CONFIG_X86_32
74__setup_param("vdso=", vdso32_setup, vdso_setup, 0);
55 75
56extern asmlinkage void sysenter_entry(void); 76EXPORT_SYMBOL_GPL(vdso_enabled);
77#endif
57 78
58static __init void reloc_symtab(Elf32_Ehdr *ehdr, 79static __init void reloc_symtab(Elf32_Ehdr *ehdr,
59 unsigned offset, unsigned size) 80 unsigned offset, unsigned size)
@@ -78,7 +99,7 @@ static __init void reloc_symtab(Elf32_Ehdr *ehdr,
78 case STT_FUNC: 99 case STT_FUNC:
79 case STT_SECTION: 100 case STT_SECTION:
80 case STT_FILE: 101 case STT_FILE:
81 sym->st_value += VDSO_HIGH_BASE; 102 sym->st_value += VDSO_ADDR_ADJUST;
82 } 103 }
83 } 104 }
84} 105}
@@ -104,7 +125,7 @@ static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset)
104 case DT_VERNEED: 125 case DT_VERNEED:
105 case DT_ADDRRNGLO ... DT_ADDRRNGHI: 126 case DT_ADDRRNGLO ... DT_ADDRRNGHI:
106 /* definitely pointers needing relocation */ 127 /* definitely pointers needing relocation */
107 dyn->d_un.d_ptr += VDSO_HIGH_BASE; 128 dyn->d_un.d_ptr += VDSO_ADDR_ADJUST;
108 break; 129 break;
109 130
110 case DT_ENCODING ... OLD_DT_LOOS-1: 131 case DT_ENCODING ... OLD_DT_LOOS-1:
@@ -113,7 +134,7 @@ static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset)
113 they're even */ 134 they're even */
114 if (dyn->d_tag >= DT_ENCODING && 135 if (dyn->d_tag >= DT_ENCODING &&
115 (dyn->d_tag & 1) == 0) 136 (dyn->d_tag & 1) == 0)
116 dyn->d_un.d_ptr += VDSO_HIGH_BASE; 137 dyn->d_un.d_ptr += VDSO_ADDR_ADJUST;
117 break; 138 break;
118 139
119 case DT_VERDEFNUM: 140 case DT_VERDEFNUM:
@@ -142,15 +163,15 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr)
142 int i; 163 int i;
143 164
144 BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 || 165 BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 ||
145 !elf_check_arch(ehdr) || 166 !elf_check_arch_ia32(ehdr) ||
146 ehdr->e_type != ET_DYN); 167 ehdr->e_type != ET_DYN);
147 168
148 ehdr->e_entry += VDSO_HIGH_BASE; 169 ehdr->e_entry += VDSO_ADDR_ADJUST;
149 170
150 /* rebase phdrs */ 171 /* rebase phdrs */
151 phdr = (void *)ehdr + ehdr->e_phoff; 172 phdr = (void *)ehdr + ehdr->e_phoff;
152 for (i = 0; i < ehdr->e_phnum; i++) { 173 for (i = 0; i < ehdr->e_phnum; i++) {
153 phdr[i].p_vaddr += VDSO_HIGH_BASE; 174 phdr[i].p_vaddr += VDSO_ADDR_ADJUST;
154 175
155 /* relocate dynamic stuff */ 176 /* relocate dynamic stuff */
156 if (phdr[i].p_type == PT_DYNAMIC) 177 if (phdr[i].p_type == PT_DYNAMIC)
@@ -163,7 +184,7 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr)
163 if (!(shdr[i].sh_flags & SHF_ALLOC)) 184 if (!(shdr[i].sh_flags & SHF_ALLOC))
164 continue; 185 continue;
165 186
166 shdr[i].sh_addr += VDSO_HIGH_BASE; 187 shdr[i].sh_addr += VDSO_ADDR_ADJUST;
167 188
168 if (shdr[i].sh_type == SHT_SYMTAB || 189 if (shdr[i].sh_type == SHT_SYMTAB ||
169 shdr[i].sh_type == SHT_DYNSYM) 190 shdr[i].sh_type == SHT_DYNSYM)
@@ -172,6 +193,45 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr)
172 } 193 }
173} 194}
174 195
196/*
197 * These symbols are defined by vdso32.S to mark the bounds
198 * of the ELF DSO images included therein.
199 */
200extern const char vdso32_default_start, vdso32_default_end;
201extern const char vdso32_sysenter_start, vdso32_sysenter_end;
202static struct page *vdso32_pages[1];
203
204#ifdef CONFIG_X86_64
205
206static int use_sysenter __read_mostly = -1;
207
208#define vdso32_sysenter() (use_sysenter > 0)
209
210/* May not be __init: called during resume */
211void syscall32_cpu_init(void)
212{
213 if (use_sysenter < 0)
214 use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL);
215
216 /* Load these always in case some future AMD CPU supports
217 SYSENTER from compat mode too. */
218 checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
219 checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL);
220 checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
221
222 wrmsrl(MSR_CSTAR, ia32_cstar_target);
223}
224
225#define compat_uses_vma 1
226
227static inline void map_compat_vdso(int map)
228{
229}
230
231#else /* CONFIG_X86_32 */
232
233#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
234
175void enable_sep_cpu(void) 235void enable_sep_cpu(void)
176{ 236{
177 int cpu = get_cpu(); 237 int cpu = get_cpu();
@@ -183,10 +243,10 @@ void enable_sep_cpu(void)
183 } 243 }
184 244
185 tss->x86_tss.ss1 = __KERNEL_CS; 245 tss->x86_tss.ss1 = __KERNEL_CS;
186 tss->x86_tss.esp1 = sizeof(struct tss_struct) + (unsigned long) tss; 246 tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss;
187 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); 247 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
188 wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.esp1, 0); 248 wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0);
189 wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0); 249 wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0);
190 put_cpu(); 250 put_cpu();
191} 251}
192 252
@@ -209,13 +269,7 @@ static int __init gate_vma_init(void)
209 return 0; 269 return 0;
210} 270}
211 271
212/* 272#define compat_uses_vma 0
213 * These symbols are defined by vsyscall.o to mark the bounds
214 * of the ELF DSO images included therein.
215 */
216extern const char vsyscall_int80_start, vsyscall_int80_end;
217extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;
218static struct page *syscall_pages[1];
219 273
220static void map_compat_vdso(int map) 274static void map_compat_vdso(int map)
221{ 275{
@@ -226,31 +280,35 @@ static void map_compat_vdso(int map)
226 280
227 vdso_mapped = map; 281 vdso_mapped = map;
228 282
229 __set_fixmap(FIX_VDSO, page_to_pfn(syscall_pages[0]) << PAGE_SHIFT, 283 __set_fixmap(FIX_VDSO, page_to_pfn(vdso32_pages[0]) << PAGE_SHIFT,
230 map ? PAGE_READONLY_EXEC : PAGE_NONE); 284 map ? PAGE_READONLY_EXEC : PAGE_NONE);
231 285
232 /* flush stray tlbs */ 286 /* flush stray tlbs */
233 flush_tlb_all(); 287 flush_tlb_all();
234} 288}
235 289
290#endif /* CONFIG_X86_64 */
291
236int __init sysenter_setup(void) 292int __init sysenter_setup(void)
237{ 293{
238 void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC); 294 void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
239 const void *vsyscall; 295 const void *vsyscall;
240 size_t vsyscall_len; 296 size_t vsyscall_len;
241 297
242 syscall_pages[0] = virt_to_page(syscall_page); 298 vdso32_pages[0] = virt_to_page(syscall_page);
243 299
300#ifdef CONFIG_X86_32
244 gate_vma_init(); 301 gate_vma_init();
245 302
246 printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO)); 303 printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
304#endif
247 305
248 if (!boot_cpu_has(X86_FEATURE_SEP)) { 306 if (!vdso32_sysenter()) {
249 vsyscall = &vsyscall_int80_start; 307 vsyscall = &vdso32_default_start;
250 vsyscall_len = &vsyscall_int80_end - &vsyscall_int80_start; 308 vsyscall_len = &vdso32_default_end - &vdso32_default_start;
251 } else { 309 } else {
252 vsyscall = &vsyscall_sysenter_start; 310 vsyscall = &vdso32_sysenter_start;
253 vsyscall_len = &vsyscall_sysenter_end - &vsyscall_sysenter_start; 311 vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
254 } 312 }
255 313
256 memcpy(syscall_page, vsyscall, vsyscall_len); 314 memcpy(syscall_page, vsyscall, vsyscall_len);
@@ -259,9 +317,6 @@ int __init sysenter_setup(void)
259 return 0; 317 return 0;
260} 318}
261 319
262/* Defined in vsyscall-sysenter.S */
263extern void SYSENTER_RETURN;
264
265/* Setup a VMA at program startup for the vsyscall page */ 320/* Setup a VMA at program startup for the vsyscall page */
266int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) 321int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
267{ 322{
@@ -286,7 +341,9 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
286 ret = addr; 341 ret = addr;
287 goto up_fail; 342 goto up_fail;
288 } 343 }
344 }
289 345
346 if (compat_uses_vma || !compat) {
290 /* 347 /*
291 * MAYWRITE to allow gdb to COW and set breakpoints 348 * MAYWRITE to allow gdb to COW and set breakpoints
292 * 349 *
@@ -300,7 +357,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
300 VM_READ|VM_EXEC| 357 VM_READ|VM_EXEC|
301 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| 358 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
302 VM_ALWAYSDUMP, 359 VM_ALWAYSDUMP,
303 syscall_pages); 360 vdso32_pages);
304 361
305 if (ret) 362 if (ret)
306 goto up_fail; 363 goto up_fail;
@@ -308,7 +365,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
308 365
309 current->mm->context.vdso = (void *)addr; 366 current->mm->context.vdso = (void *)addr;
310 current_thread_info()->sysenter_return = 367 current_thread_info()->sysenter_return =
311 (void *)VDSO_SYM(&SYSENTER_RETURN); 368 VDSO32_SYMBOL(addr, SYSENTER_RETURN);
312 369
313 up_fail: 370 up_fail:
314 up_write(&mm->mmap_sem); 371 up_write(&mm->mmap_sem);
@@ -316,6 +373,45 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
316 return ret; 373 return ret;
317} 374}
318 375
376#ifdef CONFIG_X86_64
377
378__initcall(sysenter_setup);
379
380#ifdef CONFIG_SYSCTL
381/* Register vsyscall32 into the ABI table */
382#include <linux/sysctl.h>
383
384static ctl_table abi_table2[] = {
385 {
386 .procname = "vsyscall32",
387 .data = &sysctl_vsyscall32,
388 .maxlen = sizeof(int),
389 .mode = 0644,
390 .proc_handler = proc_dointvec
391 },
392 {}
393};
394
395static ctl_table abi_root_table2[] = {
396 {
397 .ctl_name = CTL_ABI,
398 .procname = "abi",
399 .mode = 0555,
400 .child = abi_table2
401 },
402 {}
403};
404
405static __init int ia32_binfmt_init(void)
406{
407 register_sysctl_table(abi_root_table2);
408 return 0;
409}
410__initcall(ia32_binfmt_init);
411#endif
412
413#else /* CONFIG_X86_32 */
414
319const char *arch_vma_name(struct vm_area_struct *vma) 415const char *arch_vma_name(struct vm_area_struct *vma)
320{ 416{
321 if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) 417 if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
@@ -344,3 +440,5 @@ int in_gate_area_no_task(unsigned long addr)
344{ 440{
345 return 0; 441 return 0;
346} 442}
443
444#endif /* CONFIG_X86_64 */
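The relocate_vdso() changes above replace the fixed VDSO_HIGH_BASE offset with VDSO_ADDR_ADJUST = VDSO_HIGH_BASE - VDSO32_PRELINK, so the rebase works regardless of the address the image was prelinked to (the new vdso32 linker script prelinks it at 0). A minimal sketch of the arithmetic, with illustrative numbers only; the real constants depend on the fixmap layout and the linker script:

#include <stdio.h>

int main(void)
{
	/* Illustrative values only; the real ones are configuration-dependent. */
	unsigned long vdso32_prelink = 0x0UL;		/* link-time base of the image */
	unsigned long vdso_high_base = 0xffffe000UL;	/* fixmap slot for CONFIG_COMPAT_VDSO */
	unsigned long adjust = vdso_high_base - vdso32_prelink;

	unsigned long e_entry = 0x400UL;	/* a prelinked address inside the image */

	/* The same rebase relocate_vdso() applies to e_entry, phdrs, shdrs, symbols. */
	printf("0x%lx -> 0x%lx\n", e_entry, e_entry + adjust);
	return 0;
}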
diff --git a/arch/x86/vdso/vdso32.S b/arch/x86/vdso/vdso32.S
new file mode 100644
index 000000000000..1e36f72cab86
--- /dev/null
+++ b/arch/x86/vdso/vdso32.S
@@ -0,0 +1,19 @@
1#include <linux/init.h>
2
3__INITDATA
4
5 .globl vdso32_default_start, vdso32_default_end
6vdso32_default_start:
7#ifdef CONFIG_X86_32
8 .incbin "arch/x86/vdso/vdso32-int80.so"
9#else
10 .incbin "arch/x86/vdso/vdso32-syscall.so"
11#endif
12vdso32_default_end:
13
14 .globl vdso32_sysenter_start, vdso32_sysenter_end
15vdso32_sysenter_start:
16 .incbin "arch/x86/vdso/vdso32-sysenter.so"
17vdso32_sysenter_end:
18
19__FINIT
diff --git a/arch/x86/vdso/vdso32/.gitignore b/arch/x86/vdso/vdso32/.gitignore
new file mode 100644
index 000000000000..e45fba9d0ced
--- /dev/null
+++ b/arch/x86/vdso/vdso32/.gitignore
@@ -0,0 +1 @@
1vdso32.lds
diff --git a/arch/x86/kernel/vsyscall-int80_32.S b/arch/x86/vdso/vdso32/int80.S
index 103cab6aa7c0..b15b7c01aedb 100644
--- a/arch/x86/kernel/vsyscall-int80_32.S
+++ b/arch/x86/vdso/vdso32/int80.S
@@ -1,15 +1,15 @@
1/* 1/*
2 * Code for the vsyscall page. This version uses the old int $0x80 method. 2 * Code for the vDSO. This version uses the old int $0x80 method.
3 * 3 *
4 * NOTE: 4 * First get the common code for the sigreturn entry points.
5 * 1) __kernel_vsyscall _must_ be first in this page. 5 * This must come first.
6 * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S
7 * for details.
8 */ 6 */
7#include "sigreturn.S"
9 8
10 .text 9 .text
11 .globl __kernel_vsyscall 10 .globl __kernel_vsyscall
12 .type __kernel_vsyscall,@function 11 .type __kernel_vsyscall,@function
12 ALIGN
13__kernel_vsyscall: 13__kernel_vsyscall:
14.LSTART_vsyscall: 14.LSTART_vsyscall:
15 int $0x80 15 int $0x80
@@ -47,7 +47,10 @@ __kernel_vsyscall:
47.LENDFDEDLSI: 47.LENDFDEDLSI:
48 .previous 48 .previous
49 49
50/* 50 /*
51 * Get the common code for the sigreturn entry points. 51 * Pad out the segment to match the size of the sysenter.S version.
52 */ 52 */
53#include "vsyscall-sigreturn_32.S" 53VDSO32_vsyscall_eh_frame_size = 0x40
54 .section .data,"aw",@progbits
55 .space VDSO32_vsyscall_eh_frame_size-(.LENDFDEDLSI-.LSTARTFRAMEDLSI), 0
56 .previous
diff --git a/arch/x86/kernel/vsyscall-note_32.S b/arch/x86/vdso/vdso32/note.S
index fcf376a37f79..c83f25734696 100644
--- a/arch/x86/kernel/vsyscall-note_32.S
+++ b/arch/x86/vdso/vdso32/note.S
@@ -33,12 +33,11 @@ ELFNOTE_END
33 * at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen. 33 * at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen.
34 */ 34 */
35 35
36#include "../../x86/xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */ 36#include "../../xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */
37 37
38 .globl VDSO_NOTE_MASK
39ELFNOTE_START(GNU, 2, "a") 38ELFNOTE_START(GNU, 2, "a")
40 .long 1 /* ncaps */ 39 .long 1 /* ncaps */
41VDSO_NOTE_MASK: 40VDSO32_NOTE_MASK: /* Symbol used by arch/x86/xen/setup.c */
42 .long 0 /* mask */ 41 .long 0 /* mask */
43 .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */ 42 .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */
44ELFNOTE_END 43ELFNOTE_END
diff --git a/arch/x86/kernel/vsyscall-sigreturn_32.S b/arch/x86/vdso/vdso32/sigreturn.S
index a92262f41659..31776d0efc8c 100644
--- a/arch/x86/kernel/vsyscall-sigreturn_32.S
+++ b/arch/x86/vdso/vdso32/sigreturn.S
@@ -1,41 +1,42 @@
1/* 1/*
2 * Common code for the sigreturn entry points on the vsyscall page. 2 * Common code for the sigreturn entry points in vDSO images.
3 * So far this code is the same for both int80 and sysenter versions. 3 * So far this code is the same for both int80 and sysenter versions.
4 * This file is #include'd by vsyscall-*.S to define them after the 4 * This file is #include'd by int80.S et al to define them first thing.
5 * vsyscall entry point. The kernel assumes that the addresses of these 5 * The kernel assumes that the addresses of these routines are constant
6 * routines are constant for all vsyscall implementations. 6 * for all vDSO implementations.
7 */ 7 */
8 8
9#include <asm/unistd.h> 9#include <linux/linkage.h>
10#include <asm/unistd_32.h>
10#include <asm/asm-offsets.h> 11#include <asm/asm-offsets.h>
11 12
12 13#ifndef SYSCALL_ENTER_KERNEL
13/* XXX 14#define SYSCALL_ENTER_KERNEL int $0x80
14 Should these be named "_sigtramp" or something? 15#endif
15*/
16 16
17 .text 17 .text
18 .org __kernel_vsyscall+32,0x90
19 .globl __kernel_sigreturn 18 .globl __kernel_sigreturn
20 .type __kernel_sigreturn,@function 19 .type __kernel_sigreturn,@function
20 ALIGN
21__kernel_sigreturn: 21__kernel_sigreturn:
22.LSTART_sigreturn: 22.LSTART_sigreturn:
23 popl %eax /* XXX does this mean it needs unwind info? */ 23 popl %eax /* XXX does this mean it needs unwind info? */
24 movl $__NR_sigreturn, %eax 24 movl $__NR_sigreturn, %eax
25 int $0x80 25 SYSCALL_ENTER_KERNEL
26.LEND_sigreturn: 26.LEND_sigreturn:
27 nop
27 .size __kernel_sigreturn,.-.LSTART_sigreturn 28 .size __kernel_sigreturn,.-.LSTART_sigreturn
28 29
29 .balign 32
30 .globl __kernel_rt_sigreturn 30 .globl __kernel_rt_sigreturn
31 .type __kernel_rt_sigreturn,@function 31 .type __kernel_rt_sigreturn,@function
32 ALIGN
32__kernel_rt_sigreturn: 33__kernel_rt_sigreturn:
33.LSTART_rt_sigreturn: 34.LSTART_rt_sigreturn:
34 movl $__NR_rt_sigreturn, %eax 35 movl $__NR_rt_sigreturn, %eax
35 int $0x80 36 SYSCALL_ENTER_KERNEL
36.LEND_rt_sigreturn: 37.LEND_rt_sigreturn:
38 nop
37 .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn 39 .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn
38 .balign 32
39 .previous 40 .previous
40 41
41 .section .eh_frame,"a",@progbits 42 .section .eh_frame,"a",@progbits
@@ -70,9 +71,9 @@ __kernel_rt_sigreturn:
70 be the value of the stack pointer in the caller. This means 71 be the value of the stack pointer in the caller. This means
71 that we must define the CFA of this body of code to be the 72 that we must define the CFA of this body of code to be the
72 saved value of the stack pointer in the sigcontext. Which 73 saved value of the stack pointer in the sigcontext. Which
73 also means that there is no fixed relation to the other 74 also means that there is no fixed relation to the other
74 saved registers, which means that we must use DW_CFA_expression 75 saved registers, which means that we must use DW_CFA_expression
75 to compute their addresses. It also means that when we 76 to compute their addresses. It also means that when we
76 adjust the stack with the popl, we have to do it all over again. */ 77 adjust the stack with the popl, we have to do it all over again. */
77 78
78#define do_cfa_expr(offset) \ 79#define do_cfa_expr(offset) \
@@ -91,27 +92,27 @@ __kernel_rt_sigreturn:
91 .sleb128 offset; /* offset */ \ 92 .sleb128 offset; /* offset */ \
921: 931:
93 94
94 do_cfa_expr(SIGCONTEXT_esp+4) 95 do_cfa_expr(IA32_SIGCONTEXT_sp+4)
95 do_expr(0, SIGCONTEXT_eax+4) 96 do_expr(0, IA32_SIGCONTEXT_ax+4)
96 do_expr(1, SIGCONTEXT_ecx+4) 97 do_expr(1, IA32_SIGCONTEXT_cx+4)
97 do_expr(2, SIGCONTEXT_edx+4) 98 do_expr(2, IA32_SIGCONTEXT_dx+4)
98 do_expr(3, SIGCONTEXT_ebx+4) 99 do_expr(3, IA32_SIGCONTEXT_bx+4)
99 do_expr(5, SIGCONTEXT_ebp+4) 100 do_expr(5, IA32_SIGCONTEXT_bp+4)
100 do_expr(6, SIGCONTEXT_esi+4) 101 do_expr(6, IA32_SIGCONTEXT_si+4)
101 do_expr(7, SIGCONTEXT_edi+4) 102 do_expr(7, IA32_SIGCONTEXT_di+4)
102 do_expr(8, SIGCONTEXT_eip+4) 103 do_expr(8, IA32_SIGCONTEXT_ip+4)
103 104
104 .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */ 105 .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */
105 106
106 do_cfa_expr(SIGCONTEXT_esp) 107 do_cfa_expr(IA32_SIGCONTEXT_sp)
107 do_expr(0, SIGCONTEXT_eax) 108 do_expr(0, IA32_SIGCONTEXT_ax)
108 do_expr(1, SIGCONTEXT_ecx) 109 do_expr(1, IA32_SIGCONTEXT_cx)
109 do_expr(2, SIGCONTEXT_edx) 110 do_expr(2, IA32_SIGCONTEXT_dx)
110 do_expr(3, SIGCONTEXT_ebx) 111 do_expr(3, IA32_SIGCONTEXT_bx)
111 do_expr(5, SIGCONTEXT_ebp) 112 do_expr(5, IA32_SIGCONTEXT_bp)
112 do_expr(6, SIGCONTEXT_esi) 113 do_expr(6, IA32_SIGCONTEXT_si)
113 do_expr(7, SIGCONTEXT_edi) 114 do_expr(7, IA32_SIGCONTEXT_di)
114 do_expr(8, SIGCONTEXT_eip) 115 do_expr(8, IA32_SIGCONTEXT_ip)
115 116
116 .align 4 117 .align 4
117.LENDFDEDLSI1: 118.LENDFDEDLSI1:
@@ -128,15 +129,15 @@ __kernel_rt_sigreturn:
128 slightly less complicated than the above, since we don't 129 slightly less complicated than the above, since we don't
129 modify the stack pointer in the process. */ 130 modify the stack pointer in the process. */
130 131
131 do_cfa_expr(RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esp) 132 do_cfa_expr(IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_sp)
132 do_expr(0, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eax) 133 do_expr(0, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ax)
133 do_expr(1, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ecx) 134 do_expr(1, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_cx)
134 do_expr(2, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edx) 135 do_expr(2, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_dx)
135 do_expr(3, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebx) 136 do_expr(3, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bx)
136 do_expr(5, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebp) 137 do_expr(5, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bp)
137 do_expr(6, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esi) 138 do_expr(6, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_si)
138 do_expr(7, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edi) 139 do_expr(7, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_di)
139 do_expr(8, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eip) 140 do_expr(8, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ip)
140 141
141 .align 4 142 .align 4
142.LENDFDEDLSI2: 143.LENDFDEDLSI2:
diff --git a/arch/x86/ia32/vsyscall-syscall.S b/arch/x86/vdso/vdso32/syscall.S
index cf9ef678de3e..5415b5613d55 100644
--- a/arch/x86/ia32/vsyscall-syscall.S
+++ b/arch/x86/vdso/vdso32/syscall.S
@@ -1,16 +1,18 @@
1/* 1/*
2 * Code for the vsyscall page. This version uses the syscall instruction. 2 * Code for the vDSO. This version uses the syscall instruction.
3 *
4 * First get the common code for the sigreturn entry points.
5 * This must come first.
3 */ 6 */
7#define SYSCALL_ENTER_KERNEL syscall
8#include "sigreturn.S"
4 9
5#include <asm/ia32_unistd.h>
6#include <asm/asm-offsets.h>
7#include <asm/segment.h> 10#include <asm/segment.h>
8 11
9 .code32
10 .text 12 .text
11 .section .text.vsyscall,"ax"
12 .globl __kernel_vsyscall 13 .globl __kernel_vsyscall
13 .type __kernel_vsyscall,@function 14 .type __kernel_vsyscall,@function
15 ALIGN
14__kernel_vsyscall: 16__kernel_vsyscall:
15.LSTART_vsyscall: 17.LSTART_vsyscall:
16 push %ebp 18 push %ebp
@@ -64,6 +66,12 @@ __kernel_vsyscall:
64 .uleb128 4 66 .uleb128 4
65 .align 4 67 .align 4
66.LENDFDE1: 68.LENDFDE1:
69 .previous
67 70
68#define SYSCALL_ENTER_KERNEL syscall 71 /*
69#include "vsyscall-sigreturn.S" 72 * Pad out the segment to match the size of the sysenter.S version.
73 */
74VDSO32_vsyscall_eh_frame_size = 0x40
75 .section .data,"aw",@progbits
76 .space VDSO32_vsyscall_eh_frame_size-(.LENDFDE1-.LSTARTFRAME), 0
77 .previous
diff --git a/arch/x86/kernel/vsyscall-sysenter_32.S b/arch/x86/vdso/vdso32/sysenter.S
index ed879bf42995..e2800affa754 100644
--- a/arch/x86/kernel/vsyscall-sysenter_32.S
+++ b/arch/x86/vdso/vdso32/sysenter.S
@@ -1,11 +1,10 @@
1/* 1/*
2 * Code for the vsyscall page. This version uses the sysenter instruction. 2 * Code for the vDSO. This version uses the sysenter instruction.
3 * 3 *
4 * NOTE: 4 * First get the common code for the sigreturn entry points.
5 * 1) __kernel_vsyscall _must_ be first in this page. 5 * This must come first.
6 * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S
7 * for details.
8 */ 6 */
7#include "sigreturn.S"
9 8
10/* 9/*
11 * The caller puts arg2 in %ecx, which gets pushed. The kernel will use 10 * The caller puts arg2 in %ecx, which gets pushed. The kernel will use
@@ -23,11 +22,12 @@
23 * arg6 from the stack. 22 * arg6 from the stack.
24 * 23 *
25 * You can not use this vsyscall for the clone() syscall because the 24 * You can not use this vsyscall for the clone() syscall because the
26 * three dwords on the parent stack do not get copied to the child. 25 * three words on the parent stack do not get copied to the child.
27 */ 26 */
28 .text 27 .text
29 .globl __kernel_vsyscall 28 .globl __kernel_vsyscall
30 .type __kernel_vsyscall,@function 29 .type __kernel_vsyscall,@function
30 ALIGN
31__kernel_vsyscall: 31__kernel_vsyscall:
32.LSTART_vsyscall: 32.LSTART_vsyscall:
33 push %ecx 33 push %ecx
@@ -45,8 +45,7 @@ __kernel_vsyscall:
45 /* 14: System call restart point is here! (SYSENTER_RETURN-2) */ 45 /* 14: System call restart point is here! (SYSENTER_RETURN-2) */
46 jmp .Lenter_kernel 46 jmp .Lenter_kernel
47 /* 16: System call normal return point is here! */ 47 /* 16: System call normal return point is here! */
48 .globl SYSENTER_RETURN /* Symbol used by sysenter.c */ 48VDSO32_SYSENTER_RETURN: /* Symbol used by sysenter.c via vdso32-syms.h */
49SYSENTER_RETURN:
50 pop %ebp 49 pop %ebp
51.Lpop_ebp: 50.Lpop_ebp:
52 pop %edx 51 pop %edx
@@ -85,38 +84,33 @@ SYSENTER_RETURN:
85 .uleb128 0 84 .uleb128 0
86 /* What follows are the instructions for the table generation. 85 /* What follows are the instructions for the table generation.
87 We have to record all changes of the stack pointer. */ 86 We have to record all changes of the stack pointer. */
88 .byte 0x04 /* DW_CFA_advance_loc4 */ 87 .byte 0x40 + (.Lpush_ecx-.LSTART_vsyscall) /* DW_CFA_advance_loc */
89 .long .Lpush_ecx-.LSTART_vsyscall
90 .byte 0x0e /* DW_CFA_def_cfa_offset */ 88 .byte 0x0e /* DW_CFA_def_cfa_offset */
91 .byte 0x08 /* RA at offset 8 now */ 89 .byte 0x08 /* RA at offset 8 now */
92 .byte 0x04 /* DW_CFA_advance_loc4 */ 90 .byte 0x40 + (.Lpush_edx-.Lpush_ecx) /* DW_CFA_advance_loc */
93 .long .Lpush_edx-.Lpush_ecx
94 .byte 0x0e /* DW_CFA_def_cfa_offset */ 91 .byte 0x0e /* DW_CFA_def_cfa_offset */
95 .byte 0x0c /* RA at offset 12 now */ 92 .byte 0x0c /* RA at offset 12 now */
96 .byte 0x04 /* DW_CFA_advance_loc4 */ 93 .byte 0x40 + (.Lenter_kernel-.Lpush_edx) /* DW_CFA_advance_loc */
97 .long .Lenter_kernel-.Lpush_edx
98 .byte 0x0e /* DW_CFA_def_cfa_offset */ 94 .byte 0x0e /* DW_CFA_def_cfa_offset */
99 .byte 0x10 /* RA at offset 16 now */ 95 .byte 0x10 /* RA at offset 16 now */
100 .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */ 96 .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */
101 /* Finally the epilogue. */ 97 /* Finally the epilogue. */
102 .byte 0x04 /* DW_CFA_advance_loc4 */ 98 .byte 0x40 + (.Lpop_ebp-.Lenter_kernel) /* DW_CFA_advance_loc */
103 .long .Lpop_ebp-.Lenter_kernel
104 .byte 0x0e /* DW_CFA_def_cfa_offset */ 99 .byte 0x0e /* DW_CFA_def_cfa_offset */
105 .byte 0x0c /* RA at offset 12 now */ 100 .byte 0x0c /* RA at offset 12 now */
106 .byte 0xc5 /* DW_CFA_restore %ebp */ 101 .byte 0xc5 /* DW_CFA_restore %ebp */
107 .byte 0x04 /* DW_CFA_advance_loc4 */ 102 .byte 0x40 + (.Lpop_edx-.Lpop_ebp) /* DW_CFA_advance_loc */
108 .long .Lpop_edx-.Lpop_ebp
109 .byte 0x0e /* DW_CFA_def_cfa_offset */ 103 .byte 0x0e /* DW_CFA_def_cfa_offset */
110 .byte 0x08 /* RA at offset 8 now */ 104 .byte 0x08 /* RA at offset 8 now */
111 .byte 0x04 /* DW_CFA_advance_loc4 */ 105 .byte 0x40 + (.Lpop_ecx-.Lpop_edx) /* DW_CFA_advance_loc */
112 .long .Lpop_ecx-.Lpop_edx
113 .byte 0x0e /* DW_CFA_def_cfa_offset */ 106 .byte 0x0e /* DW_CFA_def_cfa_offset */
114 .byte 0x04 /* RA at offset 4 now */ 107 .byte 0x04 /* RA at offset 4 now */
115 .align 4 108 .align 4
116.LENDFDEDLSI: 109.LENDFDEDLSI:
117 .previous 110 .previous
118 111
119/* 112 /*
120 * Get the common code for the sigreturn entry points. 113 * Emit a symbol with the size of this .eh_frame data,
121 */ 114 * to verify it matches the other versions.
122#include "vsyscall-sigreturn_32.S" 115 */
116VDSO32_vsyscall_eh_frame_size = (.LENDFDEDLSI-.LSTARTFRAMEDLSI)
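The sysenter.S rewrite above switches the unwind table from the 5-byte DW_CFA_advance_loc4 form (opcode 0x04 plus a 4-byte delta) to the 1-byte primary-opcode form, where the high two bits encode DW_CFA_advance_loc and the low six bits carry the delta directly; that is what the ".byte 0x40 + (label - label)" expressions compute, and it only works because every delta in this stub is smaller than 64 bytes. A small stand-alone check of that encoding:

#include <assert.h>
#include <stdio.h>

/*
 * DWARF CFA "advance_loc" primary opcode: high two bits 0b01, low six
 * bits the code-address delta (so it only fits deltas below 64).
 */
static unsigned char dw_cfa_advance_loc(unsigned int delta)
{
	assert(delta < 64);
	return 0x40 + delta;
}

int main(void)
{
	/* e.g. a 2-byte advance, as emitted for ".byte 0x40 + (.Lpush_edx-.Lpush_ecx)" */
	printf("0x%02x\n", dw_cfa_advance_loc(2));
	return 0;
}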
diff --git a/arch/x86/vdso/vdso32/vdso32.lds.S b/arch/x86/vdso/vdso32/vdso32.lds.S
new file mode 100644
index 000000000000..976124bb5f92
--- /dev/null
+++ b/arch/x86/vdso/vdso32/vdso32.lds.S
@@ -0,0 +1,37 @@
1/*
2 * Linker script for 32-bit vDSO.
3 * We #include the file to define the layout details.
4 * Here we only choose the prelinked virtual address.
5 *
6 * This file defines the version script giving the user-exported symbols in
7 * the DSO. We can define local symbols here called VDSO* to make their
8 * values visible using the asm-x86/vdso.h macros from the kernel proper.
9 */
10
11#define VDSO_PRELINK 0
12#include "../vdso-layout.lds.S"
13
14/* The ELF entry point can be used to set the AT_SYSINFO value. */
15ENTRY(__kernel_vsyscall);
16
17/*
18 * This controls what userland symbols we export from the vDSO.
19 */
20VERSION
21{
22 LINUX_2.5 {
23 global:
24 __kernel_vsyscall;
25 __kernel_sigreturn;
26 __kernel_rt_sigreturn;
27 local: *;
28 };
29}
30
31/*
32 * Symbols we define here called VDSO* get their values into vdso32-syms.h.
33 */
34VDSO32_PRELINK = VDSO_PRELINK;
35VDSO32_vsyscall = __kernel_vsyscall;
36VDSO32_sigreturn = __kernel_sigreturn;
37VDSO32_rt_sigreturn = __kernel_rt_sigreturn;
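The VDSO32_* assignments at the end of this linker script are what the Makefile's VDSOSYM rule turns into vdso32-syms.lds, and vdso32-setup.c consumes them through VDSO32_SYMBOL() to find SYSENTER_RETURN inside whatever page the image was copied to. The sketch below shows one plausible shape for such a lookup macro, assuming the exported constants are relative to VDSO32_PRELINK (0 here); the in-tree definition in asm-x86/vdso.h may differ in detail, and the offset used is made up:

#include <stdio.h>

/*
 * Assume the build exported these as constants relative to VDSO32_PRELINK
 * (which this linker script sets to 0).
 */
#define VDSO32_PRELINK		0x0UL
#define VDSO32_SYSENTER_RETURN	0x420UL	/* hypothetical offset */

/* Turn an image-relative constant into an address inside the live mapping. */
#define VDSO32_SYMBOL(base, name) \
	((void *)(VDSO32_##name - VDSO32_PRELINK + (unsigned long)(base)))

int main(void)
{
	void *addr = (void *)0xb7fff000UL;	/* wherever the vDSO got mapped */

	printf("%p\n", VDSO32_SYMBOL(addr, SYSENTER_RETURN));
	return 0;
}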
diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c
index 3b1ae1abfba9..c8097f17f8a9 100644
--- a/arch/x86/vdso/vgetcpu.c
+++ b/arch/x86/vdso/vgetcpu.c
@@ -15,11 +15,11 @@
15 15
16long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) 16long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
17{ 17{
18 unsigned int dummy, p; 18 unsigned int p;
19 19
20 if (*vdso_vgetcpu_mode == VGETCPU_RDTSCP) { 20 if (*vdso_vgetcpu_mode == VGETCPU_RDTSCP) {
21 /* Load per CPU data from RDTSCP */ 21 /* Load per CPU data from RDTSCP */
22 rdtscp(dummy, dummy, p); 22 native_read_tscp(&p);
23 } else { 23 } else {
24 /* Load per CPU data from GDT */ 24 /* Load per CPU data from GDT */
25 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); 25 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index ff9333e5fb08..3fdd51497a83 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -11,23 +11,20 @@
11#include <asm/vsyscall.h> 11#include <asm/vsyscall.h>
12#include <asm/vgtod.h> 12#include <asm/vgtod.h>
13#include <asm/proto.h> 13#include <asm/proto.h>
14#include "voffset.h" 14#include <asm/vdso.h>
15 15
16int vdso_enabled = 1; 16#include "vextern.h" /* Just for VMAGIC. */
17
18#define VEXTERN(x) extern typeof(__ ## x) *vdso_ ## x;
19#include "vextern.h"
20#undef VEXTERN 17#undef VEXTERN
21 18
22extern char vdso_kernel_start[], vdso_start[], vdso_end[]; 19int vdso_enabled = 1;
20
21extern char vdso_start[], vdso_end[];
23extern unsigned short vdso_sync_cpuid; 22extern unsigned short vdso_sync_cpuid;
24 23
25struct page **vdso_pages; 24struct page **vdso_pages;
26 25
27static inline void *var_ref(void *vbase, char *var, char *name) 26static inline void *var_ref(void *p, char *name)
28{ 27{
29 unsigned offset = var - &vdso_kernel_start[0] + VDSO_TEXT_OFFSET;
30 void *p = vbase + offset;
31 if (*(void **)p != (void *)VMAGIC) { 28 if (*(void **)p != (void *)VMAGIC) {
32 printk("VDSO: variable %s broken\n", name); 29 printk("VDSO: variable %s broken\n", name);
33 vdso_enabled = 0; 30 vdso_enabled = 0;
@@ -62,9 +59,8 @@ static int __init init_vdso_vars(void)
62 vdso_enabled = 0; 59 vdso_enabled = 0;
63 } 60 }
64 61
65#define V(x) *(typeof(x) *) var_ref(vbase, (char *)RELOC_HIDE(&x, 0), #x)
66#define VEXTERN(x) \ 62#define VEXTERN(x) \
67 V(vdso_ ## x) = &__ ## x; 63 *(typeof(__ ## x) **) var_ref(VDSO64_SYMBOL(vbase, x), #x) = &__ ## x;
68#include "vextern.h" 64#include "vextern.h"
69#undef VEXTERN 65#undef VEXTERN
70 return 0; 66 return 0;
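With vdso-start.o and the "ld -R vdso-syms.o" trick gone, init_vdso_vars() in vma.c now locates each vdso_<name> slot purely through VDSO64_SYMBOL(), verifies it still holds the VMAGIC placeholder it was built with, and overwrites it with the kernel's address of __<name>. The stand-alone sketch below models that placeholder-patching pattern; the magic value and the variable names are illustrative, not the real ones from vextern.h and vvar.c:

#include <stdio.h>

#define VMAGIC	0xfeedbabeUL	/* illustrative placeholder, not the real constant */

/* Stand-ins for a pointer slot inside the vDSO image and the kernel
 * variable it should end up pointing at (e.g. vdso_jiffies -> __jiffies). */
static unsigned long *image_slot = (unsigned long *)VMAGIC;
static unsigned long kernel_var;

static void *var_ref(void *p, const char *name)
{
	if (*(void **)p != (void *)VMAGIC)
		printf("VDSO: variable %s broken\n", name);
	return p;
}

int main(void)
{
	/* Roughly what one VEXTERN() expansion in init_vdso_vars() boils down to. */
	*(unsigned long **)var_ref(&image_slot, "jiffies") = &kernel_var;

	printf("patched ok: %d\n", image_slot == &kernel_var);
	return 0;
}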
diff --git a/arch/x86/vdso/voffset.h b/arch/x86/vdso/voffset.h
deleted file mode 100644
index 4af67c79085f..000000000000
--- a/arch/x86/vdso/voffset.h
+++ /dev/null
@@ -1 +0,0 @@
1#define VDSO_TEXT_OFFSET 0x600
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index fbfa55ce0d55..4d5f2649bee4 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -5,6 +5,7 @@
5config XEN 5config XEN
6 bool "Xen guest support" 6 bool "Xen guest support"
7 select PARAVIRT 7 select PARAVIRT
8 depends on X86_32
8 depends on X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES && !(X86_VISWS || X86_VOYAGER) 9 depends on X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES && !(X86_VISWS || X86_VOYAGER)
9 help 10 help
10 This is the Linux Xen port. Enabling this will allow the 11 This is the Linux Xen port. Enabling this will allow the
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 79ad15252150..de647bc6e74d 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -141,8 +141,8 @@ static void __init xen_banner(void)
141 printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); 141 printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
142} 142}
143 143
144static void xen_cpuid(unsigned int *eax, unsigned int *ebx, 144static void xen_cpuid(unsigned int *ax, unsigned int *bx,
145 unsigned int *ecx, unsigned int *edx) 145 unsigned int *cx, unsigned int *dx)
146{ 146{
147 unsigned maskedx = ~0; 147 unsigned maskedx = ~0;
148 148
@@ -150,18 +150,18 @@ static void xen_cpuid(unsigned int *eax, unsigned int *ebx,
150 * Mask out inconvenient features, to try and disable as many 150 * Mask out inconvenient features, to try and disable as many
151 * unsupported kernel subsystems as possible. 151 * unsupported kernel subsystems as possible.
152 */ 152 */
153 if (*eax == 1) 153 if (*ax == 1)
154 maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */ 154 maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */
155 (1 << X86_FEATURE_ACPI) | /* disable ACPI */ 155 (1 << X86_FEATURE_ACPI) | /* disable ACPI */
156 (1 << X86_FEATURE_ACC)); /* thermal monitoring */ 156 (1 << X86_FEATURE_ACC)); /* thermal monitoring */
157 157
158 asm(XEN_EMULATE_PREFIX "cpuid" 158 asm(XEN_EMULATE_PREFIX "cpuid"
159 : "=a" (*eax), 159 : "=a" (*ax),
160 "=b" (*ebx), 160 "=b" (*bx),
161 "=c" (*ecx), 161 "=c" (*cx),
162 "=d" (*edx) 162 "=d" (*dx)
163 : "0" (*eax), "2" (*ecx)); 163 : "0" (*ax), "2" (*cx));
164 *edx &= maskedx; 164 *dx &= maskedx;
165} 165}
166 166
167static void xen_set_debugreg(int reg, unsigned long val) 167static void xen_set_debugreg(int reg, unsigned long val)
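A note on the mask built in xen_cpuid() above: (1 << X86_FEATURE_APIC) yields an EDX bit only because all three features being hidden live in capability word 0, which maps to CPUID leaf 1 EDX, so the constant's value is the bit number itself. A small user-space check of that assumption (the encodings are quoted from memory from the 2.6.24-era cpufeature.h, so treat them as illustrative):

#include <stdio.h>

/* Word 0 of the cpufeature table = CPUID.1:EDX; encoding is 0*32 + bit. */
#define X86_FEATURE_APIC	(0*32 +  9)
#define X86_FEATURE_ACPI	(0*32 + 22)
#define X86_FEATURE_ACC		(0*32 + 29)	/* thermal monitoring */

int main(void)
{
	unsigned maskedx = ~((1u << X86_FEATURE_APIC) |
			     (1u << X86_FEATURE_ACPI) |
			     (1u << X86_FEATURE_ACC));

	/* Bits 9, 22 and 29 of EDX are cleared; every other bit survives. */
	printf("maskedx = %#010x\n", maskedx);
	return 0;
}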
@@ -275,19 +275,12 @@ static unsigned long xen_store_tr(void)
275 275
276static void xen_set_ldt(const void *addr, unsigned entries) 276static void xen_set_ldt(const void *addr, unsigned entries)
277{ 277{
278 unsigned long linear_addr = (unsigned long)addr;
279 struct mmuext_op *op; 278 struct mmuext_op *op;
280 struct multicall_space mcs = xen_mc_entry(sizeof(*op)); 279 struct multicall_space mcs = xen_mc_entry(sizeof(*op));
281 280
282 op = mcs.args; 281 op = mcs.args;
283 op->cmd = MMUEXT_SET_LDT; 282 op->cmd = MMUEXT_SET_LDT;
284 if (linear_addr) { 283 op->arg1.linear_addr = (unsigned long)addr;
285 /* ldt my be vmalloced, use arbitrary_virt_to_machine */
286 xmaddr_t maddr;
287 maddr = arbitrary_virt_to_machine((unsigned long)addr);
288 linear_addr = (unsigned long)maddr.maddr;
289 }
290 op->arg1.linear_addr = linear_addr;
291 op->arg2.nr_ents = entries; 284 op->arg2.nr_ents = entries;
292 285
293 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 286 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
@@ -295,7 +288,7 @@ static void xen_set_ldt(const void *addr, unsigned entries)
295 xen_mc_issue(PARAVIRT_LAZY_CPU); 288 xen_mc_issue(PARAVIRT_LAZY_CPU);
296} 289}
297 290
298static void xen_load_gdt(const struct Xgt_desc_struct *dtr) 291static void xen_load_gdt(const struct desc_ptr *dtr)
299{ 292{
300 unsigned long *frames; 293 unsigned long *frames;
301 unsigned long va = dtr->address; 294 unsigned long va = dtr->address;
@@ -357,11 +350,11 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
357} 350}
358 351
359static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, 352static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
360 u32 low, u32 high) 353 const void *ptr)
361{ 354{
362 unsigned long lp = (unsigned long)&dt[entrynum]; 355 unsigned long lp = (unsigned long)&dt[entrynum];
363 xmaddr_t mach_lp = virt_to_machine(lp); 356 xmaddr_t mach_lp = virt_to_machine(lp);
364 u64 entry = (u64)high << 32 | low; 357 u64 entry = *(u64 *)ptr;
365 358
366 preempt_disable(); 359 preempt_disable();
367 360
@@ -395,12 +388,11 @@ static int cvt_gate_to_trap(int vector, u32 low, u32 high,
395} 388}
396 389
397/* Locations of each CPU's IDT */ 390/* Locations of each CPU's IDT */
398static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc); 391static DEFINE_PER_CPU(struct desc_ptr, idt_desc);
399 392
400/* Set an IDT entry. If the entry is part of the current IDT, then 393/* Set an IDT entry. If the entry is part of the current IDT, then
401 also update Xen. */ 394 also update Xen. */
402static void xen_write_idt_entry(struct desc_struct *dt, int entrynum, 395static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
403 u32 low, u32 high)
404{ 396{
405 unsigned long p = (unsigned long)&dt[entrynum]; 397 unsigned long p = (unsigned long)&dt[entrynum];
406 unsigned long start, end; 398 unsigned long start, end;
@@ -412,14 +404,15 @@ static void xen_write_idt_entry(struct desc_struct *dt, int entrynum,
412 404
413 xen_mc_flush(); 405 xen_mc_flush();
414 406
415 write_dt_entry(dt, entrynum, low, high); 407 native_write_idt_entry(dt, entrynum, g);
416 408
417 if (p >= start && (p + 8) <= end) { 409 if (p >= start && (p + 8) <= end) {
418 struct trap_info info[2]; 410 struct trap_info info[2];
411 u32 *desc = (u32 *)g;
419 412
420 info[1].address = 0; 413 info[1].address = 0;
421 414
422 if (cvt_gate_to_trap(entrynum, low, high, &info[0])) 415 if (cvt_gate_to_trap(entrynum, desc[0], desc[1], &info[0]))
423 if (HYPERVISOR_set_trap_table(info)) 416 if (HYPERVISOR_set_trap_table(info))
424 BUG(); 417 BUG();
425 } 418 }
@@ -427,7 +420,7 @@ static void xen_write_idt_entry(struct desc_struct *dt, int entrynum,
427 preempt_enable(); 420 preempt_enable();
428} 421}
429 422
430static void xen_convert_trap_info(const struct Xgt_desc_struct *desc, 423static void xen_convert_trap_info(const struct desc_ptr *desc,
431 struct trap_info *traps) 424 struct trap_info *traps)
432{ 425{
433 unsigned in, out, count; 426 unsigned in, out, count;
@@ -446,7 +439,7 @@ static void xen_convert_trap_info(const struct Xgt_desc_struct *desc,
446 439
447void xen_copy_trap_info(struct trap_info *traps) 440void xen_copy_trap_info(struct trap_info *traps)
448{ 441{
449 const struct Xgt_desc_struct *desc = &__get_cpu_var(idt_desc); 442 const struct desc_ptr *desc = &__get_cpu_var(idt_desc);
450 443
451 xen_convert_trap_info(desc, traps); 444 xen_convert_trap_info(desc, traps);
452} 445}
@@ -454,7 +447,7 @@ void xen_copy_trap_info(struct trap_info *traps)
454/* Load a new IDT into Xen. In principle this can be per-CPU, so we 447/* Load a new IDT into Xen. In principle this can be per-CPU, so we
455 hold a spinlock to protect the static traps[] array (static because 448 hold a spinlock to protect the static traps[] array (static because
456 it avoids allocation, and saves stack space). */ 449 it avoids allocation, and saves stack space). */
457static void xen_load_idt(const struct Xgt_desc_struct *desc) 450static void xen_load_idt(const struct desc_ptr *desc)
458{ 451{
459 static DEFINE_SPINLOCK(lock); 452 static DEFINE_SPINLOCK(lock);
460 static struct trap_info traps[257]; 453 static struct trap_info traps[257];
@@ -475,22 +468,21 @@ static void xen_load_idt(const struct Xgt_desc_struct *desc)
475/* Write a GDT descriptor entry. Ignore LDT descriptors, since 468/* Write a GDT descriptor entry. Ignore LDT descriptors, since
476 they're handled differently. */ 469 they're handled differently. */
477static void xen_write_gdt_entry(struct desc_struct *dt, int entry, 470static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
478 u32 low, u32 high) 471 const void *desc, int type)
479{ 472{
480 preempt_disable(); 473 preempt_disable();
481 474
482 switch ((high >> 8) & 0xff) { 475 switch (type) {
483 case DESCTYPE_LDT: 476 case DESC_LDT:
484 case DESCTYPE_TSS: 477 case DESC_TSS:
485 /* ignore */ 478 /* ignore */
486 break; 479 break;
487 480
488 default: { 481 default: {
489 xmaddr_t maddr = virt_to_machine(&dt[entry]); 482 xmaddr_t maddr = virt_to_machine(&dt[entry]);
490 u64 desc = (u64)high << 32 | low;
491 483
492 xen_mc_flush(); 484 xen_mc_flush();
493 if (HYPERVISOR_update_descriptor(maddr.maddr, desc)) 485 if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
494 BUG(); 486 BUG();
495 } 487 }
496 488
@@ -499,11 +491,11 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
499 preempt_enable(); 491 preempt_enable();
500} 492}
501 493
502static void xen_load_esp0(struct tss_struct *tss, 494static void xen_load_sp0(struct tss_struct *tss,
503 struct thread_struct *thread) 495 struct thread_struct *thread)
504{ 496{
505 struct multicall_space mcs = xen_mc_entry(0); 497 struct multicall_space mcs = xen_mc_entry(0);
506 MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0); 498 MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
507 xen_mc_issue(PARAVIRT_LAZY_CPU); 499 xen_mc_issue(PARAVIRT_LAZY_CPU);
508} 500}
509 501
@@ -521,12 +513,12 @@ static void xen_io_delay(void)
521} 513}
522 514
523#ifdef CONFIG_X86_LOCAL_APIC 515#ifdef CONFIG_X86_LOCAL_APIC
524static unsigned long xen_apic_read(unsigned long reg) 516static u32 xen_apic_read(unsigned long reg)
525{ 517{
526 return 0; 518 return 0;
527} 519}
528 520
529static void xen_apic_write(unsigned long reg, unsigned long val) 521static void xen_apic_write(unsigned long reg, u32 val)
530{ 522{
531 /* Warn to see if there's any stray references */ 523 /* Warn to see if there's any stray references */
532 WARN_ON(1); 524 WARN_ON(1);
@@ -666,6 +658,13 @@ static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
666 make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); 658 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
667} 659}
668 660
661/* Early release_pt assumes that all pts are pinned, since there's
662 only init_mm and anything attached to that is pinned. */
663static void xen_release_pt_init(u32 pfn)
664{
665 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
666}
667
669static void pin_pagetable_pfn(unsigned level, unsigned long pfn) 668static void pin_pagetable_pfn(unsigned level, unsigned long pfn)
670{ 669{
671 struct mmuext_op op; 670 struct mmuext_op op;
@@ -677,7 +676,7 @@ static void pin_pagetable_pfn(unsigned level, unsigned long pfn)
677 676
678/* This needs to make sure the new pte page is pinned iff its being 677/* This needs to make sure the new pte page is pinned iff its being
679 attached to a pinned pagetable. */ 678 attached to a pinned pagetable. */
680static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) 679static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level)
681{ 680{
682 struct page *page = pfn_to_page(pfn); 681 struct page *page = pfn_to_page(pfn);
683 682
@@ -686,7 +685,7 @@ static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
686 685
687 if (!PageHighMem(page)) { 686 if (!PageHighMem(page)) {
688 make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); 687 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
689 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); 688 pin_pagetable_pfn(level, pfn);
690 } else 689 } else
691 /* make sure there are no stray mappings of 690 /* make sure there are no stray mappings of
692 this page */ 691 this page */
@@ -694,6 +693,16 @@ static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
694 } 693 }
695} 694}
696 695
696static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
697{
698 xen_alloc_ptpage(mm, pfn, MMUEXT_PIN_L1_TABLE);
699}
700
701static void xen_alloc_pd(struct mm_struct *mm, u32 pfn)
702{
703 xen_alloc_ptpage(mm, pfn, MMUEXT_PIN_L2_TABLE);
704}
705
697/* This should never happen until we're OK to use struct page */ 706/* This should never happen until we're OK to use struct page */
698static void xen_release_pt(u32 pfn) 707static void xen_release_pt(u32 pfn)
699{ 708{
@@ -796,6 +805,9 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
796 /* This will work as long as patching hasn't happened yet 805 /* This will work as long as patching hasn't happened yet
797 (which it hasn't) */ 806 (which it hasn't) */
798 pv_mmu_ops.alloc_pt = xen_alloc_pt; 807 pv_mmu_ops.alloc_pt = xen_alloc_pt;
808 pv_mmu_ops.alloc_pd = xen_alloc_pd;
809 pv_mmu_ops.release_pt = xen_release_pt;
810 pv_mmu_ops.release_pd = xen_release_pt;
799 pv_mmu_ops.set_pte = xen_set_pte; 811 pv_mmu_ops.set_pte = xen_set_pte;
800 812
801 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 813 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
@@ -953,7 +965,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
953 .read_pmc = native_read_pmc, 965 .read_pmc = native_read_pmc,
954 966
955 .iret = (void *)&hypercall_page[__HYPERVISOR_iret], 967 .iret = (void *)&hypercall_page[__HYPERVISOR_iret],
956 .irq_enable_sysexit = NULL, /* never called */ 968 .irq_enable_syscall_ret = NULL, /* never called */
957 969
958 .load_tr_desc = paravirt_nop, 970 .load_tr_desc = paravirt_nop,
959 .set_ldt = xen_set_ldt, 971 .set_ldt = xen_set_ldt,
@@ -968,7 +980,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
968 .write_ldt_entry = xen_write_ldt_entry, 980 .write_ldt_entry = xen_write_ldt_entry,
969 .write_gdt_entry = xen_write_gdt_entry, 981 .write_gdt_entry = xen_write_gdt_entry,
970 .write_idt_entry = xen_write_idt_entry, 982 .write_idt_entry = xen_write_idt_entry,
971 .load_esp0 = xen_load_esp0, 983 .load_sp0 = xen_load_sp0,
972 984
973 .set_iopl_mask = xen_set_iopl_mask, 985 .set_iopl_mask = xen_set_iopl_mask,
974 .io_delay = xen_io_delay, 986 .io_delay = xen_io_delay,
@@ -1019,10 +1031,10 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1019 .pte_update_defer = paravirt_nop, 1031 .pte_update_defer = paravirt_nop,
1020 1032
1021 .alloc_pt = xen_alloc_pt_init, 1033 .alloc_pt = xen_alloc_pt_init,
1022 .release_pt = xen_release_pt, 1034 .release_pt = xen_release_pt_init,
1023 .alloc_pd = paravirt_nop, 1035 .alloc_pd = xen_alloc_pt_init,
1024 .alloc_pd_clone = paravirt_nop, 1036 .alloc_pd_clone = paravirt_nop,
1025 .release_pd = paravirt_nop, 1037 .release_pd = xen_release_pt_init,
1026 1038
1027#ifdef CONFIG_HIGHPTE 1039#ifdef CONFIG_HIGHPTE
1028 .kmap_atomic_pte = xen_kmap_atomic_pte, 1040 .kmap_atomic_pte = xen_kmap_atomic_pte,
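In xen_write_idt_entry() above, the gate_desc handed in by the new descriptor interface is split back into the two 32-bit words that cvt_gate_to_trap() still expects; on 32-bit x86 an IDT gate is 8 bytes, so desc[0] and desc[1] are simply its low and high halves. A tiny stand-alone illustration of that view (the descriptor value is arbitrary, little-endian layout assumed):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* One 8-byte descriptor seen both as a u64 and as two u32 words. */
	union {
		uint64_t entry;
		uint32_t word[2];
	} d = { .entry = 0x00cf9a000000ffffULL };	/* arbitrary value */

	printf("low  = %#010x\n", (unsigned)d.word[0]);	/* bits  0..31 -> desc[0] */
	printf("high = %#010x\n", (unsigned)d.word[1]);	/* bits 32..63 -> desc[1] */
	return 0;
}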
diff --git a/arch/x86/xen/events.c b/arch/x86/xen/events.c
index 6d1da5809e6f..dcf613e17581 100644
--- a/arch/x86/xen/events.c
+++ b/arch/x86/xen/events.c
@@ -465,7 +465,7 @@ void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
465 * a bitset of words which contain pending event bits. The second 465 * a bitset of words which contain pending event bits. The second
466 * level is a bitset of pending events themselves. 466 * level is a bitset of pending events themselves.
467 */ 467 */
468fastcall void xen_evtchn_do_upcall(struct pt_regs *regs) 468void xen_evtchn_do_upcall(struct pt_regs *regs)
469{ 469{
470 int cpu = get_cpu(); 470 int cpu = get_cpu();
471 struct shared_info *s = HYPERVISOR_shared_info; 471 struct shared_info *s = HYPERVISOR_shared_info;
@@ -487,7 +487,7 @@ fastcall void xen_evtchn_do_upcall(struct pt_regs *regs)
487 int irq = evtchn_to_irq[port]; 487 int irq = evtchn_to_irq[port];
488 488
489 if (irq != -1) { 489 if (irq != -1) {
490 regs->orig_eax = ~irq; 490 regs->orig_ax = ~irq;
491 do_IRQ(regs); 491 do_IRQ(regs);
492 } 492 }
493 } 493 }
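The events.c hunk above keeps the existing convention that an interrupt entry stores the one's complement of the IRQ number in what is now called orig_ax before handing the registers to do_IRQ(). A toy model of the encode/decode pair; the struct and the decode step only paraphrase what the 32-bit IRQ path does, from memory, so take the details as an assumption:

#include <stdio.h>

struct fake_pt_regs {		/* stand-in for struct pt_regs */
	long orig_ax;
};

int main(void)
{
	struct fake_pt_regs regs;
	int irq = 9;

	regs.orig_ax = ~irq;	/* what xen_evtchn_do_upcall() stores */

	/* what the IRQ entry path recovers on the other side */
	printf("decoded irq = %ld\n", ~regs.orig_ax);
	return 0;
}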
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 0ac6c5dc49ba..45aa771e73a9 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -58,7 +58,8 @@
58 58
59xmaddr_t arbitrary_virt_to_machine(unsigned long address) 59xmaddr_t arbitrary_virt_to_machine(unsigned long address)
60{ 60{
61 pte_t *pte = lookup_address(address); 61 int level;
62 pte_t *pte = lookup_address(address, &level);
62 unsigned offset = address & PAGE_MASK; 63 unsigned offset = address & PAGE_MASK;
63 64
64 BUG_ON(pte == NULL); 65 BUG_ON(pte == NULL);
@@ -70,8 +71,9 @@ void make_lowmem_page_readonly(void *vaddr)
70{ 71{
71 pte_t *pte, ptev; 72 pte_t *pte, ptev;
72 unsigned long address = (unsigned long)vaddr; 73 unsigned long address = (unsigned long)vaddr;
74 int level;
73 75
74 pte = lookup_address(address); 76 pte = lookup_address(address, &level);
75 BUG_ON(pte == NULL); 77 BUG_ON(pte == NULL);
76 78
77 ptev = pte_wrprotect(*pte); 79 ptev = pte_wrprotect(*pte);
@@ -84,8 +86,9 @@ void make_lowmem_page_readwrite(void *vaddr)
84{ 86{
85 pte_t *pte, ptev; 87 pte_t *pte, ptev;
86 unsigned long address = (unsigned long)vaddr; 88 unsigned long address = (unsigned long)vaddr;
89 int level;
87 90
88 pte = lookup_address(address); 91 pte = lookup_address(address, &level);
89 BUG_ON(pte == NULL); 92 BUG_ON(pte == NULL);
90 93
91 ptev = pte_mkwrite(*pte); 94 ptev = pte_mkwrite(*pte);
@@ -241,12 +244,12 @@ unsigned long long xen_pgd_val(pgd_t pgd)
241 244
242pte_t xen_make_pte(unsigned long long pte) 245pte_t xen_make_pte(unsigned long long pte)
243{ 246{
244 if (pte & 1) 247 if (pte & _PAGE_PRESENT) {
245 pte = phys_to_machine(XPADDR(pte)).maddr; 248 pte = phys_to_machine(XPADDR(pte)).maddr;
249 pte &= ~(_PAGE_PCD | _PAGE_PWT);
250 }
246 251
247 pte &= ~_PAGE_PCD; 252 return (pte_t){ .pte = pte };
248
249 return (pte_t){ pte, pte >> 32 };
250} 253}
251 254
252pmd_t xen_make_pmd(unsigned long long pmd) 255pmd_t xen_make_pmd(unsigned long long pmd)
@@ -290,10 +293,10 @@ unsigned long xen_pgd_val(pgd_t pgd)
290 293
291pte_t xen_make_pte(unsigned long pte) 294pte_t xen_make_pte(unsigned long pte)
292{ 295{
293 if (pte & _PAGE_PRESENT) 296 if (pte & _PAGE_PRESENT) {
294 pte = phys_to_machine(XPADDR(pte)).maddr; 297 pte = phys_to_machine(XPADDR(pte)).maddr;
295 298 pte &= ~(_PAGE_PCD | _PAGE_PWT);
296 pte &= ~_PAGE_PCD; 299 }
297 300
298 return (pte_t){ pte }; 301 return (pte_t){ pte };
299} 302}
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index f84e77226646..3bad4773a2f3 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -10,6 +10,7 @@
10#include <linux/pm.h> 10#include <linux/pm.h>
11 11
12#include <asm/elf.h> 12#include <asm/elf.h>
13#include <asm/vdso.h>
13#include <asm/e820.h> 14#include <asm/e820.h>
14#include <asm/setup.h> 15#include <asm/setup.h>
15#include <asm/xen/hypervisor.h> 16#include <asm/xen/hypervisor.h>
@@ -59,12 +60,10 @@ static void xen_idle(void)
59/* 60/*
60 * Set the bit indicating "nosegneg" library variants should be used. 61 * Set the bit indicating "nosegneg" library variants should be used.
61 */ 62 */
62static void fiddle_vdso(void) 63static void __init fiddle_vdso(void)
63{ 64{
64 extern u32 VDSO_NOTE_MASK; /* See ../kernel/vsyscall-note.S. */ 65 extern const char vdso32_default_start;
65 extern char vsyscall_int80_start; 66 u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK);
66 u32 *mask = (u32 *) ((unsigned long) &VDSO_NOTE_MASK - VDSO_PRELINK +
67 &vsyscall_int80_start);
68 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; 67 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
69} 68}
70 69
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index c1b131bcdcbe..aafc54437403 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -146,7 +146,7 @@ void __init xen_smp_prepare_boot_cpu(void)
146 old memory can be recycled */ 146 old memory can be recycled */
147 make_lowmem_page_readwrite(&per_cpu__gdt_page); 147 make_lowmem_page_readwrite(&per_cpu__gdt_page);
148 148
149 for (cpu = 0; cpu < NR_CPUS; cpu++) { 149 for_each_possible_cpu(cpu) {
150 cpus_clear(per_cpu(cpu_sibling_map, cpu)); 150 cpus_clear(per_cpu(cpu_sibling_map, cpu));
151 /* 151 /*
152 * cpu_core_map lives in a per cpu area that is cleared 152 * cpu_core_map lives in a per cpu area that is cleared
@@ -163,7 +163,7 @@ void __init xen_smp_prepare_cpus(unsigned int max_cpus)
163{ 163{
164 unsigned cpu; 164 unsigned cpu;
165 165
166 for (cpu = 0; cpu < NR_CPUS; cpu++) { 166 for_each_possible_cpu(cpu) {
167 cpus_clear(per_cpu(cpu_sibling_map, cpu)); 167 cpus_clear(per_cpu(cpu_sibling_map, cpu));
168 /* 168 /*
169 * cpu_core_ map will be zeroed when the per 169 * cpu_core_ map will be zeroed when the per
@@ -239,10 +239,10 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
239 ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt); 239 ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt);
240 240
241 ctxt->user_regs.cs = __KERNEL_CS; 241 ctxt->user_regs.cs = __KERNEL_CS;
242 ctxt->user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs); 242 ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
243 243
244 ctxt->kernel_ss = __KERNEL_DS; 244 ctxt->kernel_ss = __KERNEL_DS;
245 ctxt->kernel_sp = idle->thread.esp0; 245 ctxt->kernel_sp = idle->thread.sp0;
246 246
247 ctxt->event_callback_cs = __KERNEL_CS; 247 ctxt->event_callback_cs = __KERNEL_CS;
248 ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback; 248 ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index d083ff5ef088..b3721fd6877b 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -592,7 +592,7 @@ __init void xen_time_init(void)
592 set_normalized_timespec(&wall_to_monotonic, 592 set_normalized_timespec(&wall_to_monotonic,
593 -xtime.tv_sec, -xtime.tv_nsec); 593 -xtime.tv_sec, -xtime.tv_nsec);
594 594
595 tsc_disable = 0; 595 setup_force_cpu_cap(X86_FEATURE_TSC);
596 596
597 xen_setup_timer(cpu); 597 xen_setup_timer(cpu);
598 xen_setup_cpu_clockevents(); 598 xen_setup_cpu_clockevents();
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index f8d6937db2ec..288d587ce73c 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -4,16 +4,18 @@
4#ifdef CONFIG_XEN 4#ifdef CONFIG_XEN
5 5
6#include <linux/elfnote.h> 6#include <linux/elfnote.h>
7#include <linux/init.h>
7#include <asm/boot.h> 8#include <asm/boot.h>
8#include <xen/interface/elfnote.h> 9#include <xen/interface/elfnote.h>
9 10
10.pushsection .init.text 11 __INIT
11ENTRY(startup_xen) 12ENTRY(startup_xen)
12 movl %esi,xen_start_info 13 movl %esi,xen_start_info
13 cld 14 cld
14 movl $(init_thread_union+THREAD_SIZE),%esp 15 movl $(init_thread_union+THREAD_SIZE),%esp
15 jmp xen_start_kernel 16 jmp xen_start_kernel
16.popsection 17
18 __FINIT
17 19
18.pushsection .bss.page_aligned 20.pushsection .bss.page_aligned
19 .align PAGE_SIZE_asm 21 .align PAGE_SIZE_asm
diff --git a/block/bsg.c b/block/bsg.c
index 69b0a9d33306..8917c5174dc2 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -279,6 +279,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr)
279 goto out; 279 goto out;
280 } 280 }
281 rq->next_rq = next_rq; 281 rq->next_rq = next_rq;
282 next_rq->cmd_type = rq->cmd_type;
282 283
283 dxferp = (void*)(unsigned long)hdr->din_xferp; 284 dxferp = (void*)(unsigned long)hdr->din_xferp;
284 ret = blk_rq_map_user(q, next_rq, dxferp, hdr->din_xfer_len); 285 ret = blk_rq_map_user(q, next_rq, dxferp, hdr->din_xfer_len);
diff --git a/drivers/Kconfig b/drivers/Kconfig
index f4076d9e9902..08d4ae201597 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -90,8 +90,6 @@ source "drivers/dca/Kconfig"
90 90
91source "drivers/auxdisplay/Kconfig" 91source "drivers/auxdisplay/Kconfig"
92 92
93source "drivers/kvm/Kconfig"
94
95source "drivers/uio/Kconfig" 93source "drivers/uio/Kconfig"
96 94
97source "drivers/virtio/Kconfig" 95source "drivers/virtio/Kconfig"
diff --git a/drivers/Makefile b/drivers/Makefile
index 57fb1450560a..0ee9a8a4095e 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -38,7 +38,7 @@ obj-$(CONFIG_SCSI) += scsi/
38obj-$(CONFIG_ATA) += ata/ 38obj-$(CONFIG_ATA) += ata/
39obj-$(CONFIG_FUSION) += message/ 39obj-$(CONFIG_FUSION) += message/
40obj-$(CONFIG_FIREWIRE) += firewire/ 40obj-$(CONFIG_FIREWIRE) += firewire/
41obj-$(CONFIG_IEEE1394) += ieee1394/ 41obj-y += ieee1394/
42obj-$(CONFIG_UIO) += uio/ 42obj-$(CONFIG_UIO) += uio/
43obj-y += cdrom/ 43obj-y += cdrom/
44obj-y += auxdisplay/ 44obj-y += auxdisplay/
@@ -47,7 +47,6 @@ obj-$(CONFIG_SPI) += spi/
47obj-$(CONFIG_PCCARD) += pcmcia/ 47obj-$(CONFIG_PCCARD) += pcmcia/
48obj-$(CONFIG_DIO) += dio/ 48obj-$(CONFIG_DIO) += dio/
49obj-$(CONFIG_SBUS) += sbus/ 49obj-$(CONFIG_SBUS) += sbus/
50obj-$(CONFIG_KVM) += kvm/
51obj-$(CONFIG_ZORRO) += zorro/ 50obj-$(CONFIG_ZORRO) += zorro/
52obj-$(CONFIG_MAC) += macintosh/ 51obj-$(CONFIG_MAC) += macintosh/
53obj-$(CONFIG_ATA_OVER_ETH) += block/aoe/ 52obj-$(CONFIG_ATA_OVER_ETH) += block/aoe/
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 2235f4e02d26..eb1f82f79153 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -357,6 +357,26 @@ int acpi_processor_resume(struct acpi_device * device)
357 return 0; 357 return 0;
358} 358}
359 359
360#if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86_TSC)
361static int tsc_halts_in_c(int state)
362{
363 switch (boot_cpu_data.x86_vendor) {
364 case X86_VENDOR_AMD:
365 /*
366 * AMD Fam10h TSC will tick in all
367 * C/P/S0/S1 states when this bit is set.
368 */
369 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
370 return 0;
371 /*FALL THROUGH*/
372 case X86_VENDOR_INTEL:
373 /* Several cases known where TSC halts in C2 too */
374 default:
375 return state > ACPI_STATE_C1;
376 }
377}
378#endif
379
360#ifndef CONFIG_CPU_IDLE 380#ifndef CONFIG_CPU_IDLE
361static void acpi_processor_idle(void) 381static void acpi_processor_idle(void)
362{ 382{
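The tsc_halts_in_c() helper added above is what lets the mark_tsc_unstable() calls further down become conditional. Because of the fall-through it reads as: AMD with CONSTANT_TSC never halts the TSC; AMD without it, Intel, and unknown vendors are all assumed to halt it in anything deeper than C1. A stand-alone restatement with a couple of spot checks (names are illustrative; constant_tsc stands in for boot_cpu_has(X86_FEATURE_CONSTANT_TSC)):

#include <stdio.h>

enum { ACPI_STATE_C1 = 1, ACPI_STATE_C2, ACPI_STATE_C3 };
enum { VENDOR_INTEL, VENDOR_AMD, VENDOR_OTHER };

static int tsc_halts_in_c(int vendor, int constant_tsc, int state)
{
	switch (vendor) {
	case VENDOR_AMD:
		/* Fam10h TSC keeps ticking in all C/P/S0/S1 states */
		if (constant_tsc)
			return 0;
		/* fall through */
	case VENDOR_INTEL:
		/* several cases known where the TSC halts in C2 too */
	default:
		return state > ACPI_STATE_C1;
	}
}

int main(void)
{
	printf("%d\n", tsc_halts_in_c(VENDOR_AMD, 1, ACPI_STATE_C3));	/* 0 */
	printf("%d\n", tsc_halts_in_c(VENDOR_AMD, 0, ACPI_STATE_C3));	/* 1 */
	printf("%d\n", tsc_halts_in_c(VENDOR_INTEL, 0, ACPI_STATE_C1));	/* 0 */
	return 0;
}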
@@ -516,7 +536,8 @@ static void acpi_processor_idle(void)
516 536
517#if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86_TSC) 537#if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86_TSC)
518 /* TSC halts in C2, so notify users */ 538 /* TSC halts in C2, so notify users */
519 mark_tsc_unstable("possible TSC halt in C2"); 539 if (tsc_halts_in_c(ACPI_STATE_C2))
540 mark_tsc_unstable("possible TSC halt in C2");
520#endif 541#endif
521 /* Compute time (ticks) that we were actually asleep */ 542 /* Compute time (ticks) that we were actually asleep */
522 sleep_ticks = ticks_elapsed(t1, t2); 543 sleep_ticks = ticks_elapsed(t1, t2);
@@ -534,6 +555,7 @@ static void acpi_processor_idle(void)
534 break; 555 break;
535 556
536 case ACPI_STATE_C3: 557 case ACPI_STATE_C3:
558 acpi_unlazy_tlb(smp_processor_id());
537 /* 559 /*
538 * Must be done before busmaster disable as we might 560 * Must be done before busmaster disable as we might
539 * need to access HPET ! 561 * need to access HPET !
@@ -579,7 +601,8 @@ static void acpi_processor_idle(void)
579 601
580#if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86_TSC) 602#if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86_TSC)
581 /* TSC halts in C3, so notify users */ 603 /* TSC halts in C3, so notify users */
582 mark_tsc_unstable("TSC halts in C3"); 604 if (tsc_halts_in_c(ACPI_STATE_C3))
605 mark_tsc_unstable("TSC halts in C3");
583#endif 606#endif
584 /* Compute time (ticks) that we were actually asleep */ 607 /* Compute time (ticks) that we were actually asleep */
585 sleep_ticks = ticks_elapsed(t1, t2); 608 sleep_ticks = ticks_elapsed(t1, t2);
@@ -1423,6 +1446,7 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev,
1423 return 0; 1446 return 0;
1424 } 1447 }
1425 1448
1449 acpi_unlazy_tlb(smp_processor_id());
1426 /* 1450 /*
1427 * Must be done before busmaster disable as we might need to 1451 * Must be done before busmaster disable as we might need to
1428 * access HPET ! 1452 * access HPET !
@@ -1443,7 +1467,8 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev,
1443 1467
1444#if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86_TSC) 1468#if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86_TSC)
1445 /* TSC could halt in idle, so notify users */ 1469 /* TSC could halt in idle, so notify users */
1446 mark_tsc_unstable("TSC halts in idle");; 1470 if (tsc_halts_in_c(cx->type))
1471 mark_tsc_unstable("TSC halts in idle");;
1447#endif 1472#endif
1448 sleep_ticks = ticks_elapsed(t1, t2); 1473 sleep_ticks = ticks_elapsed(t1, t2);
1449 1474
@@ -1554,7 +1579,8 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev,
1554 1579
1555#if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86_TSC) 1580#if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86_TSC)
1556 /* TSC could halt in idle, so notify users */ 1581 /* TSC could halt in idle, so notify users */
1557 mark_tsc_unstable("TSC halts in idle"); 1582 if (tsc_halts_in_c(ACPI_STATE_C3))
1583 mark_tsc_unstable("TSC halts in idle");
1558#endif 1584#endif
1559 sleep_ticks = ticks_elapsed(t1, t2); 1585 sleep_ticks = ticks_elapsed(t1, t2);
1560 /* Tell the scheduler how much we idled: */ 1586 /* Tell the scheduler how much we idled: */
diff --git a/drivers/base/bus.c b/drivers/base/bus.c
index f484495b2ad1..055989e94799 100644
--- a/drivers/base/bus.c
+++ b/drivers/base/bus.c
@@ -163,15 +163,6 @@ static struct kset *bus_kset;
163 163
164#ifdef CONFIG_HOTPLUG 164#ifdef CONFIG_HOTPLUG
165/* Manually detach a device from its associated driver. */ 165/* Manually detach a device from its associated driver. */
166static int driver_helper(struct device *dev, void *data)
167{
168 const char *name = data;
169
170 if (strcmp(name, dev->bus_id) == 0)
171 return 1;
172 return 0;
173}
174
175static ssize_t driver_unbind(struct device_driver *drv, 166static ssize_t driver_unbind(struct device_driver *drv,
176 const char *buf, size_t count) 167 const char *buf, size_t count)
177{ 168{
@@ -179,7 +170,7 @@ static ssize_t driver_unbind(struct device_driver *drv,
179 struct device *dev; 170 struct device *dev;
180 int err = -ENODEV; 171 int err = -ENODEV;
181 172
182 dev = bus_find_device(bus, NULL, (void *)buf, driver_helper); 173 dev = bus_find_device_by_name(bus, NULL, buf);
183 if (dev && dev->driver == drv) { 174 if (dev && dev->driver == drv) {
184 if (dev->parent) /* Needed for USB */ 175 if (dev->parent) /* Needed for USB */
185 down(&dev->parent->sem); 176 down(&dev->parent->sem);
@@ -206,7 +197,7 @@ static ssize_t driver_bind(struct device_driver *drv,
206 struct device *dev; 197 struct device *dev;
207 int err = -ENODEV; 198 int err = -ENODEV;
208 199
209 dev = bus_find_device(bus, NULL, (void *)buf, driver_helper); 200 dev = bus_find_device_by_name(bus, NULL, buf);
210 if (dev && dev->driver == NULL) { 201 if (dev && dev->driver == NULL) {
211 if (dev->parent) /* Needed for USB */ 202 if (dev->parent) /* Needed for USB */
212 down(&dev->parent->sem); 203 down(&dev->parent->sem);
@@ -250,7 +241,7 @@ static ssize_t store_drivers_probe(struct bus_type *bus,
250{ 241{
251 struct device *dev; 242 struct device *dev;
252 243
253 dev = bus_find_device(bus, NULL, (void *)buf, driver_helper); 244 dev = bus_find_device_by_name(bus, NULL, buf);
254 if (!dev) 245 if (!dev)
255 return -ENODEV; 246 return -ENODEV;
256 if (bus_rescan_devices_helper(dev, NULL) != 0) 247 if (bus_rescan_devices_helper(dev, NULL) != 0)
@@ -338,6 +329,32 @@ struct device *bus_find_device(struct bus_type *bus,
338} 329}
339EXPORT_SYMBOL_GPL(bus_find_device); 330EXPORT_SYMBOL_GPL(bus_find_device);
340 331
332static int match_name(struct device *dev, void *data)
333{
334 const char *name = data;
335
336 if (strcmp(name, dev->bus_id) == 0)
337 return 1;
338 return 0;
339}
340
341/**
342 * bus_find_device_by_name - device iterator for locating a particular device of a specific name
343 * @bus: bus type
344 * @start: Device to begin with
345 * @name: name of the device to match
346 *
347 * This is similar to the bus_find_device() function above, but it handles
348 * searching by a name automatically, no need to write another strcmp matching
349 * function.
350 */
351struct device *bus_find_device_by_name(struct bus_type *bus,
352 struct device *start, const char *name)
353{
354 return bus_find_device(bus, start, (void *)name, match_name);
355}
356EXPORT_SYMBOL_GPL(bus_find_device_by_name);
357
341static struct device_driver *next_driver(struct klist_iter *i) 358static struct device_driver *next_driver(struct klist_iter *i)
342{ 359{
343 struct klist_node *n = klist_next(i); 360 struct klist_node *n = klist_next(i);
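With the driver-core change above, the strcmp matcher now lives next to bus_find_device() as match_name(), and the three sysfs store callbacks shrink to a single bus_find_device_by_name() call each. Since the lookup returns a referenced device, callers still have to drop it. A hypothetical caller; the bus type and device name are made up for the example:

#include <linux/device.h>
#include <linux/platform_device.h>

static void example_lookup(void)
{
	struct device *dev;

	dev = bus_find_device_by_name(&platform_bus_type, NULL, "serial8250");
	if (!dev)
		return;

	dev_info(dev, "found by name\n");
	put_device(dev);	/* drop the reference the lookup took */
}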
diff --git a/drivers/base/class.c b/drivers/base/class.c
index 59cf35894cfc..9d915376c313 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -149,7 +149,7 @@ int class_register(struct class *cls)
149 if (error) 149 if (error)
150 return error; 150 return error;
151 151
152#ifdef CONFIG_SYSFS_DEPRECATED 152#if defined(CONFIG_SYSFS_DEPRECATED) && defined(CONFIG_BLOCK)
153 /* let the block class directory show up in the root of sysfs */ 153 /* let the block class directory show up in the root of sysfs */
154 if (cls != &block_class) 154 if (cls != &block_class)
155 cls->subsys.kobj.kset = class_kset; 155 cls->subsys.kobj.kset = class_kset;
@@ -863,7 +863,7 @@ EXPORT_SYMBOL_GPL(class_for_each_device);
863 * The callback should return 0 if the device doesn't match and non-zero 863 * The callback should return 0 if the device doesn't match and non-zero
864 * if it does. If the callback returns non-zero, this function will 864 * if it does. If the callback returns non-zero, this function will
865 * return to the caller and not iterate over any more devices. 865 * return to the caller and not iterate over any more devices.
866 866 *
867 * Note, you will need to drop the reference with put_device() after use. 867 * Note, you will need to drop the reference with put_device() after use.
868 * 868 *
869 * We hold class->sem in this function, so it can not be 869 * We hold class->sem in this function, so it can not be
diff --git a/drivers/base/core.c b/drivers/base/core.c
index edf3bbeb8d6a..b1727876182c 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -27,9 +27,17 @@
27int (*platform_notify)(struct device *dev) = NULL; 27int (*platform_notify)(struct device *dev) = NULL;
28int (*platform_notify_remove)(struct device *dev) = NULL; 28int (*platform_notify_remove)(struct device *dev) = NULL;
29 29
30/* 30#ifdef CONFIG_BLOCK
31 * sysfs bindings for devices. 31static inline int device_is_not_partition(struct device *dev)
32 */ 32{
33 return !(dev->type == &part_type);
34}
35#else
36static inline int device_is_not_partition(struct device *dev)
37{
38 return 1;
39}
40#endif
33 41
34/** 42/**
35 * dev_driver_string - Return a device's driver name, if at all possible 43 * dev_driver_string - Return a device's driver name, if at all possible
@@ -652,14 +660,14 @@ static int device_add_class_symlinks(struct device *dev)
652#ifdef CONFIG_SYSFS_DEPRECATED 660#ifdef CONFIG_SYSFS_DEPRECATED
653 /* stacked class devices need a symlink in the class directory */ 661 /* stacked class devices need a symlink in the class directory */
654 if (dev->kobj.parent != &dev->class->subsys.kobj && 662 if (dev->kobj.parent != &dev->class->subsys.kobj &&
655 dev->type != &part_type) { 663 device_is_not_partition(dev)) {
656 error = sysfs_create_link(&dev->class->subsys.kobj, &dev->kobj, 664 error = sysfs_create_link(&dev->class->subsys.kobj, &dev->kobj,
657 dev->bus_id); 665 dev->bus_id);
658 if (error) 666 if (error)
659 goto out_subsys; 667 goto out_subsys;
660 } 668 }
661 669
662 if (dev->parent && dev->type != &part_type) { 670 if (dev->parent && device_is_not_partition(dev)) {
663 struct device *parent = dev->parent; 671 struct device *parent = dev->parent;
664 char *class_name; 672 char *class_name;
665 673
@@ -688,11 +696,11 @@ static int device_add_class_symlinks(struct device *dev)
688 return 0; 696 return 0;
689 697
690out_device: 698out_device:
691 if (dev->parent && dev->type != &part_type) 699 if (dev->parent && device_is_not_partition(dev))
692 sysfs_remove_link(&dev->kobj, "device"); 700 sysfs_remove_link(&dev->kobj, "device");
693out_busid: 701out_busid:
694 if (dev->kobj.parent != &dev->class->subsys.kobj && 702 if (dev->kobj.parent != &dev->class->subsys.kobj &&
695 dev->type != &part_type) 703 device_is_not_partition(dev))
696 sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id); 704 sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id);
697#else 705#else
698 /* link in the class directory pointing to the device */ 706 /* link in the class directory pointing to the device */
@@ -701,7 +709,7 @@ out_busid:
701 if (error) 709 if (error)
702 goto out_subsys; 710 goto out_subsys;
703 711
704 if (dev->parent && dev->type != &part_type) { 712 if (dev->parent && device_is_not_partition(dev)) {
705 error = sysfs_create_link(&dev->kobj, &dev->parent->kobj, 713 error = sysfs_create_link(&dev->kobj, &dev->parent->kobj,
706 "device"); 714 "device");
707 if (error) 715 if (error)
@@ -725,7 +733,7 @@ static void device_remove_class_symlinks(struct device *dev)
725 return; 733 return;
726 734
727#ifdef CONFIG_SYSFS_DEPRECATED 735#ifdef CONFIG_SYSFS_DEPRECATED
728 if (dev->parent && dev->type != &part_type) { 736 if (dev->parent && device_is_not_partition(dev)) {
729 char *class_name; 737 char *class_name;
730 738
731 class_name = make_class_name(dev->class->name, &dev->kobj); 739 class_name = make_class_name(dev->class->name, &dev->kobj);
@@ -737,10 +745,10 @@ static void device_remove_class_symlinks(struct device *dev)
737 } 745 }
738 746
739 if (dev->kobj.parent != &dev->class->subsys.kobj && 747 if (dev->kobj.parent != &dev->class->subsys.kobj &&
740 dev->type != &part_type) 748 device_is_not_partition(dev))
741 sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id); 749 sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id);
742#else 750#else
743 if (dev->parent && dev->type != &part_type) 751 if (dev->parent && device_is_not_partition(dev))
744 sysfs_remove_link(&dev->kobj, "device"); 752 sysfs_remove_link(&dev->kobj, "device");
745 753
746 sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id); 754 sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id);
diff --git a/drivers/char/agp/ali-agp.c b/drivers/char/agp/ali-agp.c
index aa5ddb716ffb..1ffb381130c3 100644
--- a/drivers/char/agp/ali-agp.c
+++ b/drivers/char/agp/ali-agp.c
@@ -145,7 +145,6 @@ static void *m1541_alloc_page(struct agp_bridge_data *bridge)
145 void *addr = agp_generic_alloc_page(agp_bridge); 145 void *addr = agp_generic_alloc_page(agp_bridge);
146 u32 temp; 146 u32 temp;
147 147
148 global_flush_tlb();
149 if (!addr) 148 if (!addr)
150 return NULL; 149 return NULL;
151 150
@@ -162,7 +161,6 @@ static void ali_destroy_page(void * addr, int flags)
162 if (flags & AGP_PAGE_DESTROY_UNMAP) { 161 if (flags & AGP_PAGE_DESTROY_UNMAP) {
163 global_cache_flush(); /* is this really needed? --hch */ 162 global_cache_flush(); /* is this really needed? --hch */
164 agp_generic_destroy_page(addr, flags); 163 agp_generic_destroy_page(addr, flags);
165 global_flush_tlb();
166 } else 164 } else
167 agp_generic_destroy_page(addr, flags); 165 agp_generic_destroy_page(addr, flags);
168 } 166 }
diff --git a/drivers/char/agp/backend.c b/drivers/char/agp/backend.c
index 832ded20fe70..2720882e66fe 100644
--- a/drivers/char/agp/backend.c
+++ b/drivers/char/agp/backend.c
@@ -147,7 +147,6 @@ static int agp_backend_initialize(struct agp_bridge_data *bridge)
147 printk(KERN_ERR PFX "unable to get memory for scratch page.\n"); 147 printk(KERN_ERR PFX "unable to get memory for scratch page.\n");
148 return -ENOMEM; 148 return -ENOMEM;
149 } 149 }
150 flush_agp_mappings();
151 150
152 bridge->scratch_page_real = virt_to_gart(addr); 151 bridge->scratch_page_real = virt_to_gart(addr);
153 bridge->scratch_page = 152 bridge->scratch_page =
@@ -191,7 +190,6 @@ err_out:
191 if (bridge->driver->needs_scratch_page) { 190 if (bridge->driver->needs_scratch_page) {
192 bridge->driver->agp_destroy_page(gart_to_virt(bridge->scratch_page_real), 191 bridge->driver->agp_destroy_page(gart_to_virt(bridge->scratch_page_real),
193 AGP_PAGE_DESTROY_UNMAP); 192 AGP_PAGE_DESTROY_UNMAP);
194 flush_agp_mappings();
195 bridge->driver->agp_destroy_page(gart_to_virt(bridge->scratch_page_real), 193 bridge->driver->agp_destroy_page(gart_to_virt(bridge->scratch_page_real),
196 AGP_PAGE_DESTROY_FREE); 194 AGP_PAGE_DESTROY_FREE);
197 } 195 }
@@ -219,7 +217,6 @@ static void agp_backend_cleanup(struct agp_bridge_data *bridge)
219 bridge->driver->needs_scratch_page) { 217 bridge->driver->needs_scratch_page) {
220 bridge->driver->agp_destroy_page(gart_to_virt(bridge->scratch_page_real), 218 bridge->driver->agp_destroy_page(gart_to_virt(bridge->scratch_page_real),
221 AGP_PAGE_DESTROY_UNMAP); 219 AGP_PAGE_DESTROY_UNMAP);
222 flush_agp_mappings();
223 bridge->driver->agp_destroy_page(gart_to_virt(bridge->scratch_page_real), 220 bridge->driver->agp_destroy_page(gart_to_virt(bridge->scratch_page_real),
224 AGP_PAGE_DESTROY_FREE); 221 AGP_PAGE_DESTROY_FREE);
225 } 222 }
diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c
index 64b2f6d7059d..1a4674ce0c71 100644
--- a/drivers/char/agp/generic.c
+++ b/drivers/char/agp/generic.c
@@ -197,7 +197,6 @@ void agp_free_memory(struct agp_memory *curr)
197 for (i = 0; i < curr->page_count; i++) { 197 for (i = 0; i < curr->page_count; i++) {
198 curr->bridge->driver->agp_destroy_page(gart_to_virt(curr->memory[i]), AGP_PAGE_DESTROY_UNMAP); 198 curr->bridge->driver->agp_destroy_page(gart_to_virt(curr->memory[i]), AGP_PAGE_DESTROY_UNMAP);
199 } 199 }
200 flush_agp_mappings();
201 for (i = 0; i < curr->page_count; i++) { 200 for (i = 0; i < curr->page_count; i++) {
202 curr->bridge->driver->agp_destroy_page(gart_to_virt(curr->memory[i]), AGP_PAGE_DESTROY_FREE); 201 curr->bridge->driver->agp_destroy_page(gart_to_virt(curr->memory[i]), AGP_PAGE_DESTROY_FREE);
203 } 202 }
@@ -267,8 +266,6 @@ struct agp_memory *agp_allocate_memory(struct agp_bridge_data *bridge,
267 } 266 }
268 new->bridge = bridge; 267 new->bridge = bridge;
269 268
270 flush_agp_mappings();
271
272 return new; 269 return new;
273} 270}
274EXPORT_SYMBOL(agp_allocate_memory); 271EXPORT_SYMBOL(agp_allocate_memory);
diff --git a/drivers/char/agp/i460-agp.c b/drivers/char/agp/i460-agp.c
index e72a83e2bad5..76f581c85a7d 100644
--- a/drivers/char/agp/i460-agp.c
+++ b/drivers/char/agp/i460-agp.c
@@ -527,7 +527,6 @@ static void *i460_alloc_page (struct agp_bridge_data *bridge)
527 527
528 if (I460_IO_PAGE_SHIFT <= PAGE_SHIFT) { 528 if (I460_IO_PAGE_SHIFT <= PAGE_SHIFT) {
529 page = agp_generic_alloc_page(agp_bridge); 529 page = agp_generic_alloc_page(agp_bridge);
530 global_flush_tlb();
531 } else 530 } else
532 /* Returning NULL would cause problems */ 531 /* Returning NULL would cause problems */
533 /* AK: really dubious code. */ 532 /* AK: really dubious code. */
@@ -539,7 +538,6 @@ static void i460_destroy_page (void *page, int flags)
539{ 538{
540 if (I460_IO_PAGE_SHIFT <= PAGE_SHIFT) { 539 if (I460_IO_PAGE_SHIFT <= PAGE_SHIFT) {
541 agp_generic_destroy_page(page, flags); 540 agp_generic_destroy_page(page, flags);
542 global_flush_tlb();
543 } 541 }
544} 542}
545 543
diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c
index 03eac1eb8e0f..189efb6ef970 100644
--- a/drivers/char/agp/intel-agp.c
+++ b/drivers/char/agp/intel-agp.c
@@ -210,13 +210,11 @@ static void *i8xx_alloc_pages(void)
210 if (page == NULL) 210 if (page == NULL)
211 return NULL; 211 return NULL;
212 212
213 if (change_page_attr(page, 4, PAGE_KERNEL_NOCACHE) < 0) { 213 if (set_pages_uc(page, 4) < 0) {
214 change_page_attr(page, 4, PAGE_KERNEL); 214 set_pages_wb(page, 4);
215 global_flush_tlb();
216 __free_pages(page, 2); 215 __free_pages(page, 2);
217 return NULL; 216 return NULL;
218 } 217 }
219 global_flush_tlb();
220 get_page(page); 218 get_page(page);
221 atomic_inc(&agp_bridge->current_memory_agp); 219 atomic_inc(&agp_bridge->current_memory_agp);
222 return page_address(page); 220 return page_address(page);
@@ -230,8 +228,7 @@ static void i8xx_destroy_pages(void *addr)
230 return; 228 return;
231 229
232 page = virt_to_page(addr); 230 page = virt_to_page(addr);
233 change_page_attr(page, 4, PAGE_KERNEL); 231 set_pages_wb(page, 4);
234 global_flush_tlb();
235 put_page(page); 232 put_page(page);
236 __free_pages(page, 2); 233 __free_pages(page, 2);
237 atomic_dec(&agp_bridge->current_memory_agp); 234 atomic_dec(&agp_bridge->current_memory_agp);
@@ -341,7 +338,6 @@ static struct agp_memory *alloc_agpphysmem_i8xx(size_t pg_count, int type)
341 338
342 switch (pg_count) { 339 switch (pg_count) {
343 case 1: addr = agp_bridge->driver->agp_alloc_page(agp_bridge); 340 case 1: addr = agp_bridge->driver->agp_alloc_page(agp_bridge);
344 global_flush_tlb();
345 break; 341 break;
346 case 4: 342 case 4:
347 /* kludge to get 4 physical pages for ARGB cursor */ 343 /* kludge to get 4 physical pages for ARGB cursor */
@@ -404,7 +400,6 @@ static void intel_i810_free_by_type(struct agp_memory *curr)
404 else { 400 else {
405 agp_bridge->driver->agp_destroy_page(gart_to_virt(curr->memory[0]), 401 agp_bridge->driver->agp_destroy_page(gart_to_virt(curr->memory[0]),
406 AGP_PAGE_DESTROY_UNMAP); 402 AGP_PAGE_DESTROY_UNMAP);
407 global_flush_tlb();
408 agp_bridge->driver->agp_destroy_page(gart_to_virt(curr->memory[0]), 403 agp_bridge->driver->agp_destroy_page(gart_to_virt(curr->memory[0]),
409 AGP_PAGE_DESTROY_FREE); 404 AGP_PAGE_DESTROY_FREE);
410 } 405 }
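Across the AGP drivers above, each change_page_attr() plus explicit global_flush_tlb() pair is replaced by set_pages_uc()/set_pages_wb(), which take care of the required flushing themselves, which is why the standalone flush calls disappear. Roughly the shape the i8xx allocation path ends up with; a sketch only, with header locations and GFP flags assumed rather than checked, and the reference counting trimmed:

#include <linux/mm.h>
#include <linux/gfp.h>
#include <asm/cacheflush.h>

/* Grab four contiguous pages and map them uncached; undo both on free. */
static void *alloc_uncached_quad(void)
{
	struct page *page = alloc_pages(GFP_KERNEL | GFP_DMA32, 2);

	if (!page)
		return NULL;

	if (set_pages_uc(page, 4) < 0) {
		set_pages_wb(page, 4);		/* no global_flush_tlb() needed */
		__free_pages(page, 2);
		return NULL;
	}
	return page_address(page);
}

static void free_uncached_quad(void *addr)
{
	struct page *page = virt_to_page(addr);

	set_pages_wb(page, 4);			/* back to write-back first */
	__free_pages(page, 2);
}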
diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c
index 4c16778e3f84..465ad35ed38f 100644
--- a/drivers/char/hpet.c
+++ b/drivers/char/hpet.c
@@ -600,63 +600,6 @@ static int hpet_is_known(struct hpet_data *hdp)
600 return 0; 600 return 0;
601} 601}
602 602
603EXPORT_SYMBOL(hpet_alloc);
604EXPORT_SYMBOL(hpet_register);
605EXPORT_SYMBOL(hpet_unregister);
606EXPORT_SYMBOL(hpet_control);
607
608int hpet_register(struct hpet_task *tp, int periodic)
609{
610 unsigned int i;
611 u64 mask;
612 struct hpet_timer __iomem *timer;
613 struct hpet_dev *devp;
614 struct hpets *hpetp;
615
616 switch (periodic) {
617 case 1:
618 mask = Tn_PER_INT_CAP_MASK;
619 break;
620 case 0:
621 mask = 0;
622 break;
623 default:
624 return -EINVAL;
625 }
626
627 tp->ht_opaque = NULL;
628
629 spin_lock_irq(&hpet_task_lock);
630 spin_lock(&hpet_lock);
631
632 for (devp = NULL, hpetp = hpets; hpetp && !devp; hpetp = hpetp->hp_next)
633 for (timer = hpetp->hp_hpet->hpet_timers, i = 0;
634 i < hpetp->hp_ntimer; i++, timer++) {
635 if ((readq(&timer->hpet_config) & Tn_PER_INT_CAP_MASK)
636 != mask)
637 continue;
638
639 devp = &hpetp->hp_dev[i];
640
641 if (devp->hd_flags & HPET_OPEN || devp->hd_task) {
642 devp = NULL;
643 continue;
644 }
645
646 tp->ht_opaque = devp;
647 devp->hd_task = tp;
648 break;
649 }
650
651 spin_unlock(&hpet_lock);
652 spin_unlock_irq(&hpet_task_lock);
653
654 if (tp->ht_opaque)
655 return 0;
656 else
657 return -EBUSY;
658}
659
660static inline int hpet_tpcheck(struct hpet_task *tp) 603static inline int hpet_tpcheck(struct hpet_task *tp)
661{ 604{
662 struct hpet_dev *devp; 605 struct hpet_dev *devp;
@@ -706,24 +649,6 @@ int hpet_unregister(struct hpet_task *tp)
706 return 0; 649 return 0;
707} 650}
708 651
709int hpet_control(struct hpet_task *tp, unsigned int cmd, unsigned long arg)
710{
711 struct hpet_dev *devp;
712 int err;
713
714 if ((err = hpet_tpcheck(tp)))
715 return err;
716
717 spin_lock_irq(&hpet_lock);
718 devp = tp->ht_opaque;
719 if (devp->hd_task != tp) {
720 spin_unlock_irq(&hpet_lock);
721 return -ENXIO;
722 }
723 spin_unlock_irq(&hpet_lock);
724 return hpet_ioctl_common(devp, cmd, arg, 1);
725}
726
727static ctl_table hpet_table[] = { 652static ctl_table hpet_table[] = {
728 { 653 {
729 .ctl_name = CTL_UNNUMBERED, 654 .ctl_name = CTL_UNNUMBERED,
@@ -806,14 +731,14 @@ static unsigned long hpet_calibrate(struct hpets *hpetp)
806 731
807int hpet_alloc(struct hpet_data *hdp) 732int hpet_alloc(struct hpet_data *hdp)
808{ 733{
809 u64 cap, mcfg; 734 u64 cap, mcfg, hpet_config;
810 struct hpet_dev *devp; 735 struct hpet_dev *devp;
811 u32 i, ntimer; 736 u32 i, ntimer, irq;
812 struct hpets *hpetp; 737 struct hpets *hpetp;
813 size_t siz; 738 size_t siz;
814 struct hpet __iomem *hpet; 739 struct hpet __iomem *hpet;
815 static struct hpets *last = NULL; 740 static struct hpets *last = NULL;
816 unsigned long period; 741 unsigned long period, irq_bitmap;
817 unsigned long long temp; 742 unsigned long long temp;
818 743
819 /* 744 /*
@@ -840,11 +765,47 @@ int hpet_alloc(struct hpet_data *hdp)
840 hpetp->hp_hpet_phys = hdp->hd_phys_address; 765 hpetp->hp_hpet_phys = hdp->hd_phys_address;
841 766
842 hpetp->hp_ntimer = hdp->hd_nirqs; 767 hpetp->hp_ntimer = hdp->hd_nirqs;
768 hpet = hpetp->hp_hpet;
843 769
844 for (i = 0; i < hdp->hd_nirqs; i++) 770 /* Assign IRQs statically for legacy devices */
845 hpetp->hp_dev[i].hd_hdwirq = hdp->hd_irq[i]; 771 hpetp->hp_dev[0].hd_hdwirq = hdp->hd_irq[0];
772 hpetp->hp_dev[1].hd_hdwirq = hdp->hd_irq[1];
846 773
847 hpet = hpetp->hp_hpet; 774 /* Assign IRQs dynamically for the others */
775 for (i = 2, devp = &hpetp->hp_dev[2]; i < hdp->hd_nirqs; i++, devp++) {
776 struct hpet_timer __iomem *timer;
777
778 timer = &hpet->hpet_timers[devp - hpetp->hp_dev];
779
780 /* Check if there's already an IRQ assigned to the timer */
781 if (hdp->hd_irq[i]) {
782 hpetp->hp_dev[i].hd_hdwirq = hdp->hd_irq[i];
783 continue;
784 }
785
786 hpet_config = readq(&timer->hpet_config);
787 irq_bitmap = (hpet_config & Tn_INT_ROUTE_CAP_MASK)
788 >> Tn_INT_ROUTE_CAP_SHIFT;
789 if (!irq_bitmap)
790 irq = 0; /* No valid IRQ Assignable */
791 else {
792 irq = find_first_bit(&irq_bitmap, 32);
793 do {
794 hpet_config |= irq << Tn_INT_ROUTE_CNF_SHIFT;
795 writeq(hpet_config, &timer->hpet_config);
796
797 /*
798 * Verify whether we have written a valid
799 * IRQ number by reading it back again
800 */
801 hpet_config = readq(&timer->hpet_config);
802 if (irq == (hpet_config & Tn_INT_ROUTE_CNF_MASK)
803 >> Tn_INT_ROUTE_CNF_SHIFT)
804 break; /* Success */
805 } while ((irq = (find_next_bit(&irq_bitmap, 32, irq))));
806 }
807 hpetp->hp_dev[i].hd_hdwirq = irq;
808 }
848 809
849 cap = readq(&hpet->hpet_cap); 810 cap = readq(&hpet->hpet_cap);
850 811
@@ -875,7 +836,8 @@ int hpet_alloc(struct hpet_data *hdp)
875 hpetp->hp_which, hdp->hd_phys_address, 836 hpetp->hp_which, hdp->hd_phys_address,
876 hpetp->hp_ntimer > 1 ? "s" : ""); 837 hpetp->hp_ntimer > 1 ? "s" : "");
877 for (i = 0; i < hpetp->hp_ntimer; i++) 838 for (i = 0; i < hpetp->hp_ntimer; i++)
878 printk("%s %d", i > 0 ? "," : "", hdp->hd_irq[i]); 839 printk("%s %d", i > 0 ? "," : "",
840 hpetp->hp_dev[i].hd_hdwirq);
879 printk("\n"); 841 printk("\n");
880 842
881 printk(KERN_INFO "hpet%u: %u %d-bit timers, %Lu Hz\n", 843 printk(KERN_INFO "hpet%u: %u %d-bit timers, %Lu Hz\n",
diff --git a/drivers/char/rtc.c b/drivers/char/rtc.c
index 0c66b802736a..78b151c4d20f 100644
--- a/drivers/char/rtc.c
+++ b/drivers/char/rtc.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Real Time Clock interface for Linux 2 * Real Time Clock interface for Linux
3 * 3 *
4 * Copyright (C) 1996 Paul Gortmaker 4 * Copyright (C) 1996 Paul Gortmaker
5 * 5 *
@@ -17,7 +17,7 @@
17 * has been received. If a RTC interrupt has already happened, 17 * has been received. If a RTC interrupt has already happened,
18 * it will output an unsigned long and then block. The output value 18 * it will output an unsigned long and then block. The output value
19 * contains the interrupt status in the low byte and the number of 19 * contains the interrupt status in the low byte and the number of
20 * interrupts since the last read in the remaining high bytes. The 20 * interrupts since the last read in the remaining high bytes. The
21 * /dev/rtc interface can also be used with the select(2) call. 21 * /dev/rtc interface can also be used with the select(2) call.
22 * 22 *
23 * This program is free software; you can redistribute it and/or 23 * This program is free software; you can redistribute it and/or
@@ -104,12 +104,14 @@ static int rtc_has_irq = 1;
104 104
105#ifndef CONFIG_HPET_EMULATE_RTC 105#ifndef CONFIG_HPET_EMULATE_RTC
106#define is_hpet_enabled() 0 106#define is_hpet_enabled() 0
107#define hpet_set_alarm_time(hrs, min, sec) 0 107#define hpet_set_alarm_time(hrs, min, sec) 0
108#define hpet_set_periodic_freq(arg) 0 108#define hpet_set_periodic_freq(arg) 0
109#define hpet_mask_rtc_irq_bit(arg) 0 109#define hpet_mask_rtc_irq_bit(arg) 0
110#define hpet_set_rtc_irq_bit(arg) 0 110#define hpet_set_rtc_irq_bit(arg) 0
111#define hpet_rtc_timer_init() do { } while (0) 111#define hpet_rtc_timer_init() do { } while (0)
112#define hpet_rtc_dropped_irq() 0 112#define hpet_rtc_dropped_irq() 0
113#define hpet_register_irq_handler(h) 0
114#define hpet_unregister_irq_handler(h) 0
113#ifdef RTC_IRQ 115#ifdef RTC_IRQ
114static irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) 116static irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
115{ 117{
@@ -147,7 +149,7 @@ static int rtc_ioctl(struct inode *inode, struct file *file,
147static unsigned int rtc_poll(struct file *file, poll_table *wait); 149static unsigned int rtc_poll(struct file *file, poll_table *wait);
148#endif 150#endif
149 151
150static void get_rtc_alm_time (struct rtc_time *alm_tm); 152static void get_rtc_alm_time(struct rtc_time *alm_tm);
151#ifdef RTC_IRQ 153#ifdef RTC_IRQ
152static void set_rtc_irq_bit_locked(unsigned char bit); 154static void set_rtc_irq_bit_locked(unsigned char bit);
153static void mask_rtc_irq_bit_locked(unsigned char bit); 155static void mask_rtc_irq_bit_locked(unsigned char bit);
@@ -185,9 +187,9 @@ static int rtc_proc_open(struct inode *inode, struct file *file);
185 * rtc_status but before mod_timer is called, which would then reenable the 187 * rtc_status but before mod_timer is called, which would then reenable the
186 * timer (but you would need to have an awful timing before you'd trip on it) 188 * timer (but you would need to have an awful timing before you'd trip on it)
187 */ 189 */
188static unsigned long rtc_status = 0; /* bitmapped status byte. */ 190static unsigned long rtc_status; /* bitmapped status byte. */
189static unsigned long rtc_freq = 0; /* Current periodic IRQ rate */ 191static unsigned long rtc_freq; /* Current periodic IRQ rate */
190static unsigned long rtc_irq_data = 0; /* our output to the world */ 192static unsigned long rtc_irq_data; /* our output to the world */
191static unsigned long rtc_max_user_freq = 64; /* > this, need CAP_SYS_RESOURCE */ 193static unsigned long rtc_max_user_freq = 64; /* > this, need CAP_SYS_RESOURCE */
192 194
193#ifdef RTC_IRQ 195#ifdef RTC_IRQ
@@ -195,7 +197,7 @@ static unsigned long rtc_max_user_freq = 64; /* > this, need CAP_SYS_RESOURCE */
195 * rtc_task_lock nests inside rtc_lock. 197 * rtc_task_lock nests inside rtc_lock.
196 */ 198 */
197static DEFINE_SPINLOCK(rtc_task_lock); 199static DEFINE_SPINLOCK(rtc_task_lock);
198static rtc_task_t *rtc_callback = NULL; 200static rtc_task_t *rtc_callback;
199#endif 201#endif
200 202
201/* 203/*
@@ -205,7 +207,7 @@ static rtc_task_t *rtc_callback = NULL;
205 207
206static unsigned long epoch = 1900; /* year corresponding to 0x00 */ 208static unsigned long epoch = 1900; /* year corresponding to 0x00 */
207 209
208static const unsigned char days_in_mo[] = 210static const unsigned char days_in_mo[] =
209{0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}; 211{0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31};
210 212
211/* 213/*
@@ -242,7 +244,7 @@ irqreturn_t rtc_interrupt(int irq, void *dev_id)
242 * the last read in the remainder of rtc_irq_data. 244 * the last read in the remainder of rtc_irq_data.
243 */ 245 */
244 246
245 spin_lock (&rtc_lock); 247 spin_lock(&rtc_lock);
246 rtc_irq_data += 0x100; 248 rtc_irq_data += 0x100;
247 rtc_irq_data &= ~0xff; 249 rtc_irq_data &= ~0xff;
248 if (is_hpet_enabled()) { 250 if (is_hpet_enabled()) {
@@ -259,16 +261,16 @@ irqreturn_t rtc_interrupt(int irq, void *dev_id)
259 if (rtc_status & RTC_TIMER_ON) 261 if (rtc_status & RTC_TIMER_ON)
260 mod_timer(&rtc_irq_timer, jiffies + HZ/rtc_freq + 2*HZ/100); 262 mod_timer(&rtc_irq_timer, jiffies + HZ/rtc_freq + 2*HZ/100);
261 263
262 spin_unlock (&rtc_lock); 264 spin_unlock(&rtc_lock);
263 265
264 /* Now do the rest of the actions */ 266 /* Now do the rest of the actions */
265 spin_lock(&rtc_task_lock); 267 spin_lock(&rtc_task_lock);
266 if (rtc_callback) 268 if (rtc_callback)
267 rtc_callback->func(rtc_callback->private_data); 269 rtc_callback->func(rtc_callback->private_data);
268 spin_unlock(&rtc_task_lock); 270 spin_unlock(&rtc_task_lock);
269 wake_up_interruptible(&rtc_wait); 271 wake_up_interruptible(&rtc_wait);
270 272
271 kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); 273 kill_fasync(&rtc_async_queue, SIGIO, POLL_IN);
272 274
273 return IRQ_HANDLED; 275 return IRQ_HANDLED;
274} 276}
@@ -335,7 +337,7 @@ static ssize_t rtc_read(struct file *file, char __user *buf,
335 DECLARE_WAITQUEUE(wait, current); 337 DECLARE_WAITQUEUE(wait, current);
336 unsigned long data; 338 unsigned long data;
337 ssize_t retval; 339 ssize_t retval;
338 340
339 if (rtc_has_irq == 0) 341 if (rtc_has_irq == 0)
340 return -EIO; 342 return -EIO;
341 343
@@ -358,11 +360,11 @@ static ssize_t rtc_read(struct file *file, char __user *buf,
358 * confusing. And no, xchg() is not the answer. */ 360 * confusing. And no, xchg() is not the answer. */
359 361
360 __set_current_state(TASK_INTERRUPTIBLE); 362 __set_current_state(TASK_INTERRUPTIBLE);
361 363
362 spin_lock_irq (&rtc_lock); 364 spin_lock_irq(&rtc_lock);
363 data = rtc_irq_data; 365 data = rtc_irq_data;
364 rtc_irq_data = 0; 366 rtc_irq_data = 0;
365 spin_unlock_irq (&rtc_lock); 367 spin_unlock_irq(&rtc_lock);
366 368
367 if (data != 0) 369 if (data != 0)
368 break; 370 break;
@@ -378,10 +380,13 @@ static ssize_t rtc_read(struct file *file, char __user *buf,
378 schedule(); 380 schedule();
379 } while (1); 381 } while (1);
380 382
381 if (count == sizeof(unsigned int)) 383 if (count == sizeof(unsigned int)) {
382 retval = put_user(data, (unsigned int __user *)buf) ?: sizeof(int); 384 retval = put_user(data,
383 else 385 (unsigned int __user *)buf) ?: sizeof(int);
384 retval = put_user(data, (unsigned long __user *)buf) ?: sizeof(long); 386 } else {
387 retval = put_user(data,
388 (unsigned long __user *)buf) ?: sizeof(long);
389 }
385 if (!retval) 390 if (!retval)
386 retval = count; 391 retval = count;
387 out: 392 out:
@@ -394,7 +399,7 @@ static ssize_t rtc_read(struct file *file, char __user *buf,
394 399
395static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel) 400static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel)
396{ 401{
397 struct rtc_time wtime; 402 struct rtc_time wtime;
398 403
399#ifdef RTC_IRQ 404#ifdef RTC_IRQ
400 if (rtc_has_irq == 0) { 405 if (rtc_has_irq == 0) {
@@ -426,35 +431,41 @@ static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel)
426 } 431 }
427 case RTC_PIE_OFF: /* Mask periodic int. enab. bit */ 432 case RTC_PIE_OFF: /* Mask periodic int. enab. bit */
428 { 433 {
429 unsigned long flags; /* can be called from isr via rtc_control() */ 434 /* can be called from isr via rtc_control() */
430 spin_lock_irqsave (&rtc_lock, flags); 435 unsigned long flags;
436
437 spin_lock_irqsave(&rtc_lock, flags);
431 mask_rtc_irq_bit_locked(RTC_PIE); 438 mask_rtc_irq_bit_locked(RTC_PIE);
432 if (rtc_status & RTC_TIMER_ON) { 439 if (rtc_status & RTC_TIMER_ON) {
433 rtc_status &= ~RTC_TIMER_ON; 440 rtc_status &= ~RTC_TIMER_ON;
434 del_timer(&rtc_irq_timer); 441 del_timer(&rtc_irq_timer);
435 } 442 }
436 spin_unlock_irqrestore (&rtc_lock, flags); 443 spin_unlock_irqrestore(&rtc_lock, flags);
444
437 return 0; 445 return 0;
438 } 446 }
439 case RTC_PIE_ON: /* Allow periodic ints */ 447 case RTC_PIE_ON: /* Allow periodic ints */
440 { 448 {
441 unsigned long flags; /* can be called from isr via rtc_control() */ 449 /* can be called from isr via rtc_control() */
450 unsigned long flags;
451
442 /* 452 /*
443 * We don't really want Joe User enabling more 453 * We don't really want Joe User enabling more
444 * than 64Hz of interrupts on a multi-user machine. 454 * than 64Hz of interrupts on a multi-user machine.
445 */ 455 */
446 if (!kernel && (rtc_freq > rtc_max_user_freq) && 456 if (!kernel && (rtc_freq > rtc_max_user_freq) &&
447 (!capable(CAP_SYS_RESOURCE))) 457 (!capable(CAP_SYS_RESOURCE)))
448 return -EACCES; 458 return -EACCES;
449 459
450 spin_lock_irqsave (&rtc_lock, flags); 460 spin_lock_irqsave(&rtc_lock, flags);
451 if (!(rtc_status & RTC_TIMER_ON)) { 461 if (!(rtc_status & RTC_TIMER_ON)) {
452 mod_timer(&rtc_irq_timer, jiffies + HZ/rtc_freq + 462 mod_timer(&rtc_irq_timer, jiffies + HZ/rtc_freq +
453 2*HZ/100); 463 2*HZ/100);
454 rtc_status |= RTC_TIMER_ON; 464 rtc_status |= RTC_TIMER_ON;
455 } 465 }
456 set_rtc_irq_bit_locked(RTC_PIE); 466 set_rtc_irq_bit_locked(RTC_PIE);
457 spin_unlock_irqrestore (&rtc_lock, flags); 467 spin_unlock_irqrestore(&rtc_lock, flags);
468
458 return 0; 469 return 0;
459 } 470 }
460 case RTC_UIE_OFF: /* Mask ints from RTC updates. */ 471 case RTC_UIE_OFF: /* Mask ints from RTC updates. */
@@ -477,7 +488,7 @@ static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel)
477 */ 488 */
478 memset(&wtime, 0, sizeof(struct rtc_time)); 489 memset(&wtime, 0, sizeof(struct rtc_time));
479 get_rtc_alm_time(&wtime); 490 get_rtc_alm_time(&wtime);
480 break; 491 break;
481 } 492 }
482 case RTC_ALM_SET: /* Store a time into the alarm */ 493 case RTC_ALM_SET: /* Store a time into the alarm */
483 { 494 {
@@ -505,16 +516,21 @@ static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel)
505 */ 516 */
506 } 517 }
507 if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || 518 if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) ||
508 RTC_ALWAYS_BCD) 519 RTC_ALWAYS_BCD) {
509 { 520 if (sec < 60)
510 if (sec < 60) BIN_TO_BCD(sec); 521 BIN_TO_BCD(sec);
511 else sec = 0xff; 522 else
512 523 sec = 0xff;
513 if (min < 60) BIN_TO_BCD(min); 524
514 else min = 0xff; 525 if (min < 60)
515 526 BIN_TO_BCD(min);
516 if (hrs < 24) BIN_TO_BCD(hrs); 527 else
517 else hrs = 0xff; 528 min = 0xff;
529
530 if (hrs < 24)
531 BIN_TO_BCD(hrs);
532 else
533 hrs = 0xff;
518 } 534 }
519 CMOS_WRITE(hrs, RTC_HOURS_ALARM); 535 CMOS_WRITE(hrs, RTC_HOURS_ALARM);
520 CMOS_WRITE(min, RTC_MINUTES_ALARM); 536 CMOS_WRITE(min, RTC_MINUTES_ALARM);
@@ -563,11 +579,12 @@ static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel)
563 579
564 if (day > (days_in_mo[mon] + ((mon == 2) && leap_yr))) 580 if (day > (days_in_mo[mon] + ((mon == 2) && leap_yr)))
565 return -EINVAL; 581 return -EINVAL;
566 582
567 if ((hrs >= 24) || (min >= 60) || (sec >= 60)) 583 if ((hrs >= 24) || (min >= 60) || (sec >= 60))
568 return -EINVAL; 584 return -EINVAL;
569 585
570 if ((yrs -= epoch) > 255) /* They are unsigned */ 586 yrs -= epoch;
587 if (yrs > 255) /* They are unsigned */
571 return -EINVAL; 588 return -EINVAL;
572 589
573 spin_lock_irq(&rtc_lock); 590 spin_lock_irq(&rtc_lock);
@@ -635,9 +652,10 @@ static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel)
635 { 652 {
636 int tmp = 0; 653 int tmp = 0;
637 unsigned char val; 654 unsigned char val;
638 unsigned long flags; /* can be called from isr via rtc_control() */ 655 /* can be called from isr via rtc_control() */
656 unsigned long flags;
639 657
640 /* 658 /*
641 * The max we can do is 8192Hz. 659 * The max we can do is 8192Hz.
642 */ 660 */
643 if ((arg < 2) || (arg > 8192)) 661 if ((arg < 2) || (arg > 8192))
@@ -646,7 +664,8 @@ static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel)
646 * We don't really want Joe User generating more 664 * We don't really want Joe User generating more
647 * than 64Hz of interrupts on a multi-user machine. 665 * than 64Hz of interrupts on a multi-user machine.
648 */ 666 */
649 if (!kernel && (arg > rtc_max_user_freq) && (!capable(CAP_SYS_RESOURCE))) 667 if (!kernel && (arg > rtc_max_user_freq) &&
668 !capable(CAP_SYS_RESOURCE))
650 return -EACCES; 669 return -EACCES;
651 670
652 while (arg > (1<<tmp)) 671 while (arg > (1<<tmp))
@@ -674,11 +693,11 @@ static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel)
674#endif 693#endif
675 case RTC_EPOCH_READ: /* Read the epoch. */ 694 case RTC_EPOCH_READ: /* Read the epoch. */
676 { 695 {
677 return put_user (epoch, (unsigned long __user *)arg); 696 return put_user(epoch, (unsigned long __user *)arg);
678 } 697 }
679 case RTC_EPOCH_SET: /* Set the epoch. */ 698 case RTC_EPOCH_SET: /* Set the epoch. */
680 { 699 {
681 /* 700 /*
682 * There were no RTC clocks before 1900. 701 * There were no RTC clocks before 1900.
683 */ 702 */
684 if (arg < 1900) 703 if (arg < 1900)
@@ -693,7 +712,8 @@ static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel)
693 default: 712 default:
694 return -ENOTTY; 713 return -ENOTTY;
695 } 714 }
696 return copy_to_user((void __user *)arg, &wtime, sizeof wtime) ? -EFAULT : 0; 715 return copy_to_user((void __user *)arg,
716 &wtime, sizeof wtime) ? -EFAULT : 0;
697} 717}
698 718
699static int rtc_ioctl(struct inode *inode, struct file *file, unsigned int cmd, 719static int rtc_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
@@ -712,26 +732,25 @@ static int rtc_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
712 * needed here. Or anywhere else in this driver. */ 732 * needed here. Or anywhere else in this driver. */
713static int rtc_open(struct inode *inode, struct file *file) 733static int rtc_open(struct inode *inode, struct file *file)
714{ 734{
715 spin_lock_irq (&rtc_lock); 735 spin_lock_irq(&rtc_lock);
716 736
717 if(rtc_status & RTC_IS_OPEN) 737 if (rtc_status & RTC_IS_OPEN)
718 goto out_busy; 738 goto out_busy;
719 739
720 rtc_status |= RTC_IS_OPEN; 740 rtc_status |= RTC_IS_OPEN;
721 741
722 rtc_irq_data = 0; 742 rtc_irq_data = 0;
723 spin_unlock_irq (&rtc_lock); 743 spin_unlock_irq(&rtc_lock);
724 return 0; 744 return 0;
725 745
726out_busy: 746out_busy:
727 spin_unlock_irq (&rtc_lock); 747 spin_unlock_irq(&rtc_lock);
728 return -EBUSY; 748 return -EBUSY;
729} 749}
730 750
731static int rtc_fasync (int fd, struct file *filp, int on) 751static int rtc_fasync(int fd, struct file *filp, int on)
732
733{ 752{
734 return fasync_helper (fd, filp, on, &rtc_async_queue); 753 return fasync_helper(fd, filp, on, &rtc_async_queue);
735} 754}
736 755
737static int rtc_release(struct inode *inode, struct file *file) 756static int rtc_release(struct inode *inode, struct file *file)
@@ -762,16 +781,16 @@ static int rtc_release(struct inode *inode, struct file *file)
762 } 781 }
763 spin_unlock_irq(&rtc_lock); 782 spin_unlock_irq(&rtc_lock);
764 783
765 if (file->f_flags & FASYNC) { 784 if (file->f_flags & FASYNC)
766 rtc_fasync (-1, file, 0); 785 rtc_fasync(-1, file, 0);
767 }
768no_irq: 786no_irq:
769#endif 787#endif
770 788
771 spin_lock_irq (&rtc_lock); 789 spin_lock_irq(&rtc_lock);
772 rtc_irq_data = 0; 790 rtc_irq_data = 0;
773 rtc_status &= ~RTC_IS_OPEN; 791 rtc_status &= ~RTC_IS_OPEN;
774 spin_unlock_irq (&rtc_lock); 792 spin_unlock_irq(&rtc_lock);
793
775 return 0; 794 return 0;
776} 795}
777 796
@@ -786,9 +805,9 @@ static unsigned int rtc_poll(struct file *file, poll_table *wait)
786 805
787 poll_wait(file, &rtc_wait, wait); 806 poll_wait(file, &rtc_wait, wait);
788 807
789 spin_lock_irq (&rtc_lock); 808 spin_lock_irq(&rtc_lock);
790 l = rtc_irq_data; 809 l = rtc_irq_data;
791 spin_unlock_irq (&rtc_lock); 810 spin_unlock_irq(&rtc_lock);
792 811
793 if (l != 0) 812 if (l != 0)
794 return POLLIN | POLLRDNORM; 813 return POLLIN | POLLRDNORM;
@@ -796,14 +815,6 @@ static unsigned int rtc_poll(struct file *file, poll_table *wait)
796} 815}
797#endif 816#endif
798 817
799/*
800 * exported stuffs
801 */
802
803EXPORT_SYMBOL(rtc_register);
804EXPORT_SYMBOL(rtc_unregister);
805EXPORT_SYMBOL(rtc_control);
806
807int rtc_register(rtc_task_t *task) 818int rtc_register(rtc_task_t *task)
808{ 819{
809#ifndef RTC_IRQ 820#ifndef RTC_IRQ
@@ -829,6 +840,7 @@ int rtc_register(rtc_task_t *task)
829 return 0; 840 return 0;
830#endif 841#endif
831} 842}
843EXPORT_SYMBOL(rtc_register);
832 844
833int rtc_unregister(rtc_task_t *task) 845int rtc_unregister(rtc_task_t *task)
834{ 846{
@@ -845,7 +857,7 @@ int rtc_unregister(rtc_task_t *task)
845 return -ENXIO; 857 return -ENXIO;
846 } 858 }
847 rtc_callback = NULL; 859 rtc_callback = NULL;
848 860
849 /* disable controls */ 861 /* disable controls */
850 if (!hpet_mask_rtc_irq_bit(RTC_PIE | RTC_AIE | RTC_UIE)) { 862 if (!hpet_mask_rtc_irq_bit(RTC_PIE | RTC_AIE | RTC_UIE)) {
851 tmp = CMOS_READ(RTC_CONTROL); 863 tmp = CMOS_READ(RTC_CONTROL);
@@ -865,6 +877,7 @@ int rtc_unregister(rtc_task_t *task)
865 return 0; 877 return 0;
866#endif 878#endif
867} 879}
880EXPORT_SYMBOL(rtc_unregister);
868 881
869int rtc_control(rtc_task_t *task, unsigned int cmd, unsigned long arg) 882int rtc_control(rtc_task_t *task, unsigned int cmd, unsigned long arg)
870{ 883{
@@ -883,7 +896,7 @@ int rtc_control(rtc_task_t *task, unsigned int cmd, unsigned long arg)
883 return rtc_do_ioctl(cmd, arg, 1); 896 return rtc_do_ioctl(cmd, arg, 1);
884#endif 897#endif
885} 898}
886 899EXPORT_SYMBOL(rtc_control);
887 900
888/* 901/*
889 * The various file operations we support. 902 * The various file operations we support.
@@ -910,11 +923,11 @@ static struct miscdevice rtc_dev = {
910 923
911#ifdef CONFIG_PROC_FS 924#ifdef CONFIG_PROC_FS
912static const struct file_operations rtc_proc_fops = { 925static const struct file_operations rtc_proc_fops = {
913 .owner = THIS_MODULE, 926 .owner = THIS_MODULE,
914 .open = rtc_proc_open, 927 .open = rtc_proc_open,
915 .read = seq_read, 928 .read = seq_read,
916 .llseek = seq_lseek, 929 .llseek = seq_lseek,
917 .release = single_release, 930 .release = single_release,
918}; 931};
919#endif 932#endif
920 933
@@ -965,7 +978,7 @@ static int __init rtc_init(void)
965#ifdef CONFIG_SPARC32 978#ifdef CONFIG_SPARC32
966 for_each_ebus(ebus) { 979 for_each_ebus(ebus) {
967 for_each_ebusdev(edev, ebus) { 980 for_each_ebusdev(edev, ebus) {
968 if(strcmp(edev->prom_node->name, "rtc") == 0) { 981 if (strcmp(edev->prom_node->name, "rtc") == 0) {
969 rtc_port = edev->resource[0].start; 982 rtc_port = edev->resource[0].start;
970 rtc_irq = edev->irqs[0]; 983 rtc_irq = edev->irqs[0];
971 goto found; 984 goto found;
@@ -986,7 +999,8 @@ found:
986 * XXX Interrupt pin #7 in Espresso is shared between RTC and 999 * XXX Interrupt pin #7 in Espresso is shared between RTC and
987 * PCI Slot 2 INTA# (and some INTx# in Slot 1). 1000 * PCI Slot 2 INTA# (and some INTx# in Slot 1).
988 */ 1001 */
989 if (request_irq(rtc_irq, rtc_interrupt, IRQF_SHARED, "rtc", (void *)&rtc_port)) { 1002 if (request_irq(rtc_irq, rtc_interrupt, IRQF_SHARED, "rtc",
1003 (void *)&rtc_port)) {
990 rtc_has_irq = 0; 1004 rtc_has_irq = 0;
991 printk(KERN_ERR "rtc: cannot register IRQ %d\n", rtc_irq); 1005 printk(KERN_ERR "rtc: cannot register IRQ %d\n", rtc_irq);
992 return -EIO; 1006 return -EIO;
@@ -1015,16 +1029,26 @@ no_irq:
1015 1029
1016#ifdef RTC_IRQ 1030#ifdef RTC_IRQ
1017 if (is_hpet_enabled()) { 1031 if (is_hpet_enabled()) {
1032 int err;
1033
1018 rtc_int_handler_ptr = hpet_rtc_interrupt; 1034 rtc_int_handler_ptr = hpet_rtc_interrupt;
1035 err = hpet_register_irq_handler(rtc_interrupt);
1036 if (err != 0) {
1037 printk(KERN_WARNING "hpet_register_irq_handler failed "
1038 "in rtc_init().");
1039 return err;
1040 }
1019 } else { 1041 } else {
1020 rtc_int_handler_ptr = rtc_interrupt; 1042 rtc_int_handler_ptr = rtc_interrupt;
1021 } 1043 }
1022 1044
1023 if(request_irq(RTC_IRQ, rtc_int_handler_ptr, IRQF_DISABLED, "rtc", NULL)) { 1045 if (request_irq(RTC_IRQ, rtc_int_handler_ptr, IRQF_DISABLED,
1046 "rtc", NULL)) {
1024 /* Yeah right, seeing as irq 8 doesn't even hit the bus. */ 1047 /* Yeah right, seeing as irq 8 doesn't even hit the bus. */
1025 rtc_has_irq = 0; 1048 rtc_has_irq = 0;
1026 printk(KERN_ERR "rtc: IRQ %d is not free.\n", RTC_IRQ); 1049 printk(KERN_ERR "rtc: IRQ %d is not free.\n", RTC_IRQ);
1027 rtc_release_region(); 1050 rtc_release_region();
1051
1028 return -EIO; 1052 return -EIO;
1029 } 1053 }
1030 hpet_rtc_timer_init(); 1054 hpet_rtc_timer_init();
@@ -1036,6 +1060,7 @@ no_irq:
1036 if (misc_register(&rtc_dev)) { 1060 if (misc_register(&rtc_dev)) {
1037#ifdef RTC_IRQ 1061#ifdef RTC_IRQ
1038 free_irq(RTC_IRQ, NULL); 1062 free_irq(RTC_IRQ, NULL);
1063 hpet_unregister_irq_handler(rtc_interrupt);
1039 rtc_has_irq = 0; 1064 rtc_has_irq = 0;
1040#endif 1065#endif
1041 rtc_release_region(); 1066 rtc_release_region();
@@ -1052,21 +1077,21 @@ no_irq:
1052 1077
1053#if defined(__alpha__) || defined(__mips__) 1078#if defined(__alpha__) || defined(__mips__)
1054 rtc_freq = HZ; 1079 rtc_freq = HZ;
1055 1080
1056 /* Each operating system on an Alpha uses its own epoch. 1081 /* Each operating system on an Alpha uses its own epoch.
1057 Let's try to guess which one we are using now. */ 1082 Let's try to guess which one we are using now. */
1058 1083
1059 if (rtc_is_updating() != 0) 1084 if (rtc_is_updating() != 0)
1060 msleep(20); 1085 msleep(20);
1061 1086
1062 spin_lock_irq(&rtc_lock); 1087 spin_lock_irq(&rtc_lock);
1063 year = CMOS_READ(RTC_YEAR); 1088 year = CMOS_READ(RTC_YEAR);
1064 ctrl = CMOS_READ(RTC_CONTROL); 1089 ctrl = CMOS_READ(RTC_CONTROL);
1065 spin_unlock_irq(&rtc_lock); 1090 spin_unlock_irq(&rtc_lock);
1066 1091
1067 if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD) 1092 if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
1068 BCD_TO_BIN(year); /* This should never happen... */ 1093 BCD_TO_BIN(year); /* This should never happen... */
1069 1094
1070 if (year < 20) { 1095 if (year < 20) {
1071 epoch = 2000; 1096 epoch = 2000;
1072 guess = "SRM (post-2000)"; 1097 guess = "SRM (post-2000)";
@@ -1087,7 +1112,8 @@ no_irq:
1087#endif 1112#endif
1088 } 1113 }
1089 if (guess) 1114 if (guess)
1090 printk(KERN_INFO "rtc: %s epoch (%lu) detected\n", guess, epoch); 1115 printk(KERN_INFO "rtc: %s epoch (%lu) detected\n",
1116 guess, epoch);
1091#endif 1117#endif
1092#ifdef RTC_IRQ 1118#ifdef RTC_IRQ
1093 if (rtc_has_irq == 0) 1119 if (rtc_has_irq == 0)
@@ -1096,8 +1122,12 @@ no_irq:
1096 spin_lock_irq(&rtc_lock); 1122 spin_lock_irq(&rtc_lock);
1097 rtc_freq = 1024; 1123 rtc_freq = 1024;
1098 if (!hpet_set_periodic_freq(rtc_freq)) { 1124 if (!hpet_set_periodic_freq(rtc_freq)) {
1099 /* Initialize periodic freq. to CMOS reset default, which is 1024Hz */ 1125 /*
1100 CMOS_WRITE(((CMOS_READ(RTC_FREQ_SELECT) & 0xF0) | 0x06), RTC_FREQ_SELECT); 1126 * Initialize periodic frequency to CMOS reset default,
1127 * which is 1024Hz
1128 */
1129 CMOS_WRITE(((CMOS_READ(RTC_FREQ_SELECT) & 0xF0) | 0x06),
1130 RTC_FREQ_SELECT);
1101 } 1131 }
1102 spin_unlock_irq(&rtc_lock); 1132 spin_unlock_irq(&rtc_lock);
1103no_irq2: 1133no_irq2:
@@ -1110,20 +1140,22 @@ no_irq2:
1110 return 0; 1140 return 0;
1111} 1141}
1112 1142
1113static void __exit rtc_exit (void) 1143static void __exit rtc_exit(void)
1114{ 1144{
1115 cleanup_sysctl(); 1145 cleanup_sysctl();
1116 remove_proc_entry ("driver/rtc", NULL); 1146 remove_proc_entry("driver/rtc", NULL);
1117 misc_deregister(&rtc_dev); 1147 misc_deregister(&rtc_dev);
1118 1148
1119#ifdef CONFIG_SPARC32 1149#ifdef CONFIG_SPARC32
1120 if (rtc_has_irq) 1150 if (rtc_has_irq)
1121 free_irq (rtc_irq, &rtc_port); 1151 free_irq(rtc_irq, &rtc_port);
1122#else 1152#else
1123 rtc_release_region(); 1153 rtc_release_region();
1124#ifdef RTC_IRQ 1154#ifdef RTC_IRQ
1125 if (rtc_has_irq) 1155 if (rtc_has_irq) {
1126 free_irq (RTC_IRQ, NULL); 1156 free_irq(RTC_IRQ, NULL);
1157 hpet_unregister_irq_handler(hpet_rtc_interrupt);
1158 }
1127#endif 1159#endif
1128#endif /* CONFIG_SPARC32 */ 1160#endif /* CONFIG_SPARC32 */
1129} 1161}
@@ -1133,14 +1165,14 @@ module_exit(rtc_exit);
1133 1165
1134#ifdef RTC_IRQ 1166#ifdef RTC_IRQ
1135/* 1167/*
1136 * At IRQ rates >= 4096Hz, an interrupt may get lost altogether. 1168 * At IRQ rates >= 4096Hz, an interrupt may get lost altogether.
1137 * (usually during an IDE disk interrupt, with IRQ unmasking off) 1169 * (usually during an IDE disk interrupt, with IRQ unmasking off)
1138 * Since the interrupt handler doesn't get called, the IRQ status 1170 * Since the interrupt handler doesn't get called, the IRQ status
1139 * byte doesn't get read, and the RTC stops generating interrupts. 1171 * byte doesn't get read, and the RTC stops generating interrupts.
1140 * A timer is set, and will call this function if/when that happens. 1172 * A timer is set, and will call this function if/when that happens.
1141 * To get it out of this stalled state, we just read the status. 1173 * To get it out of this stalled state, we just read the status.
1142 * At least a jiffy of interrupts (rtc_freq/HZ) will have been lost. 1174 * At least a jiffy of interrupts (rtc_freq/HZ) will have been lost.
1143 * (You *really* shouldn't be trying to use a non-realtime system 1175 * (You *really* shouldn't be trying to use a non-realtime system
1144 * for something that requires a steady > 1KHz signal anyways.) 1176 * for something that requires a steady > 1KHz signal anyways.)
1145 */ 1177 */
1146 1178
@@ -1148,7 +1180,7 @@ static void rtc_dropped_irq(unsigned long data)
1148{ 1180{
1149 unsigned long freq; 1181 unsigned long freq;
1150 1182
1151 spin_lock_irq (&rtc_lock); 1183 spin_lock_irq(&rtc_lock);
1152 1184
1153 if (hpet_rtc_dropped_irq()) { 1185 if (hpet_rtc_dropped_irq()) {
1154 spin_unlock_irq(&rtc_lock); 1186 spin_unlock_irq(&rtc_lock);
@@ -1167,13 +1199,15 @@ static void rtc_dropped_irq(unsigned long data)
1167 1199
1168 spin_unlock_irq(&rtc_lock); 1200 spin_unlock_irq(&rtc_lock);
1169 1201
1170 if (printk_ratelimit()) 1202 if (printk_ratelimit()) {
1171 printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", freq); 1203 printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n",
1204 freq);
1205 }
1172 1206
1173 /* Now we have new data */ 1207 /* Now we have new data */
1174 wake_up_interruptible(&rtc_wait); 1208 wake_up_interruptible(&rtc_wait);
1175 1209
1176 kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); 1210 kill_fasync(&rtc_async_queue, SIGIO, POLL_IN);
1177} 1211}
1178#endif 1212#endif
1179 1213
@@ -1277,7 +1311,7 @@ void rtc_get_rtc_time(struct rtc_time *rtc_tm)
1277 * can take just over 2ms. We wait 20ms. There is no need to 1311 * can take just over 2ms. We wait 20ms. There is no need to
1278 * to poll-wait (up to 1s - eeccch) for the falling edge of RTC_UIP. 1312 * to poll-wait (up to 1s - eeccch) for the falling edge of RTC_UIP.
1279 * If you need to know *exactly* when a second has started, enable 1313 * If you need to know *exactly* when a second has started, enable
1280 * periodic update complete interrupts, (via ioctl) and then 1314 * periodic update complete interrupts, (via ioctl) and then
1281 * immediately read /dev/rtc which will block until you get the IRQ. 1315 * immediately read /dev/rtc which will block until you get the IRQ.
1282 * Once the read clears, read the RTC time (again via ioctl). Easy. 1316 * Once the read clears, read the RTC time (again via ioctl). Easy.
1283 */ 1317 */
@@ -1307,8 +1341,7 @@ void rtc_get_rtc_time(struct rtc_time *rtc_tm)
1307 ctrl = CMOS_READ(RTC_CONTROL); 1341 ctrl = CMOS_READ(RTC_CONTROL);
1308 spin_unlock_irqrestore(&rtc_lock, flags); 1342 spin_unlock_irqrestore(&rtc_lock, flags);
1309 1343
1310 if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD) 1344 if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
1311 {
1312 BCD_TO_BIN(rtc_tm->tm_sec); 1345 BCD_TO_BIN(rtc_tm->tm_sec);
1313 BCD_TO_BIN(rtc_tm->tm_min); 1346 BCD_TO_BIN(rtc_tm->tm_min);
1314 BCD_TO_BIN(rtc_tm->tm_hour); 1347 BCD_TO_BIN(rtc_tm->tm_hour);
@@ -1326,7 +1359,8 @@ void rtc_get_rtc_time(struct rtc_time *rtc_tm)
1326 * Account for differences between how the RTC uses the values 1359 * Account for differences between how the RTC uses the values
1327 * and how they are defined in a struct rtc_time; 1360 * and how they are defined in a struct rtc_time;
1328 */ 1361 */
1329 if ((rtc_tm->tm_year += (epoch - 1900)) <= 69) 1362 rtc_tm->tm_year += epoch - 1900;
1363 if (rtc_tm->tm_year <= 69)
1330 rtc_tm->tm_year += 100; 1364 rtc_tm->tm_year += 100;
1331 1365
1332 rtc_tm->tm_mon--; 1366 rtc_tm->tm_mon--;
@@ -1347,8 +1381,7 @@ static void get_rtc_alm_time(struct rtc_time *alm_tm)
1347 ctrl = CMOS_READ(RTC_CONTROL); 1381 ctrl = CMOS_READ(RTC_CONTROL);
1348 spin_unlock_irq(&rtc_lock); 1382 spin_unlock_irq(&rtc_lock);
1349 1383
1350 if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD) 1384 if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
1351 {
1352 BCD_TO_BIN(alm_tm->tm_sec); 1385 BCD_TO_BIN(alm_tm->tm_sec);
1353 BCD_TO_BIN(alm_tm->tm_min); 1386 BCD_TO_BIN(alm_tm->tm_min);
1354 BCD_TO_BIN(alm_tm->tm_hour); 1387 BCD_TO_BIN(alm_tm->tm_hour);
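
The alarm paths above keep converting between binary and BCD because the CMOS RTC may store each time field as binary-coded decimal (unless RTC_DM_BINARY is set). As a standalone illustration of what those conversions do (not the kernel's in-place BIN_TO_BCD/BCD_TO_BIN macros), consider:

	#include <stdio.h>

	/* Standalone equivalents of the conversions done around the CMOS
	 * reads and writes above; the kernel macros modify their argument
	 * in place rather than returning a value. */
	static unsigned char bin_to_bcd(unsigned char bin)
	{
		return (unsigned char)(((bin / 10) << 4) | (bin % 10));
	}

	static unsigned char bcd_to_bin(unsigned char bcd)
	{
		return (unsigned char)((bcd >> 4) * 10 + (bcd & 0x0f));
	}

	int main(void)
	{
		printf("59 -> 0x%02x\n", bin_to_bcd(59));  /* prints 0x59 */
		printf("0x23 -> %u\n", bcd_to_bin(0x23));  /* prints 23 */
		return 0;
	}

So a seconds value of 59 is written to CMOS as 0x59, while the 0xff the alarm code stores for out-of-range fields acts as a "don't care" marker.
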
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 5efd5550f4ca..b730d6709529 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1604,7 +1604,7 @@ static int __cpufreq_set_policy(struct cpufreq_policy *data,
1604 memcpy(&policy->cpuinfo, &data->cpuinfo, 1604 memcpy(&policy->cpuinfo, &data->cpuinfo,
1605 sizeof(struct cpufreq_cpuinfo)); 1605 sizeof(struct cpufreq_cpuinfo));
1606 1606
1607 if (policy->min > data->min && policy->min > policy->max) { 1607 if (policy->min > data->max || policy->max < data->min) {
1608 ret = -EINVAL; 1608 ret = -EINVAL;
1609 goto error_out; 1609 goto error_out;
1610 } 1610 }
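
The old condition only rejected a request whose min exceeded both the incumbent min and the request's own max; the replacement rejects any request whose [min, max] interval fails to overlap the currently valid range. A minimal standalone sketch of that overlap test, with made-up frequencies (an illustration, not the driver code):

	#include <stdio.h>

	struct range { unsigned int min, max; };	/* kHz */

	/* Reject a requested range that lies entirely outside the current
	 * one, mirroring the corrected condition above. */
	static int range_is_invalid(struct range req, struct range cur)
	{
		return req.min > cur.max || req.max < cur.min;
	}

	int main(void)
	{
		struct range cur = { 800000, 2400000 };		/* hypothetical limits */
		struct range ok  = { 1000000, 2000000 };
		struct range bad = { 2600000, 2800000 };	/* entirely above cur */

		printf("ok:  %s\n", range_is_invalid(ok, cur) ? "-EINVAL" : "accepted");
		printf("bad: %s\n", range_is_invalid(bad, cur) ? "-EINVAL" : "accepted");
		return 0;
	}
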
diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c
index 5e596a7e3601..9008ed5ef4ce 100644
--- a/drivers/firmware/dmi_scan.c
+++ b/drivers/firmware/dmi_scan.c
@@ -8,6 +8,8 @@
8#include <linux/slab.h> 8#include <linux/slab.h>
9#include <asm/dmi.h> 9#include <asm/dmi.h>
10 10
11static char dmi_empty_string[] = " ";
12
11static char * __init dmi_string(const struct dmi_header *dm, u8 s) 13static char * __init dmi_string(const struct dmi_header *dm, u8 s)
12{ 14{
13 const u8 *bp = ((u8 *) dm) + dm->length; 15 const u8 *bp = ((u8 *) dm) + dm->length;
@@ -21,11 +23,16 @@ static char * __init dmi_string(const struct dmi_header *dm, u8 s)
21 } 23 }
22 24
23 if (*bp != 0) { 25 if (*bp != 0) {
24 str = dmi_alloc(strlen(bp) + 1); 26 size_t len = strlen(bp)+1;
27 size_t cmp_len = len > 8 ? 8 : len;
28
29 if (!memcmp(bp, dmi_empty_string, cmp_len))
30 return dmi_empty_string;
31 str = dmi_alloc(len);
25 if (str != NULL) 32 if (str != NULL)
26 strcpy(str, bp); 33 strcpy(str, bp);
27 else 34 else
28 printk(KERN_ERR "dmi_string: out of memory.\n"); 35 printk(KERN_ERR "dmi_string: cannot allocate %Zu bytes.\n", len);
29 } 36 }
30 } 37 }
31 38
@@ -175,12 +182,23 @@ static void __init dmi_save_devices(const struct dmi_header *dm)
175 } 182 }
176} 183}
177 184
185static struct dmi_device empty_oem_string_dev = {
186 .name = dmi_empty_string,
187};
188
178static void __init dmi_save_oem_strings_devices(const struct dmi_header *dm) 189static void __init dmi_save_oem_strings_devices(const struct dmi_header *dm)
179{ 190{
180 int i, count = *(u8 *)(dm + 1); 191 int i, count = *(u8 *)(dm + 1);
181 struct dmi_device *dev; 192 struct dmi_device *dev;
182 193
183 for (i = 1; i <= count; i++) { 194 for (i = 1; i <= count; i++) {
195 char *devname = dmi_string(dm, i);
196
197 if (!strcmp(devname, dmi_empty_string)) {
198 list_add(&empty_oem_string_dev.list, &dmi_devices);
199 continue;
200 }
201
184 dev = dmi_alloc(sizeof(*dev)); 202 dev = dmi_alloc(sizeof(*dev));
185 if (!dev) { 203 if (!dev) {
186 printk(KERN_ERR 204 printk(KERN_ERR
@@ -189,7 +207,7 @@ static void __init dmi_save_oem_strings_devices(const struct dmi_header *dm)
189 } 207 }
190 208
191 dev->type = DMI_DEV_TYPE_OEM_STRING; 209 dev->type = DMI_DEV_TYPE_OEM_STRING;
192 dev->name = dmi_string(dm, i); 210 dev->name = devname;
193 dev->device_data = NULL; 211 dev->device_data = NULL;
194 212
195 list_add(&dev->list, &dmi_devices); 213 list_add(&dev->list, &dmi_devices);
@@ -331,9 +349,11 @@ void __init dmi_scan_machine(void)
331 rc = dmi_present(q); 349 rc = dmi_present(q);
332 if (!rc) { 350 if (!rc) {
333 dmi_available = 1; 351 dmi_available = 1;
352 dmi_iounmap(p, 0x10000);
334 return; 353 return;
335 } 354 }
336 } 355 }
356 dmi_iounmap(p, 0x10000);
337 } 357 }
338 out: printk(KERN_INFO "DMI not present or invalid.\n"); 358 out: printk(KERN_INFO "DMI not present or invalid.\n");
339} 359}
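
Some firmware pads unused DMI strings with runs of spaces, and allocating a private copy for each one wastes the limited dmi_alloc() pool; the hunks above avoid that by handing back a single shared dmi_empty_string (and one static OEM-string device) for strings that begin with at least eight spaces. A standalone sketch of the same interning idea, independent of dmi_alloc():

	#include <stdio.h>
	#include <string.h>

	static char empty_string[] = "        ";	/* eight spaces */

	/* Return the shared buffer for blank-looking strings, the input
	 * otherwise (a real implementation would copy the input). */
	static const char *intern_blank(const char *s)
	{
		size_t len = strlen(s) + 1;
		size_t cmp_len = len > 8 ? 8 : len;

		if (!memcmp(s, empty_string, cmp_len))
			return empty_string;
		return s;
	}

	int main(void)
	{
		printf("%d\n", intern_blank("        ") == empty_string);	/* 1 */
		printf("%d\n", intern_blank("Dell Inc.") == empty_string);	/* 0 */
		return 0;
	}

The dmi_scan_machine() hunk additionally releases the 64KiB mapping used to probe for the DMI table on both the success and the failure path.
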
diff --git a/drivers/ieee1394/Makefile b/drivers/ieee1394/Makefile
index 489c133664d5..1f8153b57503 100644
--- a/drivers/ieee1394/Makefile
+++ b/drivers/ieee1394/Makefile
@@ -15,3 +15,4 @@ obj-$(CONFIG_IEEE1394_SBP2) += sbp2.o
15obj-$(CONFIG_IEEE1394_DV1394) += dv1394.o 15obj-$(CONFIG_IEEE1394_DV1394) += dv1394.o
16obj-$(CONFIG_IEEE1394_ETH1394) += eth1394.o 16obj-$(CONFIG_IEEE1394_ETH1394) += eth1394.o
17 17
18obj-$(CONFIG_PROVIDE_OHCI1394_DMA_INIT) += init_ohci1394_dma.o
diff --git a/drivers/ieee1394/init_ohci1394_dma.c b/drivers/ieee1394/init_ohci1394_dma.c
new file mode 100644
index 000000000000..ddaab6eb8ace
--- /dev/null
+++ b/drivers/ieee1394/init_ohci1394_dma.c
@@ -0,0 +1,285 @@
1/*
2 * init_ohci1394_dma.c - Initializes physical DMA on all OHCI 1394 controllers
3 *
4 * Copyright (C) 2006-2007 Bernhard Kaindl <bk@suse.de>
5 *
6 * Derived from drivers/ieee1394/ohci1394.c and arch/x86/kernel/early-quirks.c
7 * this file has functions to:
 8 * - scan the PCI bus very early on boot for all OHCI 1394-compliant controllers
9 * - reset and initialize them and make them join the IEEE1394 bus and
10 * - enable physical DMA on them to allow remote debugging
11 *
 12 * All code and data is marked as __init and __initdata, respectively,
 13 * as during boot all OHCI1394 controllers may be claimed by the firewire
 14 * stack, and at that point this code should not touch them anymore.
15 *
16 * To use physical DMA after the initialization of the firewire stack,
17 * be sure that the stack enables it and (re-)attach after the bus reset
18 * which may be caused by the firewire stack initialization.
19 *
20 * This program is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation; either version 2 of the License, or
23 * (at your option) any later version.
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * You should have received a copy of the GNU General Public License
31 * along with this program; if not, write to the Free Software Foundation,
32 * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
33 */
34
35#include <linux/interrupt.h> /* for ohci1394.h */
36#include <linux/delay.h>
37#include <linux/pci.h> /* for PCI defines */
38#include <linux/init_ohci1394_dma.h>
39#include <asm/pci-direct.h> /* for direct PCI config space access */
40#include <asm/fixmap.h>
41
42#include "ieee1394_types.h"
43#include "ohci1394.h"
44
45int __initdata init_ohci1394_dma_early;
46
47/* Reads a PHY register of an OHCI-1394 controller */
48static inline u8 __init get_phy_reg(struct ti_ohci *ohci, u8 addr)
49{
50 int i;
51 quadlet_t r;
52
53 reg_write(ohci, OHCI1394_PhyControl, (addr << 8) | 0x00008000);
54
55 for (i = 0; i < OHCI_LOOP_COUNT; i++) {
56 if (reg_read(ohci, OHCI1394_PhyControl) & 0x80000000)
57 break;
58 mdelay(1);
59 }
60 r = reg_read(ohci, OHCI1394_PhyControl);
61
62 return (r & 0x00ff0000) >> 16;
63}
64
65/* Writes to a PHY register of an OHCI-1394 controller */
66static inline void __init set_phy_reg(struct ti_ohci *ohci, u8 addr, u8 data)
67{
68 int i;
69
70 reg_write(ohci, OHCI1394_PhyControl, (addr << 8) | data | 0x00004000);
71
72 for (i = 0; i < OHCI_LOOP_COUNT; i++) {
73 u32 r = reg_read(ohci, OHCI1394_PhyControl);
74 if (!(r & 0x00004000))
75 break;
76 mdelay(1);
77 }
78}
79
80/* Resets an OHCI-1394 controller (for sane state before initialization) */
81static inline void __init init_ohci1394_soft_reset(struct ti_ohci *ohci) {
82 int i;
83
84 reg_write(ohci, OHCI1394_HCControlSet, OHCI1394_HCControl_softReset);
85
86 for (i = 0; i < OHCI_LOOP_COUNT; i++) {
87 if (!(reg_read(ohci, OHCI1394_HCControlSet)
88 & OHCI1394_HCControl_softReset))
89 break;
90 mdelay(1);
91 }
92}
93
 94/* Basic OHCI-1394 register and port initialization */
95static inline void __init init_ohci1394_initialize(struct ti_ohci *ohci)
96{
97 quadlet_t bus_options;
98 int num_ports, i;
99
100 /* Put some defaults to these undefined bus options */
101 bus_options = reg_read(ohci, OHCI1394_BusOptions);
102 bus_options |= 0x60000000; /* Enable CMC and ISC */
103 bus_options &= ~0x00ff0000; /* XXX: Set cyc_clk_acc to zero for now */
104 bus_options &= ~0x18000000; /* Disable PMC and BMC */
105 reg_write(ohci, OHCI1394_BusOptions, bus_options);
106
107 /* Set the bus number */
108 reg_write(ohci, OHCI1394_NodeID, 0x0000ffc0);
109
110 /* Enable posted writes */
111 reg_write(ohci, OHCI1394_HCControlSet,
112 OHCI1394_HCControl_postedWriteEnable);
113
114 /* Clear link control register */
115 reg_write(ohci, OHCI1394_LinkControlClear, 0xffffffff);
116
117 /* enable phys */
118 reg_write(ohci, OHCI1394_LinkControlSet,
119 OHCI1394_LinkControl_RcvPhyPkt);
120
121 /* Don't accept phy packets into AR request context */
122 reg_write(ohci, OHCI1394_LinkControlClear, 0x00000400);
123
 124	/* Clear the isochronous interrupt masks */
125 reg_write(ohci, OHCI1394_IsoRecvIntMaskClear, 0xffffffff);
126 reg_write(ohci, OHCI1394_IsoRecvIntEventClear, 0xffffffff);
127 reg_write(ohci, OHCI1394_IsoXmitIntMaskClear, 0xffffffff);
128 reg_write(ohci, OHCI1394_IsoXmitIntEventClear, 0xffffffff);
129
 130	/* Accept asynchronous transfer requests from all nodes for now */
 131	reg_write(ohci, OHCI1394_AsReqFilterHiSet, 0x80000000);
132
 133	/* Specify asynchronous transfer retries */
134 reg_write(ohci, OHCI1394_ATRetries,
135 OHCI1394_MAX_AT_REQ_RETRIES |
136 (OHCI1394_MAX_AT_RESP_RETRIES<<4) |
137 (OHCI1394_MAX_PHYS_RESP_RETRIES<<8));
138
139 /* We don't want hardware swapping */
140 reg_write(ohci, OHCI1394_HCControlClear, OHCI1394_HCControl_noByteSwap);
141
142 /* Enable link */
143 reg_write(ohci, OHCI1394_HCControlSet, OHCI1394_HCControl_linkEnable);
144
145 /* If anything is connected to a port, make sure it is enabled */
146 num_ports = get_phy_reg(ohci, 2) & 0xf;
147 for (i = 0; i < num_ports; i++) {
148 unsigned int status;
149
150 set_phy_reg(ohci, 7, i);
151 status = get_phy_reg(ohci, 8);
152
153 if (status & 0x20)
154 set_phy_reg(ohci, 8, status & ~1);
155 }
156}
157
158/**
159 * init_ohci1394_wait_for_busresets - wait until bus resets are completed
160 *
161 * OHCI1394 initialization itself and any device going on- or offline
 162 * and any cable issue cause an IEEE1394 bus reset. The OHCI1394 spec
163 * specifies that physical DMA is disabled on each bus reset and it
164 * has to be enabled after each bus reset when needed. We resort
165 * to polling here because on early boot, we have no interrupts.
166 */
167static inline void __init init_ohci1394_wait_for_busresets(struct ti_ohci *ohci)
168{
169 int i, events;
170
 171	for (i = 0; i < 9; i++) {
172 mdelay(200);
173 events = reg_read(ohci, OHCI1394_IntEventSet);
174 if (events & OHCI1394_busReset)
175 reg_write(ohci, OHCI1394_IntEventClear,
176 OHCI1394_busReset);
177 }
178}
179
180/**
181 * init_ohci1394_enable_physical_dma - Enable physical DMA for remote debugging
182 * This enables remote DMA access over IEEE1394 from every host for the low
183 * 4GB of address space. DMA accesses above 4GB are not available currently.
184 */
185static inline void __init init_ohci1394_enable_physical_dma(struct ti_ohci *hci)
186{
187 reg_write(hci, OHCI1394_PhyReqFilterHiSet, 0xffffffff);
188 reg_write(hci, OHCI1394_PhyReqFilterLoSet, 0xffffffff);
189 reg_write(hci, OHCI1394_PhyUpperBound, 0xffff0000);
190}
191
192/**
193 * init_ohci1394_reset_and_init_dma - init controller and enable DMA
194 * This initializes the given controller and enables physical DMA engine in it.
195 */
196static inline void __init init_ohci1394_reset_and_init_dma(struct ti_ohci *ohci)
197{
198 /* Start off with a soft reset, clears everything to a sane state. */
199 init_ohci1394_soft_reset(ohci);
200
201 /* Accessing some registers without LPS enabled may cause lock up */
202 reg_write(ohci, OHCI1394_HCControlSet, OHCI1394_HCControl_LPS);
203
204 /* Disable and clear interrupts */
205 reg_write(ohci, OHCI1394_IntEventClear, 0xffffffff);
206 reg_write(ohci, OHCI1394_IntMaskClear, 0xffffffff);
207
208 mdelay(50); /* Wait 50msec to make sure we have full link enabled */
209
210 init_ohci1394_initialize(ohci);
211 /*
212 * The initialization causes at least one IEEE1394 bus reset. Enabling
213 * physical DMA only works *after* *all* bus resets have calmed down:
214 */
215 init_ohci1394_wait_for_busresets(ohci);
216
217 /* We had to wait and do this now if we want to debug early problems */
218 init_ohci1394_enable_physical_dma(ohci);
219}
220
221/**
222 * init_ohci1394_controller - Map the registers of the controller and init DMA
223 * This maps the registers of the specified controller and initializes it
224 */
225static inline void __init init_ohci1394_controller(int num, int slot, int func)
226{
227 unsigned long ohci_base;
228 struct ti_ohci ohci;
229
230 printk(KERN_INFO "init_ohci1394_dma: initializing OHCI-1394"
231 " at %02x:%02x.%x\n", num, slot, func);
232
233 ohci_base = read_pci_config(num, slot, func, PCI_BASE_ADDRESS_0+(0<<2))
234 & PCI_BASE_ADDRESS_MEM_MASK;
235
236 set_fixmap_nocache(FIX_OHCI1394_BASE, ohci_base);
237
238 ohci.registers = (void *)fix_to_virt(FIX_OHCI1394_BASE);
239
240 init_ohci1394_reset_and_init_dma(&ohci);
241}
242
243/**
 244 * init_ohci1394_dma_on_all_controllers - scan for OHCI1394 controllers and init DMA
245 * Scans the whole PCI space for OHCI1394 controllers and inits DMA on them
246 */
247void __init init_ohci1394_dma_on_all_controllers(void)
248{
249 int num, slot, func;
250
251 if (!early_pci_allowed())
252 return;
253
254 /* Poor man's PCI discovery, the only thing we can do at early boot */
255 for (num = 0; num < 32; num++) {
256 for (slot = 0; slot < 32; slot++) {
257 for (func = 0; func < 8; func++) {
 258				u32 class = read_pci_config(num, slot, func,
259 PCI_CLASS_REVISION);
 260				if (class == 0xffffffff)
261 continue; /* No device at this func */
262
263 if (class>>8 != PCI_CLASS_SERIAL_FIREWIRE_OHCI)
264 continue; /* Not an OHCI-1394 device */
265
266 init_ohci1394_controller(num, slot, func);
267 break; /* Assume one controller per device */
268 }
269 }
270 }
271 printk(KERN_INFO "init_ohci1394_dma: finished initializing OHCI DMA\n");
272}
273
274/**
 275 * setup_ohci1394_dma - enables early OHCI1394 DMA initialization
276 */
277static int __init setup_ohci1394_dma(char *opt)
278{
279 if (!strcmp(opt, "early"))
280 init_ohci1394_dma_early = 1;
281 return 0;
282}
283
284/* passing ohci1394_dma=early on boot causes early OHCI1394 DMA initialization */
285early_param("ohci1394_dma", setup_ohci1394_dma);
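
This file is only built when CONFIG_PROVIDE_OHCI1394_DMA_INIT is selected (see the Makefile hunk above), and the init_ohci1394_dma_early flag set by ohci1394_dma=early is meant to be tested by early platform setup code. A minimal sketch of such a caller, assuming only the declarations this series provides via <linux/init_ohci1394_dma.h>; the actual arch-specific call site is not part of this hunk:

	#include <linux/init.h>
	#include <linux/init_ohci1394_dma.h>

	/* Hypothetical helper: run the early OHCI-1394 setup only when the
	 * user asked for it with ohci1394_dma=early on the command line. */
	static void __init maybe_init_ohci1394_dma(void)
	{
	#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
		if (init_ohci1394_dma_early)
			init_ohci1394_dma_on_all_controllers();
	#endif
	}
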
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index f2d2c7e2c76b..195ce7c12319 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -1571,7 +1571,6 @@ static struct scsi_host_template srp_template = {
1571 .this_id = -1, 1571 .this_id = -1,
1572 .cmd_per_lun = SRP_SQ_SIZE, 1572 .cmd_per_lun = SRP_SQ_SIZE,
1573 .use_clustering = ENABLE_CLUSTERING, 1573 .use_clustering = ENABLE_CLUSTERING,
1574 .use_sg_chaining = ENABLE_SG_CHAINING,
1575 .shost_attrs = srp_host_attrs 1574 .shost_attrs = srp_host_attrs
1576}; 1575};
1577 1576
diff --git a/drivers/input/mouse/pc110pad.c b/drivers/input/mouse/pc110pad.c
index 8991ab0b4fe3..61cff8374e6c 100644
--- a/drivers/input/mouse/pc110pad.c
+++ b/drivers/input/mouse/pc110pad.c
@@ -39,6 +39,7 @@
39#include <linux/init.h> 39#include <linux/init.h>
40#include <linux/interrupt.h> 40#include <linux/interrupt.h>
41#include <linux/pci.h> 41#include <linux/pci.h>
42#include <linux/delay.h>
42 43
43#include <asm/io.h> 44#include <asm/io.h>
44#include <asm/irq.h> 45#include <asm/irq.h>
@@ -62,8 +63,10 @@ static irqreturn_t pc110pad_interrupt(int irq, void *ptr)
62 int value = inb_p(pc110pad_io); 63 int value = inb_p(pc110pad_io);
63 int handshake = inb_p(pc110pad_io + 2); 64 int handshake = inb_p(pc110pad_io + 2);
64 65
65 outb_p(handshake | 1, pc110pad_io + 2); 66 outb(handshake | 1, pc110pad_io + 2);
66 outb_p(handshake & ~1, pc110pad_io + 2); 67 udelay(2);
68 outb(handshake & ~1, pc110pad_io + 2);
69 udelay(2);
67 inb_p(0x64); 70 inb_p(0x64);
68 71
69 pc110pad_data[pc110pad_count++] = value; 72 pc110pad_data[pc110pad_count++] = value;
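
The handshake toggling above used outb_p(), which pads each access with a dummy I/O cycle for timing; the replacement issues a plain outb() followed by an explicit udelay(2), so the settling time is bounded and does not depend on how the paused variant is implemented. A sketch of the resulting pattern (the wrapper name is hypothetical, not part of the driver):

	#include <linux/delay.h>
	#include <asm/io.h>

	/* Hypothetical wrapper: an explicit 2us delay after each port write
	 * instead of relying on outb_p()'s dummy I/O cycle. */
	static inline void outb_paced(unsigned char value, unsigned long port)
	{
		outb(value, port);
		udelay(2);
	}
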
diff --git a/drivers/kvm/irq.h b/drivers/kvm/irq.h
deleted file mode 100644
index 11fc014e2b30..000000000000
--- a/drivers/kvm/irq.h
+++ /dev/null
@@ -1,165 +0,0 @@
1/*
2 * irq.h: in kernel interrupt controller related definitions
3 * Copyright (c) 2007, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Authors:
18 * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
19 *
20 */
21
22#ifndef __IRQ_H
23#define __IRQ_H
24
25#include "kvm.h"
26
27typedef void irq_request_func(void *opaque, int level);
28
29struct kvm_kpic_state {
30 u8 last_irr; /* edge detection */
31 u8 irr; /* interrupt request register */
32 u8 imr; /* interrupt mask register */
33 u8 isr; /* interrupt service register */
34 u8 priority_add; /* highest irq priority */
35 u8 irq_base;
36 u8 read_reg_select;
37 u8 poll;
38 u8 special_mask;
39 u8 init_state;
40 u8 auto_eoi;
41 u8 rotate_on_auto_eoi;
42 u8 special_fully_nested_mode;
43 u8 init4; /* true if 4 byte init */
44 u8 elcr; /* PIIX edge/trigger selection */
45 u8 elcr_mask;
46 struct kvm_pic *pics_state;
47};
48
49struct kvm_pic {
50 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
51 irq_request_func *irq_request;
52 void *irq_request_opaque;
53 int output; /* intr from master PIC */
54 struct kvm_io_device dev;
55};
56
57struct kvm_pic *kvm_create_pic(struct kvm *kvm);
58void kvm_pic_set_irq(void *opaque, int irq, int level);
59int kvm_pic_read_irq(struct kvm_pic *s);
60int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
61int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
62void kvm_pic_update_irq(struct kvm_pic *s);
63
64#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS
65#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
66#define IOAPIC_EDGE_TRIG 0
67#define IOAPIC_LEVEL_TRIG 1
68
69#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000
70#define IOAPIC_MEM_LENGTH 0x100
71
72/* Direct registers. */
73#define IOAPIC_REG_SELECT 0x00
74#define IOAPIC_REG_WINDOW 0x10
75#define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */
76
77/* Indirect registers. */
78#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */
79#define IOAPIC_REG_VERSION 0x01
80#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */
81
82struct kvm_ioapic {
83 u64 base_address;
84 u32 ioregsel;
85 u32 id;
86 u32 irr;
87 u32 pad;
88 union ioapic_redir_entry {
89 u64 bits;
90 struct {
91 u8 vector;
92 u8 delivery_mode:3;
93 u8 dest_mode:1;
94 u8 delivery_status:1;
95 u8 polarity:1;
96 u8 remote_irr:1;
97 u8 trig_mode:1;
98 u8 mask:1;
99 u8 reserve:7;
100 u8 reserved[4];
101 u8 dest_id;
102 } fields;
103 } redirtbl[IOAPIC_NUM_PINS];
104 struct kvm_io_device dev;
105 struct kvm *kvm;
106};
107
108struct kvm_lapic {
109 unsigned long base_address;
110 struct kvm_io_device dev;
111 struct {
112 atomic_t pending;
113 s64 period; /* unit: ns */
114 u32 divide_count;
115 ktime_t last_update;
116 struct hrtimer dev;
117 } timer;
118 struct kvm_vcpu *vcpu;
119 struct page *regs_page;
120 void *regs;
121};
122
123#ifdef DEBUG
124#define ASSERT(x) \
125do { \
126 if (!(x)) { \
127 printk(KERN_EMERG "assertion failed %s: %d: %s\n", \
128 __FILE__, __LINE__, #x); \
129 BUG(); \
130 } \
131} while (0)
132#else
133#define ASSERT(x) do { } while (0)
134#endif
135
136void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
137int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
138int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
139int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
140int kvm_create_lapic(struct kvm_vcpu *vcpu);
141void kvm_lapic_reset(struct kvm_vcpu *vcpu);
142void kvm_free_apic(struct kvm_lapic *apic);
143u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
144void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
145void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
146struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
147 unsigned long bitmap);
148u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
149void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
150int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
151void kvm_ioapic_update_eoi(struct kvm *kvm, int vector);
152int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
153int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig);
154void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
155int kvm_ioapic_init(struct kvm *kvm);
156void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
157int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
158int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
159void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
160void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
161void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
162void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
163void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
164
165#endif
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c
deleted file mode 100644
index feb5ac986c5d..000000000000
--- a/drivers/kvm/mmu.c
+++ /dev/null
@@ -1,1498 +0,0 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * MMU support
8 *
9 * Copyright (C) 2006 Qumranet, Inc.
10 *
11 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com>
13 * Avi Kivity <avi@qumranet.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
17 *
18 */
19
20#include "vmx.h"
21#include "kvm.h"
22
23#include <linux/types.h>
24#include <linux/string.h>
25#include <linux/mm.h>
26#include <linux/highmem.h>
27#include <linux/module.h>
28
29#include <asm/page.h>
30#include <asm/cmpxchg.h>
31
32#undef MMU_DEBUG
33
34#undef AUDIT
35
36#ifdef AUDIT
37static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
38#else
39static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
40#endif
41
42#ifdef MMU_DEBUG
43
44#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
45#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
46
47#else
48
49#define pgprintk(x...) do { } while (0)
50#define rmap_printk(x...) do { } while (0)
51
52#endif
53
54#if defined(MMU_DEBUG) || defined(AUDIT)
55static int dbg = 1;
56#endif
57
58#ifndef MMU_DEBUG
59#define ASSERT(x) do { } while (0)
60#else
61#define ASSERT(x) \
62 if (!(x)) { \
63 printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
64 __FILE__, __LINE__, #x); \
65 }
66#endif
67
68#define PT64_PT_BITS 9
69#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
70#define PT32_PT_BITS 10
71#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
72
73#define PT_WRITABLE_SHIFT 1
74
75#define PT_PRESENT_MASK (1ULL << 0)
76#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
77#define PT_USER_MASK (1ULL << 2)
78#define PT_PWT_MASK (1ULL << 3)
79#define PT_PCD_MASK (1ULL << 4)
80#define PT_ACCESSED_MASK (1ULL << 5)
81#define PT_DIRTY_MASK (1ULL << 6)
82#define PT_PAGE_SIZE_MASK (1ULL << 7)
83#define PT_PAT_MASK (1ULL << 7)
84#define PT_GLOBAL_MASK (1ULL << 8)
85#define PT64_NX_MASK (1ULL << 63)
86
87#define PT_PAT_SHIFT 7
88#define PT_DIR_PAT_SHIFT 12
89#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
90
91#define PT32_DIR_PSE36_SIZE 4
92#define PT32_DIR_PSE36_SHIFT 13
93#define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
94
95
96#define PT_FIRST_AVAIL_BITS_SHIFT 9
97#define PT64_SECOND_AVAIL_BITS_SHIFT 52
98
99#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
100
101#define VALID_PAGE(x) ((x) != INVALID_PAGE)
102
103#define PT64_LEVEL_BITS 9
104
105#define PT64_LEVEL_SHIFT(level) \
106 ( PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS )
107
108#define PT64_LEVEL_MASK(level) \
109 (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
110
111#define PT64_INDEX(address, level)\
112 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
113
114
115#define PT32_LEVEL_BITS 10
116
117#define PT32_LEVEL_SHIFT(level) \
118 ( PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS )
119
120#define PT32_LEVEL_MASK(level) \
121 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
122
123#define PT32_INDEX(address, level)\
124 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
125
126
127#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
128#define PT64_DIR_BASE_ADDR_MASK \
129 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
130
131#define PT32_BASE_ADDR_MASK PAGE_MASK
132#define PT32_DIR_BASE_ADDR_MASK \
133 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
134
135
136#define PFERR_PRESENT_MASK (1U << 0)
137#define PFERR_WRITE_MASK (1U << 1)
138#define PFERR_USER_MASK (1U << 2)
139#define PFERR_FETCH_MASK (1U << 4)
140
141#define PT64_ROOT_LEVEL 4
142#define PT32_ROOT_LEVEL 2
143#define PT32E_ROOT_LEVEL 3
144
145#define PT_DIRECTORY_LEVEL 2
146#define PT_PAGE_TABLE_LEVEL 1
147
148#define RMAP_EXT 4
149
150struct kvm_rmap_desc {
151 u64 *shadow_ptes[RMAP_EXT];
152 struct kvm_rmap_desc *more;
153};
154
155static struct kmem_cache *pte_chain_cache;
156static struct kmem_cache *rmap_desc_cache;
157static struct kmem_cache *mmu_page_header_cache;
158
159static int is_write_protection(struct kvm_vcpu *vcpu)
160{
161 return vcpu->cr0 & X86_CR0_WP;
162}
163
164static int is_cpuid_PSE36(void)
165{
166 return 1;
167}
168
169static int is_nx(struct kvm_vcpu *vcpu)
170{
171 return vcpu->shadow_efer & EFER_NX;
172}
173
174static int is_present_pte(unsigned long pte)
175{
176 return pte & PT_PRESENT_MASK;
177}
178
179static int is_writeble_pte(unsigned long pte)
180{
181 return pte & PT_WRITABLE_MASK;
182}
183
184static int is_io_pte(unsigned long pte)
185{
186 return pte & PT_SHADOW_IO_MARK;
187}
188
189static int is_rmap_pte(u64 pte)
190{
191 return (pte & (PT_WRITABLE_MASK | PT_PRESENT_MASK))
192 == (PT_WRITABLE_MASK | PT_PRESENT_MASK);
193}
194
195static void set_shadow_pte(u64 *sptep, u64 spte)
196{
197#ifdef CONFIG_X86_64
198 set_64bit((unsigned long *)sptep, spte);
199#else
200 set_64bit((unsigned long long *)sptep, spte);
201#endif
202}
203
204static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
205 struct kmem_cache *base_cache, int min)
206{
207 void *obj;
208
209 if (cache->nobjs >= min)
210 return 0;
211 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
212 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
213 if (!obj)
214 return -ENOMEM;
215 cache->objects[cache->nobjs++] = obj;
216 }
217 return 0;
218}
219
220static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
221{
222 while (mc->nobjs)
223 kfree(mc->objects[--mc->nobjs]);
224}
225
226static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
227 int min)
228{
229 struct page *page;
230
231 if (cache->nobjs >= min)
232 return 0;
233 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
234 page = alloc_page(GFP_KERNEL);
235 if (!page)
236 return -ENOMEM;
237 set_page_private(page, 0);
238 cache->objects[cache->nobjs++] = page_address(page);
239 }
240 return 0;
241}
242
243static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
244{
245 while (mc->nobjs)
246 free_page((unsigned long)mc->objects[--mc->nobjs]);
247}
248
249static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
250{
251 int r;
252
253 kvm_mmu_free_some_pages(vcpu);
254 r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache,
255 pte_chain_cache, 4);
256 if (r)
257 goto out;
258 r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache,
259 rmap_desc_cache, 1);
260 if (r)
261 goto out;
262 r = mmu_topup_memory_cache_page(&vcpu->mmu_page_cache, 4);
263 if (r)
264 goto out;
265 r = mmu_topup_memory_cache(&vcpu->mmu_page_header_cache,
266 mmu_page_header_cache, 4);
267out:
268 return r;
269}
270
271static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
272{
273 mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache);
274 mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache);
275 mmu_free_memory_cache_page(&vcpu->mmu_page_cache);
276 mmu_free_memory_cache(&vcpu->mmu_page_header_cache);
277}
278
279static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
280 size_t size)
281{
282 void *p;
283
284 BUG_ON(!mc->nobjs);
285 p = mc->objects[--mc->nobjs];
286 memset(p, 0, size);
287 return p;
288}
289
290static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
291{
292 return mmu_memory_cache_alloc(&vcpu->mmu_pte_chain_cache,
293 sizeof(struct kvm_pte_chain));
294}
295
296static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
297{
298 kfree(pc);
299}
300
301static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
302{
303 return mmu_memory_cache_alloc(&vcpu->mmu_rmap_desc_cache,
304 sizeof(struct kvm_rmap_desc));
305}
306
307static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
308{
309 kfree(rd);
310}
311
312/*
313 * Reverse mapping data structures:
314 *
315 * If page->private bit zero is zero, then page->private points to the
316 * shadow page table entry that points to page_address(page).
317 *
318 * If page->private bit zero is one, (then page->private & ~1) points
319 * to a struct kvm_rmap_desc containing more mappings.
320 */
321static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte)
322{
323 struct page *page;
324 struct kvm_rmap_desc *desc;
325 int i;
326
327 if (!is_rmap_pte(*spte))
328 return;
329 page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
330 if (!page_private(page)) {
331 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
332 set_page_private(page,(unsigned long)spte);
333 } else if (!(page_private(page) & 1)) {
334 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
335 desc = mmu_alloc_rmap_desc(vcpu);
336 desc->shadow_ptes[0] = (u64 *)page_private(page);
337 desc->shadow_ptes[1] = spte;
338 set_page_private(page,(unsigned long)desc | 1);
339 } else {
340 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
341 desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
342 while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
343 desc = desc->more;
344 if (desc->shadow_ptes[RMAP_EXT-1]) {
345 desc->more = mmu_alloc_rmap_desc(vcpu);
346 desc = desc->more;
347 }
348 for (i = 0; desc->shadow_ptes[i]; ++i)
349 ;
350 desc->shadow_ptes[i] = spte;
351 }
352}
353
354static void rmap_desc_remove_entry(struct page *page,
355 struct kvm_rmap_desc *desc,
356 int i,
357 struct kvm_rmap_desc *prev_desc)
358{
359 int j;
360
361 for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
362 ;
363 desc->shadow_ptes[i] = desc->shadow_ptes[j];
364 desc->shadow_ptes[j] = NULL;
365 if (j != 0)
366 return;
367 if (!prev_desc && !desc->more)
368 set_page_private(page,(unsigned long)desc->shadow_ptes[0]);
369 else
370 if (prev_desc)
371 prev_desc->more = desc->more;
372 else
373 set_page_private(page,(unsigned long)desc->more | 1);
374 mmu_free_rmap_desc(desc);
375}
376
377static void rmap_remove(u64 *spte)
378{
379 struct page *page;
380 struct kvm_rmap_desc *desc;
381 struct kvm_rmap_desc *prev_desc;
382 int i;
383
384 if (!is_rmap_pte(*spte))
385 return;
386 page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
387 if (!page_private(page)) {
388 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
389 BUG();
390 } else if (!(page_private(page) & 1)) {
391 rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
392 if ((u64 *)page_private(page) != spte) {
393 printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
394 spte, *spte);
395 BUG();
396 }
397 set_page_private(page,0);
398 } else {
399 rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
400 desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
401 prev_desc = NULL;
402 while (desc) {
403 for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
404 if (desc->shadow_ptes[i] == spte) {
405 rmap_desc_remove_entry(page,
406 desc, i,
407 prev_desc);
408 return;
409 }
410 prev_desc = desc;
411 desc = desc->more;
412 }
413 BUG();
414 }
415}
416
417static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
418{
419 struct kvm *kvm = vcpu->kvm;
420 struct page *page;
421 struct kvm_rmap_desc *desc;
422 u64 *spte;
423
424 page = gfn_to_page(kvm, gfn);
425 BUG_ON(!page);
426
427 while (page_private(page)) {
428 if (!(page_private(page) & 1))
429 spte = (u64 *)page_private(page);
430 else {
431 desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
432 spte = desc->shadow_ptes[0];
433 }
434 BUG_ON(!spte);
435 BUG_ON((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT
436 != page_to_pfn(page));
437 BUG_ON(!(*spte & PT_PRESENT_MASK));
438 BUG_ON(!(*spte & PT_WRITABLE_MASK));
439 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
440 rmap_remove(spte);
441 set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
442 kvm_flush_remote_tlbs(vcpu->kvm);
443 }
444}
445
446#ifdef MMU_DEBUG
447static int is_empty_shadow_page(u64 *spt)
448{
449 u64 *pos;
450 u64 *end;
451
452 for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
453 if (*pos != 0) {
454 printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
455 pos, *pos);
456 return 0;
457 }
458 return 1;
459}
460#endif
461
462static void kvm_mmu_free_page(struct kvm *kvm,
463 struct kvm_mmu_page *page_head)
464{
465 ASSERT(is_empty_shadow_page(page_head->spt));
466 list_del(&page_head->link);
467 __free_page(virt_to_page(page_head->spt));
468 kfree(page_head);
469 ++kvm->n_free_mmu_pages;
470}
471
472static unsigned kvm_page_table_hashfn(gfn_t gfn)
473{
474 return gfn;
475}
476
477static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
478 u64 *parent_pte)
479{
480 struct kvm_mmu_page *page;
481
482 if (!vcpu->kvm->n_free_mmu_pages)
483 return NULL;
484
485 page = mmu_memory_cache_alloc(&vcpu->mmu_page_header_cache,
486 sizeof *page);
487 page->spt = mmu_memory_cache_alloc(&vcpu->mmu_page_cache, PAGE_SIZE);
488 set_page_private(virt_to_page(page->spt), (unsigned long)page);
489 list_add(&page->link, &vcpu->kvm->active_mmu_pages);
490 ASSERT(is_empty_shadow_page(page->spt));
491 page->slot_bitmap = 0;
492 page->multimapped = 0;
493 page->parent_pte = parent_pte;
494 --vcpu->kvm->n_free_mmu_pages;
495 return page;
496}
497
498static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
499 struct kvm_mmu_page *page, u64 *parent_pte)
500{
501 struct kvm_pte_chain *pte_chain;
502 struct hlist_node *node;
503 int i;
504
505 if (!parent_pte)
506 return;
507 if (!page->multimapped) {
508 u64 *old = page->parent_pte;
509
510 if (!old) {
511 page->parent_pte = parent_pte;
512 return;
513 }
514 page->multimapped = 1;
515 pte_chain = mmu_alloc_pte_chain(vcpu);
516 INIT_HLIST_HEAD(&page->parent_ptes);
517 hlist_add_head(&pte_chain->link, &page->parent_ptes);
518 pte_chain->parent_ptes[0] = old;
519 }
520 hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) {
521 if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
522 continue;
523 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
524 if (!pte_chain->parent_ptes[i]) {
525 pte_chain->parent_ptes[i] = parent_pte;
526 return;
527 }
528 }
529 pte_chain = mmu_alloc_pte_chain(vcpu);
530 BUG_ON(!pte_chain);
531 hlist_add_head(&pte_chain->link, &page->parent_ptes);
532 pte_chain->parent_ptes[0] = parent_pte;
533}
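/*
 * Parent tracking follows the same pattern as the rmap above: a shadow
 * page starts out with a single page->parent_pte and only grows an hlist
 * of kvm_pte_chain blocks (NR_PTE_CHAIN_ENTRIES slots each) once a second
 * parent shows up, at which point multimapped is set and the old single
 * pointer becomes parent_ptes[0] of the first chain block.  In short:
 *
 *	no parent:    page->parent_pte == NULL,  multimapped == 0
 *	one parent:   page->parent_pte == spte,  multimapped == 0
 *	two or more:  page->parent_ptes is an hlist of chain blocks,
 *	              multimapped == 1
 */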
534
535static void mmu_page_remove_parent_pte(struct kvm_mmu_page *page,
536 u64 *parent_pte)
537{
538 struct kvm_pte_chain *pte_chain;
539 struct hlist_node *node;
540 int i;
541
542 if (!page->multimapped) {
543 BUG_ON(page->parent_pte != parent_pte);
544 page->parent_pte = NULL;
545 return;
546 }
547 hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link)
548 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
549 if (!pte_chain->parent_ptes[i])
550 break;
551 if (pte_chain->parent_ptes[i] != parent_pte)
552 continue;
553 while (i + 1 < NR_PTE_CHAIN_ENTRIES
554 && pte_chain->parent_ptes[i + 1]) {
555 pte_chain->parent_ptes[i]
556 = pte_chain->parent_ptes[i + 1];
557 ++i;
558 }
559 pte_chain->parent_ptes[i] = NULL;
560 if (i == 0) {
561 hlist_del(&pte_chain->link);
562 mmu_free_pte_chain(pte_chain);
563 if (hlist_empty(&page->parent_ptes)) {
564 page->multimapped = 0;
565 page->parent_pte = NULL;
566 }
567 }
568 return;
569 }
570 BUG();
571}
572
573static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm_vcpu *vcpu,
574 gfn_t gfn)
575{
576 unsigned index;
577 struct hlist_head *bucket;
578 struct kvm_mmu_page *page;
579 struct hlist_node *node;
580
581 pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
582 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
583 bucket = &vcpu->kvm->mmu_page_hash[index];
584 hlist_for_each_entry(page, node, bucket, hash_link)
585 if (page->gfn == gfn && !page->role.metaphysical) {
586 pgprintk("%s: found role %x\n",
587 __FUNCTION__, page->role.word);
588 return page;
589 }
590 return NULL;
591}
592
593static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
594 gfn_t gfn,
595 gva_t gaddr,
596 unsigned level,
597 int metaphysical,
598 unsigned hugepage_access,
599 u64 *parent_pte)
600{
601 union kvm_mmu_page_role role;
602 unsigned index;
603 unsigned quadrant;
604 struct hlist_head *bucket;
605 struct kvm_mmu_page *page;
606 struct hlist_node *node;
607
608 role.word = 0;
609 role.glevels = vcpu->mmu.root_level;
610 role.level = level;
611 role.metaphysical = metaphysical;
612 role.hugepage_access = hugepage_access;
613 if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) {
614 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
615 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
616 role.quadrant = quadrant;
617 }
618 pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
619 gfn, role.word);
620 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
621 bucket = &vcpu->kvm->mmu_page_hash[index];
622 hlist_for_each_entry(page, node, bucket, hash_link)
623 if (page->gfn == gfn && page->role.word == role.word) {
624 mmu_page_add_parent_pte(vcpu, page, parent_pte);
625 pgprintk("%s: found\n", __FUNCTION__);
626 return page;
627 }
628 page = kvm_mmu_alloc_page(vcpu, parent_pte);
629 if (!page)
630 return page;
631 pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
632 page->gfn = gfn;
633 page->role = role;
634 hlist_add_head(&page->hash_link, bucket);
635 if (!metaphysical)
636 rmap_write_protect(vcpu, gfn);
637 return page;
638}
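/*
 * The quadrant computed above handles 32-bit (non-PAE) guests: a guest
 * page table there has 1024 4-byte entries, while shadow page tables
 * always use the 64-bit format with 512 8-byte entries, so one guest
 * table is shadowed by two shadow tables at the page-table level and by
 * four at the directory level.  Assuming the usual PT32_PT_BITS == 10 and
 * PT64_PT_BITS == 9, a worked example for level == 1 and
 * gaddr == 0x00300000 (3MB):
 *
 *	quadrant = 0x00300000 >> (12 + 9 * 1)        = 1
 *	quadrant &= (1 << ((10 - 9) * 1)) - 1        = 1
 *
 * i.e. the address falls in the second 2MB half of the 4MB covered by the
 * guest table, so the second of its two shadow tables is selected.
 */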
639
640static void kvm_mmu_page_unlink_children(struct kvm *kvm,
641 struct kvm_mmu_page *page)
642{
643 unsigned i;
644 u64 *pt;
645 u64 ent;
646
647 pt = page->spt;
648
649 if (page->role.level == PT_PAGE_TABLE_LEVEL) {
650 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
651 if (pt[i] & PT_PRESENT_MASK)
652 rmap_remove(&pt[i]);
653 pt[i] = 0;
654 }
655 kvm_flush_remote_tlbs(kvm);
656 return;
657 }
658
659 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
660 ent = pt[i];
661
662 pt[i] = 0;
663 if (!(ent & PT_PRESENT_MASK))
664 continue;
665 ent &= PT64_BASE_ADDR_MASK;
666 mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
667 }
668 kvm_flush_remote_tlbs(kvm);
669}
670
671static void kvm_mmu_put_page(struct kvm_mmu_page *page,
672 u64 *parent_pte)
673{
674 mmu_page_remove_parent_pte(page, parent_pte);
675}
676
677static void kvm_mmu_zap_page(struct kvm *kvm,
678 struct kvm_mmu_page *page)
679{
680 u64 *parent_pte;
681
682 while (page->multimapped || page->parent_pte) {
683 if (!page->multimapped)
684 parent_pte = page->parent_pte;
685 else {
686 struct kvm_pte_chain *chain;
687
688 chain = container_of(page->parent_ptes.first,
689 struct kvm_pte_chain, link);
690 parent_pte = chain->parent_ptes[0];
691 }
692 BUG_ON(!parent_pte);
693 kvm_mmu_put_page(page, parent_pte);
694 set_shadow_pte(parent_pte, 0);
695 }
696 kvm_mmu_page_unlink_children(kvm, page);
697 if (!page->root_count) {
698 hlist_del(&page->hash_link);
699 kvm_mmu_free_page(kvm, page);
700 } else
701 list_move(&page->link, &kvm->active_mmu_pages);
702}
703
704static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn)
705{
706 unsigned index;
707 struct hlist_head *bucket;
708 struct kvm_mmu_page *page;
709 struct hlist_node *node, *n;
710 int r;
711
712 pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
713 r = 0;
714 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
715 bucket = &vcpu->kvm->mmu_page_hash[index];
716 hlist_for_each_entry_safe(page, node, n, bucket, hash_link)
717 if (page->gfn == gfn && !page->role.metaphysical) {
718 pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
719 page->role.word);
720 kvm_mmu_zap_page(vcpu->kvm, page);
721 r = 1;
722 }
723 return r;
724}
725
726static void mmu_unshadow(struct kvm_vcpu *vcpu, gfn_t gfn)
727{
728 struct kvm_mmu_page *page;
729
730 while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) {
731 pgprintk("%s: zap %lx %x\n",
732 __FUNCTION__, gfn, page->role.word);
733 kvm_mmu_zap_page(vcpu->kvm, page);
734 }
735}
736
737static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
738{
739 int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT));
740 struct kvm_mmu_page *page_head = page_header(__pa(pte));
741
742 __set_bit(slot, &page_head->slot_bitmap);
743}
744
745hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
746{
747 hpa_t hpa = gpa_to_hpa(vcpu, gpa);
748
 749	return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK) : hpa;
750}
751
752hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
753{
754 struct page *page;
755
756 ASSERT((gpa & HPA_ERR_MASK) == 0);
757 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
758 if (!page)
759 return gpa | HPA_ERR_MASK;
760 return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT)
761 | (gpa & (PAGE_SIZE-1));
762}
763
764hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva)
765{
766 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
767
768 if (gpa == UNMAPPED_GVA)
769 return UNMAPPED_GVA;
770 return gpa_to_hpa(vcpu, gpa);
771}
772
773struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
774{
775 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
776
777 if (gpa == UNMAPPED_GVA)
778 return NULL;
779 return pfn_to_page(gpa_to_hpa(vcpu, gpa) >> PAGE_SHIFT);
780}
781
782static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
783{
784}
785
786static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
787{
788 int level = PT32E_ROOT_LEVEL;
789 hpa_t table_addr = vcpu->mmu.root_hpa;
790
791 for (; ; level--) {
792 u32 index = PT64_INDEX(v, level);
793 u64 *table;
794 u64 pte;
795
796 ASSERT(VALID_PAGE(table_addr));
797 table = __va(table_addr);
798
799 if (level == 1) {
800 pte = table[index];
801 if (is_present_pte(pte) && is_writeble_pte(pte))
802 return 0;
803 mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT);
804 page_header_update_slot(vcpu->kvm, table, v);
805 table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK |
806 PT_USER_MASK;
807 rmap_add(vcpu, &table[index]);
808 return 0;
809 }
810
811 if (table[index] == 0) {
812 struct kvm_mmu_page *new_table;
813 gfn_t pseudo_gfn;
814
815 pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
816 >> PAGE_SHIFT;
817 new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
818 v, level - 1,
819 1, 0, &table[index]);
820 if (!new_table) {
821 pgprintk("nonpaging_map: ENOMEM\n");
822 return -ENOMEM;
823 }
824
825 table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
826 | PT_WRITABLE_MASK | PT_USER_MASK;
827 }
828 table_addr = table[index] & PT64_BASE_ADDR_MASK;
829 }
830}
831
832static void mmu_free_roots(struct kvm_vcpu *vcpu)
833{
834 int i;
835 struct kvm_mmu_page *page;
836
837 if (!VALID_PAGE(vcpu->mmu.root_hpa))
838 return;
839#ifdef CONFIG_X86_64
840 if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
841 hpa_t root = vcpu->mmu.root_hpa;
842
843 page = page_header(root);
844 --page->root_count;
845 vcpu->mmu.root_hpa = INVALID_PAGE;
846 return;
847 }
848#endif
849 for (i = 0; i < 4; ++i) {
850 hpa_t root = vcpu->mmu.pae_root[i];
851
852 if (root) {
853 root &= PT64_BASE_ADDR_MASK;
854 page = page_header(root);
855 --page->root_count;
856 }
857 vcpu->mmu.pae_root[i] = INVALID_PAGE;
858 }
859 vcpu->mmu.root_hpa = INVALID_PAGE;
860}
861
862static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
863{
864 int i;
865 gfn_t root_gfn;
866 struct kvm_mmu_page *page;
867
868 root_gfn = vcpu->cr3 >> PAGE_SHIFT;
869
870#ifdef CONFIG_X86_64
871 if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
872 hpa_t root = vcpu->mmu.root_hpa;
873
874 ASSERT(!VALID_PAGE(root));
875 page = kvm_mmu_get_page(vcpu, root_gfn, 0,
876 PT64_ROOT_LEVEL, 0, 0, NULL);
877 root = __pa(page->spt);
878 ++page->root_count;
879 vcpu->mmu.root_hpa = root;
880 return;
881 }
882#endif
883 for (i = 0; i < 4; ++i) {
884 hpa_t root = vcpu->mmu.pae_root[i];
885
886 ASSERT(!VALID_PAGE(root));
887 if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL) {
888 if (!is_present_pte(vcpu->pdptrs[i])) {
889 vcpu->mmu.pae_root[i] = 0;
890 continue;
891 }
892 root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT;
893 } else if (vcpu->mmu.root_level == 0)
894 root_gfn = 0;
895 page = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
896 PT32_ROOT_LEVEL, !is_paging(vcpu),
897 0, NULL);
898 root = __pa(page->spt);
899 ++page->root_count;
900 vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
901 }
902 vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root);
903}
904
905static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
906{
907 return vaddr;
908}
909
910static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
911 u32 error_code)
912{
913 gpa_t addr = gva;
914 hpa_t paddr;
915 int r;
916
917 r = mmu_topup_memory_caches(vcpu);
918 if (r)
919 return r;
920
921 ASSERT(vcpu);
922 ASSERT(VALID_PAGE(vcpu->mmu.root_hpa));
923
924
 925	paddr = gpa_to_hpa(vcpu, addr & PT64_BASE_ADDR_MASK);
926
927 if (is_error_hpa(paddr))
928 return 1;
929
930 return nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
931}
932
933static void nonpaging_free(struct kvm_vcpu *vcpu)
934{
935 mmu_free_roots(vcpu);
936}
937
938static int nonpaging_init_context(struct kvm_vcpu *vcpu)
939{
940 struct kvm_mmu *context = &vcpu->mmu;
941
942 context->new_cr3 = nonpaging_new_cr3;
943 context->page_fault = nonpaging_page_fault;
944 context->gva_to_gpa = nonpaging_gva_to_gpa;
945 context->free = nonpaging_free;
946 context->root_level = 0;
947 context->shadow_root_level = PT32E_ROOT_LEVEL;
948 context->root_hpa = INVALID_PAGE;
949 return 0;
950}
951
952static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
953{
954 ++vcpu->stat.tlb_flush;
955 kvm_x86_ops->tlb_flush(vcpu);
956}
957
958static void paging_new_cr3(struct kvm_vcpu *vcpu)
959{
960 pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
961 mmu_free_roots(vcpu);
962}
963
964static void inject_page_fault(struct kvm_vcpu *vcpu,
965 u64 addr,
966 u32 err_code)
967{
968 kvm_x86_ops->inject_page_fault(vcpu, addr, err_code);
969}
970
971static void paging_free(struct kvm_vcpu *vcpu)
972{
973 nonpaging_free(vcpu);
974}
975
976#define PTTYPE 64
977#include "paging_tmpl.h"
978#undef PTTYPE
979
980#define PTTYPE 32
981#include "paging_tmpl.h"
982#undef PTTYPE
983
984static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
985{
986 struct kvm_mmu *context = &vcpu->mmu;
987
988 ASSERT(is_pae(vcpu));
989 context->new_cr3 = paging_new_cr3;
990 context->page_fault = paging64_page_fault;
991 context->gva_to_gpa = paging64_gva_to_gpa;
992 context->free = paging_free;
993 context->root_level = level;
994 context->shadow_root_level = level;
995 context->root_hpa = INVALID_PAGE;
996 return 0;
997}
998
999static int paging64_init_context(struct kvm_vcpu *vcpu)
1000{
1001 return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
1002}
1003
1004static int paging32_init_context(struct kvm_vcpu *vcpu)
1005{
1006 struct kvm_mmu *context = &vcpu->mmu;
1007
1008 context->new_cr3 = paging_new_cr3;
1009 context->page_fault = paging32_page_fault;
1010 context->gva_to_gpa = paging32_gva_to_gpa;
1011 context->free = paging_free;
1012 context->root_level = PT32_ROOT_LEVEL;
1013 context->shadow_root_level = PT32E_ROOT_LEVEL;
1014 context->root_hpa = INVALID_PAGE;
1015 return 0;
1016}
1017
1018static int paging32E_init_context(struct kvm_vcpu *vcpu)
1019{
1020 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
1021}
1022
1023static int init_kvm_mmu(struct kvm_vcpu *vcpu)
1024{
1025 ASSERT(vcpu);
1026 ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
1027
1028 if (!is_paging(vcpu))
1029 return nonpaging_init_context(vcpu);
1030 else if (is_long_mode(vcpu))
1031 return paging64_init_context(vcpu);
1032 else if (is_pae(vcpu))
1033 return paging32E_init_context(vcpu);
1034 else
1035 return paging32_init_context(vcpu);
1036}
1037
1038static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
1039{
1040 ASSERT(vcpu);
1041 if (VALID_PAGE(vcpu->mmu.root_hpa)) {
1042 vcpu->mmu.free(vcpu);
1043 vcpu->mmu.root_hpa = INVALID_PAGE;
1044 }
1045}
1046
1047int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
1048{
1049 destroy_kvm_mmu(vcpu);
1050 return init_kvm_mmu(vcpu);
1051}
1052EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
1053
1054int kvm_mmu_load(struct kvm_vcpu *vcpu)
1055{
1056 int r;
1057
1058 mutex_lock(&vcpu->kvm->lock);
1059 r = mmu_topup_memory_caches(vcpu);
1060 if (r)
1061 goto out;
1062 mmu_alloc_roots(vcpu);
1063 kvm_x86_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
1064 kvm_mmu_flush_tlb(vcpu);
1065out:
1066 mutex_unlock(&vcpu->kvm->lock);
1067 return r;
1068}
1069EXPORT_SYMBOL_GPL(kvm_mmu_load);
1070
1071void kvm_mmu_unload(struct kvm_vcpu *vcpu)
1072{
1073 mmu_free_roots(vcpu);
1074}
1075
1076static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
1077 struct kvm_mmu_page *page,
1078 u64 *spte)
1079{
1080 u64 pte;
1081 struct kvm_mmu_page *child;
1082
1083 pte = *spte;
1084 if (is_present_pte(pte)) {
1085 if (page->role.level == PT_PAGE_TABLE_LEVEL)
1086 rmap_remove(spte);
1087 else {
1088 child = page_header(pte & PT64_BASE_ADDR_MASK);
1089 mmu_page_remove_parent_pte(child, spte);
1090 }
1091 }
1092 set_shadow_pte(spte, 0);
1093 kvm_flush_remote_tlbs(vcpu->kvm);
1094}
1095
1096static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
1097 struct kvm_mmu_page *page,
1098 u64 *spte,
1099 const void *new, int bytes)
1100{
1101 if (page->role.level != PT_PAGE_TABLE_LEVEL)
1102 return;
1103
1104 if (page->role.glevels == PT32_ROOT_LEVEL)
1105 paging32_update_pte(vcpu, page, spte, new, bytes);
1106 else
1107 paging64_update_pte(vcpu, page, spte, new, bytes);
1108}
1109
1110void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1111 const u8 *new, int bytes)
1112{
1113 gfn_t gfn = gpa >> PAGE_SHIFT;
1114 struct kvm_mmu_page *page;
1115 struct hlist_node *node, *n;
1116 struct hlist_head *bucket;
1117 unsigned index;
1118 u64 *spte;
1119 unsigned offset = offset_in_page(gpa);
1120 unsigned pte_size;
1121 unsigned page_offset;
1122 unsigned misaligned;
1123 unsigned quadrant;
1124 int level;
1125 int flooded = 0;
1126 int npte;
1127
1128 pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
1129 if (gfn == vcpu->last_pt_write_gfn) {
1130 ++vcpu->last_pt_write_count;
1131 if (vcpu->last_pt_write_count >= 3)
1132 flooded = 1;
1133 } else {
1134 vcpu->last_pt_write_gfn = gfn;
1135 vcpu->last_pt_write_count = 1;
1136 }
1137 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
1138 bucket = &vcpu->kvm->mmu_page_hash[index];
1139 hlist_for_each_entry_safe(page, node, n, bucket, hash_link) {
1140 if (page->gfn != gfn || page->role.metaphysical)
1141 continue;
1142 pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
1143 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
1144 misaligned |= bytes < 4;
1145 if (misaligned || flooded) {
1146 /*
1147 * Misaligned accesses are too much trouble to fix
1148 * up; also, they usually indicate a page is not used
1149 * as a page table.
1150 *
1151 * If we're seeing too many writes to a page,
1152 * it may no longer be a page table, or we may be
1153 * forking, in which case it is better to unmap the
1154 * page.
1155 */
1156 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
1157 gpa, bytes, page->role.word);
1158 kvm_mmu_zap_page(vcpu->kvm, page);
1159 continue;
1160 }
1161 page_offset = offset;
1162 level = page->role.level;
1163 npte = 1;
1164 if (page->role.glevels == PT32_ROOT_LEVEL) {
1165 page_offset <<= 1; /* 32->64 */
1166 /*
1167 * A 32-bit pde maps 4MB while the shadow pdes map
1168 * only 2MB. So we need to double the offset again
1169 * and zap two pdes instead of one.
1170 */
1171 if (level == PT32_ROOT_LEVEL) {
1172 page_offset &= ~7; /* kill rounding error */
1173 page_offset <<= 1;
1174 npte = 2;
1175 }
1176 quadrant = page_offset >> PAGE_SHIFT;
1177 page_offset &= ~PAGE_MASK;
1178 if (quadrant != page->role.quadrant)
1179 continue;
1180 }
1181 spte = &page->spt[page_offset / sizeof(*spte)];
1182 while (npte--) {
1183 mmu_pte_write_zap_pte(vcpu, page, spte);
1184 mmu_pte_write_new_pte(vcpu, page, spte, new, bytes);
1185 ++spte;
1186 }
1187 }
1188}
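/*
 * Two of the computations above are easiest to follow with numbers.  The
 * misalignment test flags any write that straddles a pte boundary: for a
 * 4-byte write at offset 0x1fe of a page holding 8-byte ptes,
 *
 *	(0x1fe ^ (0x1fe + 4 - 1)) & ~7 = (0x1fe ^ 0x201) & ~7
 *	                               = 0x3ff & ~7 = 0x3f8 != 0
 *
 * so the write crosses from one pte into the next and the whole shadow
 * page is zapped instead of patched.  The 32-bit-guest rescaling doubles
 * the offset once because guest ptes are half the size of shadow ptes,
 * and doubles it again for a guest pde because one 4MB guest mapping is
 * represented by two 2MB shadow pdes: a write to the guest pde at offset
 * 0x10 (entry 4) ends up with page_offset == 0x40 and npte == 2, i.e.
 * shadow entries 8 and 9.
 */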
1189
1190int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
1191{
1192 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
1193
1194 return kvm_mmu_unprotect_page(vcpu, gpa >> PAGE_SHIFT);
1195}
1196
1197void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
1198{
1199 while (vcpu->kvm->n_free_mmu_pages < KVM_REFILL_PAGES) {
1200 struct kvm_mmu_page *page;
1201
1202 page = container_of(vcpu->kvm->active_mmu_pages.prev,
1203 struct kvm_mmu_page, link);
1204 kvm_mmu_zap_page(vcpu->kvm, page);
1205 }
1206}
1207
1208static void free_mmu_pages(struct kvm_vcpu *vcpu)
1209{
1210 struct kvm_mmu_page *page;
1211
1212 while (!list_empty(&vcpu->kvm->active_mmu_pages)) {
1213 page = container_of(vcpu->kvm->active_mmu_pages.next,
1214 struct kvm_mmu_page, link);
1215 kvm_mmu_zap_page(vcpu->kvm, page);
1216 }
1217 free_page((unsigned long)vcpu->mmu.pae_root);
1218}
1219
1220static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
1221{
1222 struct page *page;
1223 int i;
1224
1225 ASSERT(vcpu);
1226
1227 vcpu->kvm->n_free_mmu_pages = KVM_NUM_MMU_PAGES;
1228
1229 /*
1230 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
1231 * Therefore we need to allocate shadow page tables in the first
1232 * 4GB of memory, which happens to fit the DMA32 zone.
1233 */
1234 page = alloc_page(GFP_KERNEL | __GFP_DMA32);
1235 if (!page)
1236 goto error_1;
1237 vcpu->mmu.pae_root = page_address(page);
1238 for (i = 0; i < 4; ++i)
1239 vcpu->mmu.pae_root[i] = INVALID_PAGE;
1240
1241 return 0;
1242
1243error_1:
1244 free_mmu_pages(vcpu);
1245 return -ENOMEM;
1246}
1247
1248int kvm_mmu_create(struct kvm_vcpu *vcpu)
1249{
1250 ASSERT(vcpu);
1251 ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
1252
1253 return alloc_mmu_pages(vcpu);
1254}
1255
1256int kvm_mmu_setup(struct kvm_vcpu *vcpu)
1257{
1258 ASSERT(vcpu);
1259 ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
1260
1261 return init_kvm_mmu(vcpu);
1262}
1263
1264void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
1265{
1266 ASSERT(vcpu);
1267
1268 destroy_kvm_mmu(vcpu);
1269 free_mmu_pages(vcpu);
1270 mmu_free_memory_caches(vcpu);
1271}
1272
1273void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
1274{
1275 struct kvm_mmu_page *page;
1276
1277 list_for_each_entry(page, &kvm->active_mmu_pages, link) {
1278 int i;
1279 u64 *pt;
1280
1281 if (!test_bit(slot, &page->slot_bitmap))
1282 continue;
1283
1284 pt = page->spt;
1285 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1286 /* avoid RMW */
1287 if (pt[i] & PT_WRITABLE_MASK) {
1288 rmap_remove(&pt[i]);
1289 pt[i] &= ~PT_WRITABLE_MASK;
1290 }
1291 }
1292}
1293
1294void kvm_mmu_zap_all(struct kvm *kvm)
1295{
1296 struct kvm_mmu_page *page, *node;
1297
1298 list_for_each_entry_safe(page, node, &kvm->active_mmu_pages, link)
1299 kvm_mmu_zap_page(kvm, page);
1300
1301 kvm_flush_remote_tlbs(kvm);
1302}
1303
1304void kvm_mmu_module_exit(void)
1305{
1306 if (pte_chain_cache)
1307 kmem_cache_destroy(pte_chain_cache);
1308 if (rmap_desc_cache)
1309 kmem_cache_destroy(rmap_desc_cache);
1310 if (mmu_page_header_cache)
1311 kmem_cache_destroy(mmu_page_header_cache);
1312}
1313
1314int kvm_mmu_module_init(void)
1315{
1316 pte_chain_cache = kmem_cache_create("kvm_pte_chain",
1317 sizeof(struct kvm_pte_chain),
1318 0, 0, NULL);
1319 if (!pte_chain_cache)
1320 goto nomem;
1321 rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
1322 sizeof(struct kvm_rmap_desc),
1323 0, 0, NULL);
1324 if (!rmap_desc_cache)
1325 goto nomem;
1326
1327 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
1328 sizeof(struct kvm_mmu_page),
1329 0, 0, NULL);
1330 if (!mmu_page_header_cache)
1331 goto nomem;
1332
1333 return 0;
1334
1335nomem:
1336 kvm_mmu_module_exit();
1337 return -ENOMEM;
1338}
1339
1340#ifdef AUDIT
1341
1342static const char *audit_msg;
1343
1344static gva_t canonicalize(gva_t gva)
1345{
1346#ifdef CONFIG_X86_64
1347 gva = (long long)(gva << 16) >> 16;
1348#endif
1349 return gva;
1350}
1351
1352static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
1353 gva_t va, int level)
1354{
1355 u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
1356 int i;
1357 gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
1358
1359 for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
1360 u64 ent = pt[i];
1361
1362 if (!(ent & PT_PRESENT_MASK))
1363 continue;
1364
1365 va = canonicalize(va);
1366 if (level > 1)
1367 audit_mappings_page(vcpu, ent, va, level - 1);
1368 else {
1369 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va);
1370 hpa_t hpa = gpa_to_hpa(vcpu, gpa);
1371
1372 if ((ent & PT_PRESENT_MASK)
1373 && (ent & PT64_BASE_ADDR_MASK) != hpa)
1374 printk(KERN_ERR "audit error: (%s) levels %d"
1375 " gva %lx gpa %llx hpa %llx ent %llx\n",
1376 audit_msg, vcpu->mmu.root_level,
1377 va, gpa, hpa, ent);
1378 }
1379 }
1380}
1381
1382static void audit_mappings(struct kvm_vcpu *vcpu)
1383{
1384 unsigned i;
1385
1386 if (vcpu->mmu.root_level == 4)
1387 audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4);
1388 else
1389 for (i = 0; i < 4; ++i)
1390 if (vcpu->mmu.pae_root[i] & PT_PRESENT_MASK)
1391 audit_mappings_page(vcpu,
1392 vcpu->mmu.pae_root[i],
1393 i << 30,
1394 2);
1395}
1396
1397static int count_rmaps(struct kvm_vcpu *vcpu)
1398{
1399 int nmaps = 0;
1400 int i, j, k;
1401
1402 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
1403 struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
1404 struct kvm_rmap_desc *d;
1405
1406 for (j = 0; j < m->npages; ++j) {
1407 struct page *page = m->phys_mem[j];
1408
1409 if (!page->private)
1410 continue;
1411 if (!(page->private & 1)) {
1412 ++nmaps;
1413 continue;
1414 }
1415 d = (struct kvm_rmap_desc *)(page->private & ~1ul);
1416 while (d) {
1417 for (k = 0; k < RMAP_EXT; ++k)
1418 if (d->shadow_ptes[k])
1419 ++nmaps;
1420 else
1421 break;
1422 d = d->more;
1423 }
1424 }
1425 }
1426 return nmaps;
1427}
1428
1429static int count_writable_mappings(struct kvm_vcpu *vcpu)
1430{
1431 int nmaps = 0;
1432 struct kvm_mmu_page *page;
1433 int i;
1434
1435 list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
1436 u64 *pt = page->spt;
1437
1438 if (page->role.level != PT_PAGE_TABLE_LEVEL)
1439 continue;
1440
1441 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1442 u64 ent = pt[i];
1443
1444 if (!(ent & PT_PRESENT_MASK))
1445 continue;
1446 if (!(ent & PT_WRITABLE_MASK))
1447 continue;
1448 ++nmaps;
1449 }
1450 }
1451 return nmaps;
1452}
1453
1454static void audit_rmap(struct kvm_vcpu *vcpu)
1455{
1456 int n_rmap = count_rmaps(vcpu);
1457 int n_actual = count_writable_mappings(vcpu);
1458
1459 if (n_rmap != n_actual)
1460 printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
1461 __FUNCTION__, audit_msg, n_rmap, n_actual);
1462}
1463
1464static void audit_write_protection(struct kvm_vcpu *vcpu)
1465{
1466 struct kvm_mmu_page *page;
1467
1468 list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
1469 hfn_t hfn;
1470 struct page *pg;
1471
1472 if (page->role.metaphysical)
1473 continue;
1474
1475 hfn = gpa_to_hpa(vcpu, (gpa_t)page->gfn << PAGE_SHIFT)
1476 >> PAGE_SHIFT;
1477 pg = pfn_to_page(hfn);
1478 if (pg->private)
1479 printk(KERN_ERR "%s: (%s) shadow page has writable"
1480 " mappings: gfn %lx role %x\n",
1481 __FUNCTION__, audit_msg, page->gfn,
1482 page->role.word);
1483 }
1484}
1485
1486static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
1487{
1488 int olddbg = dbg;
1489
1490 dbg = 0;
1491 audit_msg = msg;
1492 audit_rmap(vcpu);
1493 audit_write_protection(vcpu);
1494 audit_mappings(vcpu);
1495 dbg = olddbg;
1496}
1497
1498#endif
diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h
deleted file mode 100644
index 6b094b44f8fb..000000000000
--- a/drivers/kvm/paging_tmpl.h
+++ /dev/null
@@ -1,511 +0,0 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * MMU support
8 *
9 * Copyright (C) 2006 Qumranet, Inc.
10 *
11 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com>
13 * Avi Kivity <avi@qumranet.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
17 *
18 */
19
20/*
21 * We need the mmu code to access both 32-bit and 64-bit guest ptes,
22 * so the code in this file is compiled twice, once per pte size.
23 */
24
25#if PTTYPE == 64
26 #define pt_element_t u64
27 #define guest_walker guest_walker64
28 #define FNAME(name) paging##64_##name
29 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
30 #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
31 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
32 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
33 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
34 #ifdef CONFIG_X86_64
35 #define PT_MAX_FULL_LEVELS 4
36 #else
37 #define PT_MAX_FULL_LEVELS 2
38 #endif
39#elif PTTYPE == 32
40 #define pt_element_t u32
41 #define guest_walker guest_walker32
42 #define FNAME(name) paging##32_##name
43 #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
44 #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
45 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
46 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
47 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
48 #define PT_MAX_FULL_LEVELS 2
49#else
50 #error Invalid PTTYPE value
51#endif
52
53/*
54 * The guest_walker structure emulates the behavior of the hardware page
55 * table walker.
56 */
57struct guest_walker {
58 int level;
59 gfn_t table_gfn[PT_MAX_FULL_LEVELS];
60 pt_element_t *table;
61 pt_element_t pte;
62 pt_element_t *ptep;
63 struct page *page;
64 int index;
65 pt_element_t inherited_ar;
66 gfn_t gfn;
67 u32 error_code;
68};
69
70/*
71 * Fetch a guest pte for a guest virtual address
72 */
73static int FNAME(walk_addr)(struct guest_walker *walker,
74 struct kvm_vcpu *vcpu, gva_t addr,
75 int write_fault, int user_fault, int fetch_fault)
76{
77 hpa_t hpa;
78 struct kvm_memory_slot *slot;
79 pt_element_t *ptep;
80 pt_element_t root;
81 gfn_t table_gfn;
82
83 pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
84 walker->level = vcpu->mmu.root_level;
85 walker->table = NULL;
86 walker->page = NULL;
87 walker->ptep = NULL;
88 root = vcpu->cr3;
89#if PTTYPE == 64
90 if (!is_long_mode(vcpu)) {
91 walker->ptep = &vcpu->pdptrs[(addr >> 30) & 3];
92 root = *walker->ptep;
93 walker->pte = root;
94 if (!(root & PT_PRESENT_MASK))
95 goto not_present;
96 --walker->level;
97 }
98#endif
99 table_gfn = (root & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
100 walker->table_gfn[walker->level - 1] = table_gfn;
101 pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
102 walker->level - 1, table_gfn);
103 slot = gfn_to_memslot(vcpu->kvm, table_gfn);
104 hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK);
105 walker->page = pfn_to_page(hpa >> PAGE_SHIFT);
106 walker->table = kmap_atomic(walker->page, KM_USER0);
107
108 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
109 (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
110
111 walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK;
112
113 for (;;) {
114 int index = PT_INDEX(addr, walker->level);
115 hpa_t paddr;
116
117 ptep = &walker->table[index];
118 walker->index = index;
119 ASSERT(((unsigned long)walker->table & PAGE_MASK) ==
120 ((unsigned long)ptep & PAGE_MASK));
121
122 if (!is_present_pte(*ptep))
123 goto not_present;
124
125 if (write_fault && !is_writeble_pte(*ptep))
126 if (user_fault || is_write_protection(vcpu))
127 goto access_error;
128
129 if (user_fault && !(*ptep & PT_USER_MASK))
130 goto access_error;
131
132#if PTTYPE == 64
133 if (fetch_fault && is_nx(vcpu) && (*ptep & PT64_NX_MASK))
134 goto access_error;
135#endif
136
137 if (!(*ptep & PT_ACCESSED_MASK)) {
138 mark_page_dirty(vcpu->kvm, table_gfn);
139 *ptep |= PT_ACCESSED_MASK;
140 }
141
142 if (walker->level == PT_PAGE_TABLE_LEVEL) {
143 walker->gfn = (*ptep & PT_BASE_ADDR_MASK)
144 >> PAGE_SHIFT;
145 break;
146 }
147
148 if (walker->level == PT_DIRECTORY_LEVEL
149 && (*ptep & PT_PAGE_SIZE_MASK)
150 && (PTTYPE == 64 || is_pse(vcpu))) {
151 walker->gfn = (*ptep & PT_DIR_BASE_ADDR_MASK)
152 >> PAGE_SHIFT;
153 walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
154 break;
155 }
156
157 walker->inherited_ar &= walker->table[index];
158 table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
159 kunmap_atomic(walker->table, KM_USER0);
160 paddr = safe_gpa_to_hpa(vcpu, table_gfn << PAGE_SHIFT);
161 walker->page = pfn_to_page(paddr >> PAGE_SHIFT);
162 walker->table = kmap_atomic(walker->page, KM_USER0);
163 --walker->level;
 164		walker->table_gfn[walker->level - 1] = table_gfn;
165 pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
166 walker->level - 1, table_gfn);
167 }
168 walker->pte = *ptep;
169 if (walker->page)
170 walker->ptep = NULL;
171 if (walker->table)
172 kunmap_atomic(walker->table, KM_USER0);
173 pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)*ptep);
174 return 1;
175
176not_present:
177 walker->error_code = 0;
178 goto err;
179
180access_error:
181 walker->error_code = PFERR_PRESENT_MASK;
182
183err:
184 if (write_fault)
185 walker->error_code |= PFERR_WRITE_MASK;
186 if (user_fault)
187 walker->error_code |= PFERR_USER_MASK;
188 if (fetch_fault)
189 walker->error_code |= PFERR_FETCH_MASK;
190 if (walker->table)
191 kunmap_atomic(walker->table, KM_USER0);
192 return 0;
193}
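/*
 * The error_code assembled at the labels above follows the x86 page fault
 * error code layout, assuming the usual PFERR_*_MASK values (present =
 * bit 0, write = bit 1, user = bit 2, fetch = bit 4).  For example, a
 * user-mode write to a present but read-only guest pte leaves through
 * access_error and yields
 *
 *	PFERR_PRESENT_MASK | PFERR_WRITE_MASK | PFERR_USER_MASK
 *
 * which is exactly the error code the guest expects to see when the fault
 * is injected back into it.
 */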
194
195static void FNAME(mark_pagetable_dirty)(struct kvm *kvm,
196 struct guest_walker *walker)
197{
198 mark_page_dirty(kvm, walker->table_gfn[walker->level - 1]);
199}
200
201static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu,
202 u64 *shadow_pte,
203 gpa_t gaddr,
204 pt_element_t gpte,
205 u64 access_bits,
206 int user_fault,
207 int write_fault,
208 int *ptwrite,
209 struct guest_walker *walker,
210 gfn_t gfn)
211{
212 hpa_t paddr;
213 int dirty = gpte & PT_DIRTY_MASK;
214 u64 spte = *shadow_pte;
215 int was_rmapped = is_rmap_pte(spte);
216
217 pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d"
218 " user_fault %d gfn %lx\n",
219 __FUNCTION__, spte, (u64)gpte, access_bits,
220 write_fault, user_fault, gfn);
221
222 if (write_fault && !dirty) {
223 pt_element_t *guest_ent, *tmp = NULL;
224
225 if (walker->ptep)
226 guest_ent = walker->ptep;
227 else {
228 tmp = kmap_atomic(walker->page, KM_USER0);
229 guest_ent = &tmp[walker->index];
230 }
231
232 *guest_ent |= PT_DIRTY_MASK;
233 if (!walker->ptep)
234 kunmap_atomic(tmp, KM_USER0);
235 dirty = 1;
236 FNAME(mark_pagetable_dirty)(vcpu->kvm, walker);
237 }
238
239 spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK;
240 spte |= gpte & PT64_NX_MASK;
241 if (!dirty)
242 access_bits &= ~PT_WRITABLE_MASK;
243
244 paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);
245
246 spte |= PT_PRESENT_MASK;
247 if (access_bits & PT_USER_MASK)
248 spte |= PT_USER_MASK;
249
250 if (is_error_hpa(paddr)) {
251 spte |= gaddr;
252 spte |= PT_SHADOW_IO_MARK;
253 spte &= ~PT_PRESENT_MASK;
254 set_shadow_pte(shadow_pte, spte);
255 return;
256 }
257
258 spte |= paddr;
259
260 if ((access_bits & PT_WRITABLE_MASK)
261 || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
262 struct kvm_mmu_page *shadow;
263
264 spte |= PT_WRITABLE_MASK;
265 if (user_fault) {
266 mmu_unshadow(vcpu, gfn);
267 goto unshadowed;
268 }
269
270 shadow = kvm_mmu_lookup_page(vcpu, gfn);
271 if (shadow) {
272 pgprintk("%s: found shadow page for %lx, marking ro\n",
273 __FUNCTION__, gfn);
274 access_bits &= ~PT_WRITABLE_MASK;
275 if (is_writeble_pte(spte)) {
276 spte &= ~PT_WRITABLE_MASK;
277 kvm_x86_ops->tlb_flush(vcpu);
278 }
279 if (write_fault)
280 *ptwrite = 1;
281 }
282 }
283
284unshadowed:
285
286 if (access_bits & PT_WRITABLE_MASK)
287 mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
288
289 set_shadow_pte(shadow_pte, spte);
290 page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
291 if (!was_rmapped)
292 rmap_add(vcpu, shadow_pte);
293}
294
295static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t gpte,
296 u64 *shadow_pte, u64 access_bits,
297 int user_fault, int write_fault, int *ptwrite,
298 struct guest_walker *walker, gfn_t gfn)
299{
300 access_bits &= gpte;
301 FNAME(set_pte_common)(vcpu, shadow_pte, gpte & PT_BASE_ADDR_MASK,
302 gpte, access_bits, user_fault, write_fault,
303 ptwrite, walker, gfn);
304}
305
306static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
307 u64 *spte, const void *pte, int bytes)
308{
309 pt_element_t gpte;
310
311 if (bytes < sizeof(pt_element_t))
312 return;
313 gpte = *(const pt_element_t *)pte;
314 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK))
315 return;
316 pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
317 FNAME(set_pte)(vcpu, gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0,
318 0, NULL, NULL,
319 (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT);
320}
321
322static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t gpde,
323 u64 *shadow_pte, u64 access_bits,
324 int user_fault, int write_fault, int *ptwrite,
325 struct guest_walker *walker, gfn_t gfn)
326{
327 gpa_t gaddr;
328
329 access_bits &= gpde;
330 gaddr = (gpa_t)gfn << PAGE_SHIFT;
331 if (PTTYPE == 32 && is_cpuid_PSE36())
332 gaddr |= (gpde & PT32_DIR_PSE36_MASK) <<
333 (32 - PT32_DIR_PSE36_SHIFT);
334 FNAME(set_pte_common)(vcpu, shadow_pte, gaddr,
335 gpde, access_bits, user_fault, write_fault,
336 ptwrite, walker, gfn);
337}
338
339/*
340 * Fetch a shadow pte for a specific level in the paging hierarchy.
341 */
342static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
343 struct guest_walker *walker,
344 int user_fault, int write_fault, int *ptwrite)
345{
346 hpa_t shadow_addr;
347 int level;
348 u64 *shadow_ent;
349 u64 *prev_shadow_ent = NULL;
350
351 if (!is_present_pte(walker->pte))
352 return NULL;
353
354 shadow_addr = vcpu->mmu.root_hpa;
355 level = vcpu->mmu.shadow_root_level;
356 if (level == PT32E_ROOT_LEVEL) {
357 shadow_addr = vcpu->mmu.pae_root[(addr >> 30) & 3];
358 shadow_addr &= PT64_BASE_ADDR_MASK;
359 --level;
360 }
361
362 for (; ; level--) {
363 u32 index = SHADOW_PT_INDEX(addr, level);
364 struct kvm_mmu_page *shadow_page;
365 u64 shadow_pte;
366 int metaphysical;
367 gfn_t table_gfn;
368 unsigned hugepage_access = 0;
369
370 shadow_ent = ((u64 *)__va(shadow_addr)) + index;
371 if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) {
372 if (level == PT_PAGE_TABLE_LEVEL)
373 break;
374 shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
375 prev_shadow_ent = shadow_ent;
376 continue;
377 }
378
379 if (level == PT_PAGE_TABLE_LEVEL)
380 break;
381
382 if (level - 1 == PT_PAGE_TABLE_LEVEL
383 && walker->level == PT_DIRECTORY_LEVEL) {
384 metaphysical = 1;
385 hugepage_access = walker->pte;
386 hugepage_access &= PT_USER_MASK | PT_WRITABLE_MASK;
387 if (walker->pte & PT64_NX_MASK)
388 hugepage_access |= (1 << 2);
389 hugepage_access >>= PT_WRITABLE_SHIFT;
390 table_gfn = (walker->pte & PT_BASE_ADDR_MASK)
391 >> PAGE_SHIFT;
392 } else {
393 metaphysical = 0;
394 table_gfn = walker->table_gfn[level - 2];
395 }
396 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
397 metaphysical, hugepage_access,
398 shadow_ent);
399 shadow_addr = __pa(shadow_page->spt);
400 shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
401 | PT_WRITABLE_MASK | PT_USER_MASK;
402 *shadow_ent = shadow_pte;
403 prev_shadow_ent = shadow_ent;
404 }
405
406 if (walker->level == PT_DIRECTORY_LEVEL) {
407 FNAME(set_pde)(vcpu, walker->pte, shadow_ent,
408 walker->inherited_ar, user_fault, write_fault,
409 ptwrite, walker, walker->gfn);
410 } else {
411 ASSERT(walker->level == PT_PAGE_TABLE_LEVEL);
412 FNAME(set_pte)(vcpu, walker->pte, shadow_ent,
413 walker->inherited_ar, user_fault, write_fault,
414 ptwrite, walker, walker->gfn);
415 }
416 return shadow_ent;
417}
418
419/*
420 * Page fault handler. There are several causes for a page fault:
421 * - there is no shadow pte for the guest pte
422 * - write access through a shadow pte marked read only so that we can set
423 * the dirty bit
424 * - write access to a shadow pte marked read only so we can update the page
425 * dirty bitmap, when userspace requests it
426 * - mmio access; in this case we will never install a present shadow pte
427 * - normal guest page fault due to the guest pte marked not present, not
428 * writable, or not executable
429 *
430 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
431 * a negative value on error.
432 */
433static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
434 u32 error_code)
435{
436 int write_fault = error_code & PFERR_WRITE_MASK;
437 int user_fault = error_code & PFERR_USER_MASK;
438 int fetch_fault = error_code & PFERR_FETCH_MASK;
439 struct guest_walker walker;
440 u64 *shadow_pte;
441 int write_pt = 0;
442 int r;
443
444 pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
445 kvm_mmu_audit(vcpu, "pre page fault");
446
447 r = mmu_topup_memory_caches(vcpu);
448 if (r)
449 return r;
450
451 /*
452 * Look up the shadow pte for the faulting address.
453 */
454 r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
455 fetch_fault);
456
457 /*
458 * The page is not mapped by the guest. Let the guest handle it.
459 */
460 if (!r) {
461 pgprintk("%s: guest page fault\n", __FUNCTION__);
462 inject_page_fault(vcpu, addr, walker.error_code);
463 vcpu->last_pt_write_count = 0; /* reset fork detector */
464 return 0;
465 }
466
467 shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
468 &write_pt);
469 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
470 shadow_pte, *shadow_pte, write_pt);
471
472 if (!write_pt)
473 vcpu->last_pt_write_count = 0; /* reset fork detector */
474
475 /*
 476	 * mmio: emulate if accessible, otherwise it's a guest fault.
477 */
478 if (is_io_pte(*shadow_pte))
479 return 1;
480
481 ++vcpu->stat.pf_fixed;
482 kvm_mmu_audit(vcpu, "post page fault (fixed)");
483
484 return write_pt;
485}
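/*
 * A sketch of how a caller is expected to act on the return value
 * (roughly what kvm_mmu_page_fault() does; the caller is outside the
 * lines of this patch, so take the names as illustrative):
 *
 *	r = vcpu->mmu.page_fault(vcpu, cr2, error_code);
 *	if (r < 0)
 *		return r;          propagate -ENOMEM and friends
 *	if (r == 0)
 *		resume the guest;  the shadow pte has been fixed up
 *	otherwise
 *		emulate the instruction (mmio, or a write to a page that is
 *		itself shadowed as a page table)
 */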
486
487static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
488{
489 struct guest_walker walker;
490 gpa_t gpa = UNMAPPED_GVA;
491 int r;
492
493 r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);
494
495 if (r) {
496 gpa = (gpa_t)walker.gfn << PAGE_SHIFT;
497 gpa |= vaddr & ~PAGE_MASK;
498 }
499
500 return gpa;
501}
502
503#undef pt_element_t
504#undef guest_walker
505#undef FNAME
506#undef PT_BASE_ADDR_MASK
507#undef PT_INDEX
508#undef SHADOW_PT_INDEX
509#undef PT_LEVEL_MASK
510#undef PT_DIR_BASE_ADDR_MASK
511#undef PT_MAX_FULL_LEVELS
diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c
deleted file mode 100644
index bd46de6bf891..000000000000
--- a/drivers/kvm/x86_emulate.c
+++ /dev/null
@@ -1,1662 +0,0 @@
1/******************************************************************************
2 * x86_emulate.c
3 *
4 * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
5 *
6 * Copyright (c) 2005 Keir Fraser
7 *
8 * Linux coding style, mod r/m decoder, segment base fixes, real-mode
9 * privileged instructions:
10 *
11 * Copyright (C) 2006 Qumranet
12 *
13 * Avi Kivity <avi@qumranet.com>
14 * Yaniv Kamay <yaniv@qumranet.com>
15 *
16 * This work is licensed under the terms of the GNU GPL, version 2. See
17 * the COPYING file in the top-level directory.
18 *
19 * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
20 */
21
22#ifndef __KERNEL__
23#include <stdio.h>
24#include <stdint.h>
25#include <public/xen.h>
26#define DPRINTF(_f, _a ...) printf( _f , ## _a )
27#else
28#include "kvm.h"
29#define DPRINTF(x...) do {} while (0)
30#endif
31#include "x86_emulate.h"
32#include <linux/module.h>
33
34/*
35 * Opcode effective-address decode tables.
36 * Note that we only emulate instructions that have at least one memory
37 * operand (excluding implicit stack references). We assume that stack
38 * references and instruction fetches will never occur in special memory
39 * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
40 * not be handled.
41 */
42
43/* Operand sizes: 8-bit operands or specified/overridden size. */
44#define ByteOp (1<<0) /* 8-bit operands. */
45/* Destination operand type. */
46#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
47#define DstReg (2<<1) /* Register operand. */
48#define DstMem (3<<1) /* Memory operand. */
49#define DstMask (3<<1)
50/* Source operand type. */
51#define SrcNone (0<<3) /* No source operand. */
52#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */
53#define SrcReg (1<<3) /* Register operand. */
54#define SrcMem (2<<3) /* Memory operand. */
55#define SrcMem16 (3<<3) /* Memory operand (16-bit). */
56#define SrcMem32 (4<<3) /* Memory operand (32-bit). */
57#define SrcImm (5<<3) /* Immediate operand. */
58#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */
59#define SrcMask (7<<3)
60/* Generic ModRM decode. */
61#define ModRM (1<<6)
62/* Destination is only written; never read. */
63#define Mov (1<<7)
64#define BitOp (1<<8)
65
66static u8 opcode_table[256] = {
67 /* 0x00 - 0x07 */
68 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
69 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
70 0, 0, 0, 0,
71 /* 0x08 - 0x0F */
72 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
73 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
74 0, 0, 0, 0,
75 /* 0x10 - 0x17 */
76 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
77 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
78 0, 0, 0, 0,
79 /* 0x18 - 0x1F */
80 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
81 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
82 0, 0, 0, 0,
83 /* 0x20 - 0x27 */
84 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
85 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
86 SrcImmByte, SrcImm, 0, 0,
87 /* 0x28 - 0x2F */
88 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
89 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
90 0, 0, 0, 0,
91 /* 0x30 - 0x37 */
92 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
93 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
94 0, 0, 0, 0,
95 /* 0x38 - 0x3F */
96 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
97 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
98 0, 0, 0, 0,
99 /* 0x40 - 0x4F */
100 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
101 /* 0x50 - 0x57 */
102 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
103 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
104 /* 0x58 - 0x5F */
105 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
106 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
107 /* 0x60 - 0x67 */
108 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
109 0, 0, 0, 0,
110 /* 0x68 - 0x6F */
111 0, 0, ImplicitOps|Mov, 0,
112 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
113 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
114 /* 0x70 - 0x77 */
115 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
116 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
117 /* 0x78 - 0x7F */
118 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
119 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
120 /* 0x80 - 0x87 */
121 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
122 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
123 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
124 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
125 /* 0x88 - 0x8F */
126 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
127 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
128 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov,
129 /* 0x90 - 0x9F */
130 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps, ImplicitOps, 0, 0,
131 /* 0xA0 - 0xA7 */
132 ByteOp | DstReg | SrcMem | Mov, DstReg | SrcMem | Mov,
133 ByteOp | DstMem | SrcReg | Mov, DstMem | SrcReg | Mov,
134 ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
135 ByteOp | ImplicitOps, ImplicitOps,
136 /* 0xA8 - 0xAF */
137 0, 0, ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
138 ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
139 ByteOp | ImplicitOps, ImplicitOps,
140 /* 0xB0 - 0xBF */
141 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
142 /* 0xC0 - 0xC7 */
143 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
144 0, ImplicitOps, 0, 0,
145 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
146 /* 0xC8 - 0xCF */
147 0, 0, 0, 0, 0, 0, 0, 0,
148 /* 0xD0 - 0xD7 */
149 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
150 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
151 0, 0, 0, 0,
152 /* 0xD8 - 0xDF */
153 0, 0, 0, 0, 0, 0, 0, 0,
154 /* 0xE0 - 0xE7 */
155 0, 0, 0, 0, 0, 0, 0, 0,
156 /* 0xE8 - 0xEF */
157 ImplicitOps, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, 0, 0, 0, 0,
158 /* 0xF0 - 0xF7 */
159 0, 0, 0, 0,
160 ImplicitOps, 0,
161 ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
162 /* 0xF8 - 0xFF */
163 0, 0, 0, 0,
164 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
165};
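/*
 * Reading one entry of the table: opcode 0x89 (mov r/m, r) is declared as
 * DstMem | SrcReg | ModRM | Mov, which tells the decoder that a ModRM
 * byte follows, that the destination is the memory (or register) operand
 * it encodes, that the source is its reg field, and that the destination
 * is write-only (Mov), so its previous value is never fetched.  ByteOp on
 * the neighbouring 0x88 entry switches the same pattern to 8-bit operands.
 */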
166
167static u16 twobyte_table[256] = {
168 /* 0x00 - 0x0F */
169 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
170 ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
171 /* 0x10 - 0x1F */
172 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
173 /* 0x20 - 0x2F */
174 ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
175 0, 0, 0, 0, 0, 0, 0, 0,
176 /* 0x30 - 0x3F */
177 ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
178 /* 0x40 - 0x47 */
179 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
180 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
181 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
182 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
183 /* 0x48 - 0x4F */
184 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
185 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
186 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
187 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
188 /* 0x50 - 0x5F */
189 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
190 /* 0x60 - 0x6F */
191 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
192 /* 0x70 - 0x7F */
193 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
194 /* 0x80 - 0x8F */
195 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
196 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
197 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
198 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
199 /* 0x90 - 0x9F */
200 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201 /* 0xA0 - 0xA7 */
202 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
203 /* 0xA8 - 0xAF */
204 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
205 /* 0xB0 - 0xB7 */
206 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
207 DstMem | SrcReg | ModRM | BitOp,
208 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
209 DstReg | SrcMem16 | ModRM | Mov,
210 /* 0xB8 - 0xBF */
211 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp,
212 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
213 DstReg | SrcMem16 | ModRM | Mov,
214 /* 0xC0 - 0xCF */
215 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM,
216 0, 0, 0, 0, 0, 0, 0, 0,
217 /* 0xD0 - 0xDF */
218 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
219 /* 0xE0 - 0xEF */
220 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
221 /* 0xF0 - 0xFF */
222 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
223};
224
225/* Type, address-of, and value of an instruction's operand. */
226struct operand {
227 enum { OP_REG, OP_MEM, OP_IMM } type;
228 unsigned int bytes;
229 unsigned long val, orig_val, *ptr;
230};
231
232/* EFLAGS bit definitions. */
233#define EFLG_OF (1<<11)
234#define EFLG_DF (1<<10)
235#define EFLG_SF (1<<7)
236#define EFLG_ZF (1<<6)
237#define EFLG_AF (1<<4)
238#define EFLG_PF (1<<2)
239#define EFLG_CF (1<<0)
240
241/*
242 * Instruction emulation:
243 * Most instructions are emulated directly via a fragment of inline assembly
244 * code. This allows us to save/restore EFLAGS and thus very easily pick up
245 * any modified flags.
246 */
247
248#if defined(CONFIG_X86_64)
249#define _LO32 "k" /* force 32-bit operand */
250#define _STK "%%rsp" /* stack pointer */
251#elif defined(__i386__)
252#define _LO32 "" /* force 32-bit operand */
253#define _STK "%%esp" /* stack pointer */
254#endif
255
256/*
 257 * These EFLAGS bits are restored from the saved value during emulation, and
258 * any changes are written back to the saved value after emulation.
259 */
260#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
261
262/* Before executing instruction: restore necessary bits in EFLAGS. */
263#define _PRE_EFLAGS(_sav, _msk, _tmp) \
264 /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); */ \
265 "push %"_sav"; " \
266 "movl %"_msk",%"_LO32 _tmp"; " \
267 "andl %"_LO32 _tmp",("_STK"); " \
268 "pushf; " \
269 "notl %"_LO32 _tmp"; " \
270 "andl %"_LO32 _tmp",("_STK"); " \
271 "pop %"_tmp"; " \
272 "orl %"_LO32 _tmp",("_STK"); " \
273 "popf; " \
274 /* _sav &= ~msk; */ \
275 "movl %"_msk",%"_LO32 _tmp"; " \
276 "notl %"_LO32 _tmp"; " \
277 "andl %"_LO32 _tmp",%"_sav"; "
278
279/* After executing instruction: write-back necessary bits in EFLAGS. */
280#define _POST_EFLAGS(_sav, _msk, _tmp) \
281 /* _sav |= EFLAGS & _msk; */ \
282 "pushf; " \
283 "pop %"_tmp"; " \
284 "andl %"_msk",%"_LO32 _tmp"; " \
285 "orl %"_LO32 _tmp",%"_sav"; "
286
287/* Raw emulation: instruction has two explicit operands. */
288#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
289 do { \
290 unsigned long _tmp; \
291 \
292 switch ((_dst).bytes) { \
293 case 2: \
294 __asm__ __volatile__ ( \
295 _PRE_EFLAGS("0","4","2") \
296 _op"w %"_wx"3,%1; " \
297 _POST_EFLAGS("0","4","2") \
298 : "=m" (_eflags), "=m" ((_dst).val), \
299 "=&r" (_tmp) \
300 : _wy ((_src).val), "i" (EFLAGS_MASK) ); \
301 break; \
302 case 4: \
303 __asm__ __volatile__ ( \
304 _PRE_EFLAGS("0","4","2") \
305 _op"l %"_lx"3,%1; " \
306 _POST_EFLAGS("0","4","2") \
307 : "=m" (_eflags), "=m" ((_dst).val), \
308 "=&r" (_tmp) \
309 : _ly ((_src).val), "i" (EFLAGS_MASK) ); \
310 break; \
311 case 8: \
312 __emulate_2op_8byte(_op, _src, _dst, \
313 _eflags, _qx, _qy); \
314 break; \
315 } \
316 } while (0)
317
318#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
319 do { \
320 unsigned long _tmp; \
321 switch ( (_dst).bytes ) \
322 { \
323 case 1: \
324 __asm__ __volatile__ ( \
325 _PRE_EFLAGS("0","4","2") \
326 _op"b %"_bx"3,%1; " \
327 _POST_EFLAGS("0","4","2") \
328 : "=m" (_eflags), "=m" ((_dst).val), \
329 "=&r" (_tmp) \
330 : _by ((_src).val), "i" (EFLAGS_MASK) ); \
331 break; \
332 default: \
333 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
334 _wx, _wy, _lx, _ly, _qx, _qy); \
335 break; \
336 } \
337 } while (0)
338
339/* Source operand is byte-sized and may be restricted to just %cl. */
340#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \
341 __emulate_2op(_op, _src, _dst, _eflags, \
342 "b", "c", "b", "c", "b", "c", "b", "c")
343
344/* Source operand is byte, word, long or quad sized. */
345#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \
346 __emulate_2op(_op, _src, _dst, _eflags, \
347 "b", "q", "w", "r", _LO32, "r", "", "r")
348
349/* Source operand is word, long or quad sized. */
350#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \
351 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
352 "w", "r", _LO32, "r", "", "r")
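/*
 * These wrappers are what the opcode dispatch in x86_emulate_memop() is
 * built on; the call sites fall outside the lines shown here, but a
 * typical ADD with register or memory operands would be handled along the
 * lines of
 *
 *	emulate_2op_SrcV("add", src, dst, _eflags);
 *
 * which executes a real add of the right operand width on dst.val and
 * src.val and folds the resulting arithmetic flags back into _eflags via
 * the _PRE_EFLAGS/_POST_EFLAGS sequences above.
 */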
353
354/* Instruction has only one explicit operand (no source operand). */
355#define emulate_1op(_op, _dst, _eflags) \
356 do { \
357 unsigned long _tmp; \
358 \
359 switch ( (_dst).bytes ) \
360 { \
361 case 1: \
362 __asm__ __volatile__ ( \
363 _PRE_EFLAGS("0","3","2") \
364 _op"b %1; " \
365 _POST_EFLAGS("0","3","2") \
366 : "=m" (_eflags), "=m" ((_dst).val), \
367 "=&r" (_tmp) \
368 : "i" (EFLAGS_MASK) ); \
369 break; \
370 case 2: \
371 __asm__ __volatile__ ( \
372 _PRE_EFLAGS("0","3","2") \
373 _op"w %1; " \
374 _POST_EFLAGS("0","3","2") \
375 : "=m" (_eflags), "=m" ((_dst).val), \
376 "=&r" (_tmp) \
377 : "i" (EFLAGS_MASK) ); \
378 break; \
379 case 4: \
380 __asm__ __volatile__ ( \
381 _PRE_EFLAGS("0","3","2") \
382 _op"l %1; " \
383 _POST_EFLAGS("0","3","2") \
384 : "=m" (_eflags), "=m" ((_dst).val), \
385 "=&r" (_tmp) \
386 : "i" (EFLAGS_MASK) ); \
387 break; \
388 case 8: \
389 __emulate_1op_8byte(_op, _dst, _eflags); \
390 break; \
391 } \
392 } while (0)
393
394/* Emulate an instruction with quadword operands (x86/64 only). */
395#if defined(CONFIG_X86_64)
396#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \
397 do { \
398 __asm__ __volatile__ ( \
399 _PRE_EFLAGS("0","4","2") \
400 _op"q %"_qx"3,%1; " \
401 _POST_EFLAGS("0","4","2") \
402 : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
403 : _qy ((_src).val), "i" (EFLAGS_MASK) ); \
404 } while (0)
405
406#define __emulate_1op_8byte(_op, _dst, _eflags) \
407 do { \
408 __asm__ __volatile__ ( \
409 _PRE_EFLAGS("0","3","2") \
410 _op"q %1; " \
411 _POST_EFLAGS("0","3","2") \
412 : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
413 : "i" (EFLAGS_MASK) ); \
414 } while (0)
415
416#elif defined(__i386__)
417#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)
418#define __emulate_1op_8byte(_op, _dst, _eflags)
419#endif /* __i386__ */
420
421/* Fetch next part of the instruction being emulated. */
422#define insn_fetch(_type, _size, _eip) \
423({ unsigned long _x; \
424 rc = ops->read_std((unsigned long)(_eip) + ctxt->cs_base, &_x, \
425 (_size), ctxt->vcpu); \
426 if ( rc != 0 ) \
427 goto done; \
428 (_eip) += (_size); \
429 (_type)_x; \
430})
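/*
 * insn_fetch() leans on local state of the function it is expanded in: it
 * expects rc, ops and ctxt to be in scope and a done: label to jump to
 * when read_std() fails.  The prefix-decoding loop below uses it as
 *
 *	b = insn_fetch(u8, 1, _eip);
 *
 * and, by the same pattern, a 32-bit immediate would be pulled in with
 * something like insn_fetch(s32, 4, _eip) (illustrative; that call sits
 * outside the lines shown here).
 */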
431
432/* Access/update address held in a register, based on addressing mode. */
433#define address_mask(reg) \
434 ((ad_bytes == sizeof(unsigned long)) ? \
435 (reg) : ((reg) & ((1UL << (ad_bytes << 3)) - 1)))
436#define register_address(base, reg) \
437 ((base) + address_mask(reg))
438#define register_address_increment(reg, inc) \
439 do { \
440 /* signed type ensures sign extension to long */ \
441 int _inc = (inc); \
442 if ( ad_bytes == sizeof(unsigned long) ) \
443 (reg) += _inc; \
444 else \
445 (reg) = ((reg) & ~((1UL << (ad_bytes << 3)) - 1)) | \
446 (((reg) + _inc) & ((1UL << (ad_bytes << 3)) - 1)); \
447 } while (0)
448
449#define JMP_REL(rel) \
450 do { \
451 register_address_increment(_eip, rel); \
452 } while (0)
453
454/*
455 * Given the 'reg' portion of a ModRM byte, and a register block, return a
456 * pointer into the block that addresses the relevant register.
457 * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
458 */
459static void *decode_register(u8 modrm_reg, unsigned long *regs,
460 int highbyte_regs)
461{
462 void *p;
463
464 p = &regs[modrm_reg];
465 if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
466 p = (unsigned char *)&regs[modrm_reg & 3] + 1;
467 return p;
468}
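
A userspace sketch of the decode_register() rule above: with highbyte_regs set, encodings 4-7 select AH, CH, DH, BH, i.e. byte 1 of the low four register slots (the byte-pointer trick assumes a little-endian host, which x86 is). The regs[] array contents and the 0x1122 value are made up for the example.

#include <stdio.h>

static void *decode_reg(unsigned char modrm_reg, unsigned long *regs,
			int highbyte_regs)
{
	void *p = &regs[modrm_reg];

	if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
		p = (unsigned char *)&regs[modrm_reg & 3] + 1;
	return p;
}

int main(void)
{
	unsigned long regs[16] = { 0x1122 };	/* slot 0 plays the RAX role */

	/* encoding 4 with highbyte_regs -> AH, bits 15:8 of slot 0 */
	printf("AH = %#x\n", *(unsigned char *)decode_reg(4, regs, 1));
	/* encoding 4 without highbyte_regs -> slot 4 (the SP role) */
	printf("slot 4 = %#lx\n", *(unsigned long *)decode_reg(4, regs, 0));
	return 0;
}
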
469
470static int read_descriptor(struct x86_emulate_ctxt *ctxt,
471 struct x86_emulate_ops *ops,
472 void *ptr,
473 u16 *size, unsigned long *address, int op_bytes)
474{
475 int rc;
476
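	/*
	 * With a 16-bit operand size, lgdt/lidt load only a 24-bit base
	 * (the high byte of the base is ignored), hence the bump from two
	 * to three base bytes below.
	 */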
477 if (op_bytes == 2)
478 op_bytes = 3;
479 *address = 0;
480 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
481 ctxt->vcpu);
482 if (rc)
483 return rc;
484 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
485 ctxt->vcpu);
486 return rc;
487}
488
489static int test_cc(unsigned int condition, unsigned int flags)
490{
491 int rc = 0;
492
493 switch ((condition & 15) >> 1) {
494 case 0: /* o */
495 rc |= (flags & EFLG_OF);
496 break;
497 case 1: /* b/c/nae */
498 rc |= (flags & EFLG_CF);
499 break;
500 case 2: /* z/e */
501 rc |= (flags & EFLG_ZF);
502 break;
503 case 3: /* be/na */
504 rc |= (flags & (EFLG_CF|EFLG_ZF));
505 break;
506 case 4: /* s */
507 rc |= (flags & EFLG_SF);
508 break;
509 case 5: /* p/pe */
510 rc |= (flags & EFLG_PF);
511 break;
512 case 7: /* le/ng */
513 rc |= (flags & EFLG_ZF);
514 /* fall through */
515 case 6: /* l/nge */
516 rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
517 break;
518 }
519
520 /* Odd condition identifiers (lsb == 1) have inverted sense. */
521 return (!!rc ^ (condition & 1));
522}
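
A reduced, self-contained sketch of how test_cc() above decides conditional jumps. Only the ZF case is kept; EFLG_ZF is written out here as the architectural EFLAGS bit (bit 6), whereas the emulator takes it from its own EFLG_* defines. Opcode 0x75 (jnz) has condition nibble 5: case 2 selects ZF and the set low bit inverts the result, so the jump is taken when ZF is clear.

#include <stdio.h>

#define EFLG_ZF (1 << 6)	/* architectural ZF bit */

static int cc(unsigned int condition, unsigned int flags)
{
	int rc = 0;

	switch ((condition & 15) >> 1) {
	case 2:			/* z/e */
		rc |= flags & EFLG_ZF;
		break;
	/* other cases elided; see test_cc() above */
	}
	/* odd condition identifiers have inverted sense */
	return !!rc ^ (condition & 1);
}

int main(void)
{
	printf("jnz taken, ZF clear: %d\n", cc(0x75, 0));	/* 1 */
	printf("jnz taken, ZF set:   %d\n", cc(0x75, EFLG_ZF));	/* 0 */
	return 0;
}
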
523
524int
525x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
526{
527 unsigned d;
528 u8 b, sib, twobyte = 0, rex_prefix = 0;
529 u8 modrm, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0;
530 unsigned long *override_base = NULL;
531 unsigned int op_bytes, ad_bytes, lock_prefix = 0, rep_prefix = 0, i;
532 int rc = 0;
533 struct operand src, dst;
534 unsigned long cr2 = ctxt->cr2;
535 int mode = ctxt->mode;
536 unsigned long modrm_ea;
537 int use_modrm_ea, index_reg = 0, base_reg = 0, scale, rip_relative = 0;
538 int no_wb = 0;
539 u64 msr_data;
540
541 /* Shadow copy of register state. Committed on successful emulation. */
542 unsigned long _regs[NR_VCPU_REGS];
543 unsigned long _eip = ctxt->vcpu->rip, _eflags = ctxt->eflags;
544 unsigned long modrm_val = 0;
545
546 memcpy(_regs, ctxt->vcpu->regs, sizeof _regs);
547
548 switch (mode) {
549 case X86EMUL_MODE_REAL:
550 case X86EMUL_MODE_PROT16:
551 op_bytes = ad_bytes = 2;
552 break;
553 case X86EMUL_MODE_PROT32:
554 op_bytes = ad_bytes = 4;
555 break;
556#ifdef CONFIG_X86_64
557 case X86EMUL_MODE_PROT64:
558 op_bytes = 4;
559 ad_bytes = 8;
560 break;
561#endif
562 default:
563 return -1;
564 }
565
566 /* Legacy prefixes. */
567 for (i = 0; i < 8; i++) {
568 switch (b = insn_fetch(u8, 1, _eip)) {
569 case 0x66: /* operand-size override */
570 op_bytes ^= 6; /* switch between 2/4 bytes */
571 break;
572 case 0x67: /* address-size override */
573 if (mode == X86EMUL_MODE_PROT64)
574 ad_bytes ^= 12; /* switch between 4/8 bytes */
575 else
576 ad_bytes ^= 6; /* switch between 2/4 bytes */
577 break;
578 case 0x2e: /* CS override */
579 override_base = &ctxt->cs_base;
580 break;
581 case 0x3e: /* DS override */
582 override_base = &ctxt->ds_base;
583 break;
584 case 0x26: /* ES override */
585 override_base = &ctxt->es_base;
586 break;
587 case 0x64: /* FS override */
588 override_base = &ctxt->fs_base;
589 break;
590 case 0x65: /* GS override */
591 override_base = &ctxt->gs_base;
592 break;
593 case 0x36: /* SS override */
594 override_base = &ctxt->ss_base;
595 break;
596 case 0xf0: /* LOCK */
597 lock_prefix = 1;
598 break;
599 case 0xf2: /* REPNE/REPNZ */
600 case 0xf3: /* REP/REPE/REPZ */
601 rep_prefix = 1;
602 break;
603 default:
604 goto done_prefixes;
605 }
606 }
607
608done_prefixes:
609
610 /* REX prefix. */
611 if ((mode == X86EMUL_MODE_PROT64) && ((b & 0xf0) == 0x40)) {
612 rex_prefix = b;
613 if (b & 8)
614 op_bytes = 8; /* REX.W */
615 modrm_reg = (b & 4) << 1; /* REX.R */
616 index_reg = (b & 2) << 2; /* REX.X */
617 modrm_rm = base_reg = (b & 1) << 3; /* REX.B */
618 b = insn_fetch(u8, 1, _eip);
619 }
620
621 /* Opcode byte(s). */
622 d = opcode_table[b];
623 if (d == 0) {
624 /* Two-byte opcode? */
625 if (b == 0x0f) {
626 twobyte = 1;
627 b = insn_fetch(u8, 1, _eip);
628 d = twobyte_table[b];
629 }
630
631 /* Unrecognised? */
632 if (d == 0)
633 goto cannot_emulate;
634 }
635
636 /* ModRM and SIB bytes. */
637 if (d & ModRM) {
638 modrm = insn_fetch(u8, 1, _eip);
639 modrm_mod |= (modrm & 0xc0) >> 6;
640 modrm_reg |= (modrm & 0x38) >> 3;
641 modrm_rm |= (modrm & 0x07);
642 modrm_ea = 0;
643 use_modrm_ea = 1;
644
645 if (modrm_mod == 3) {
646 modrm_val = *(unsigned long *)
647 decode_register(modrm_rm, _regs, d & ByteOp);
648 goto modrm_done;
649 }
650
651 if (ad_bytes == 2) {
652 unsigned bx = _regs[VCPU_REGS_RBX];
653 unsigned bp = _regs[VCPU_REGS_RBP];
654 unsigned si = _regs[VCPU_REGS_RSI];
655 unsigned di = _regs[VCPU_REGS_RDI];
656
657 /* 16-bit ModR/M decode. */
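	/*
	 * The rm field selects the classic 8086 base+index pairs handled
	 * below: BX+SI, BX+DI, BP+SI, BP+DI, SI, DI, BP, BX.  rm == 6 with
	 * mod == 0 is the exception: a bare 16-bit displacement, no base.
	 */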
658 switch (modrm_mod) {
659 case 0:
660 if (modrm_rm == 6)
661 modrm_ea += insn_fetch(u16, 2, _eip);
662 break;
663 case 1:
664 modrm_ea += insn_fetch(s8, 1, _eip);
665 break;
666 case 2:
667 modrm_ea += insn_fetch(u16, 2, _eip);
668 break;
669 }
670 switch (modrm_rm) {
671 case 0:
672 modrm_ea += bx + si;
673 break;
674 case 1:
675 modrm_ea += bx + di;
676 break;
677 case 2:
678 modrm_ea += bp + si;
679 break;
680 case 3:
681 modrm_ea += bp + di;
682 break;
683 case 4:
684 modrm_ea += si;
685 break;
686 case 5:
687 modrm_ea += di;
688 break;
689 case 6:
690 if (modrm_mod != 0)
691 modrm_ea += bp;
692 break;
693 case 7:
694 modrm_ea += bx;
695 break;
696 }
697 if (modrm_rm == 2 || modrm_rm == 3 ||
698 (modrm_rm == 6 && modrm_mod != 0))
699 if (!override_base)
700 override_base = &ctxt->ss_base;
701 modrm_ea = (u16)modrm_ea;
702 } else {
703 /* 32/64-bit ModR/M decode. */
704 switch (modrm_rm) {
705 case 4:
706 case 12:
707 sib = insn_fetch(u8, 1, _eip);
708 index_reg |= (sib >> 3) & 7;
709 base_reg |= sib & 7;
710 scale = sib >> 6;
711
712 switch (base_reg) {
713 case 5:
714 if (modrm_mod != 0)
715 modrm_ea += _regs[base_reg];
716 else
717 modrm_ea += insn_fetch(s32, 4, _eip);
718 break;
719 default:
720 modrm_ea += _regs[base_reg];
721 }
722 switch (index_reg) {
723 case 4:
724 break;
725 default:
726 modrm_ea += _regs[index_reg] << scale;
727
728 }
729 break;
730 case 5:
731 if (modrm_mod != 0)
732 modrm_ea += _regs[modrm_rm];
733 else if (mode == X86EMUL_MODE_PROT64)
734 rip_relative = 1;
735 break;
736 default:
737 modrm_ea += _regs[modrm_rm];
738 break;
739 }
740 switch (modrm_mod) {
741 case 0:
742 if (modrm_rm == 5)
743 modrm_ea += insn_fetch(s32, 4, _eip);
744 break;
745 case 1:
746 modrm_ea += insn_fetch(s8, 1, _eip);
747 break;
748 case 2:
749 modrm_ea += insn_fetch(s32, 4, _eip);
750 break;
751 }
752 }
753 if (!override_base)
754 override_base = &ctxt->ds_base;
755 if (mode == X86EMUL_MODE_PROT64 &&
756 override_base != &ctxt->fs_base &&
757 override_base != &ctxt->gs_base)
758 override_base = NULL;
759
760 if (override_base)
761 modrm_ea += *override_base;
762
763 if (rip_relative) {
764 modrm_ea += _eip;
765 switch (d & SrcMask) {
766 case SrcImmByte:
767 modrm_ea += 1;
768 break;
769 case SrcImm:
770 if (d & ByteOp)
771 modrm_ea += 1;
772 else
773 if (op_bytes == 8)
774 modrm_ea += 4;
775 else
776 modrm_ea += op_bytes;
777 }
778 }
779 if (ad_bytes != 8)
780 modrm_ea = (u32)modrm_ea;
781 cr2 = modrm_ea;
782 modrm_done:
783 ;
784 }
785
786 /*
787 * Decode and fetch the source operand: register, memory
788 * or immediate.
789 */
790 switch (d & SrcMask) {
791 case SrcNone:
792 break;
793 case SrcReg:
794 src.type = OP_REG;
795 if (d & ByteOp) {
796 src.ptr = decode_register(modrm_reg, _regs,
797 (rex_prefix == 0));
798 src.val = src.orig_val = *(u8 *) src.ptr;
799 src.bytes = 1;
800 } else {
801 src.ptr = decode_register(modrm_reg, _regs, 0);
802 switch ((src.bytes = op_bytes)) {
803 case 2:
804 src.val = src.orig_val = *(u16 *) src.ptr;
805 break;
806 case 4:
807 src.val = src.orig_val = *(u32 *) src.ptr;
808 break;
809 case 8:
810 src.val = src.orig_val = *(u64 *) src.ptr;
811 break;
812 }
813 }
814 break;
815 case SrcMem16:
816 src.bytes = 2;
817 goto srcmem_common;
818 case SrcMem32:
819 src.bytes = 4;
820 goto srcmem_common;
821 case SrcMem:
822 src.bytes = (d & ByteOp) ? 1 : op_bytes;
823 /* Don't fetch the address for invlpg: it could be unmapped. */
824 if (twobyte && b == 0x01 && modrm_reg == 7)
825 break;
826 srcmem_common:
827 /*
828 * For instructions with a ModR/M byte, switch to register
829 * access if Mod = 3.
830 */
831 if ((d & ModRM) && modrm_mod == 3) {
832 src.type = OP_REG;
833 break;
834 }
835 src.type = OP_MEM;
836 src.ptr = (unsigned long *)cr2;
837 src.val = 0;
838 if ((rc = ops->read_emulated((unsigned long)src.ptr,
839 &src.val, src.bytes, ctxt->vcpu)) != 0)
840 goto done;
841 src.orig_val = src.val;
842 break;
843 case SrcImm:
844 src.type = OP_IMM;
845 src.ptr = (unsigned long *)_eip;
846 src.bytes = (d & ByteOp) ? 1 : op_bytes;
847 if (src.bytes == 8)
848 src.bytes = 4;
849 /* NB. Immediates are sign-extended as necessary. */
850 switch (src.bytes) {
851 case 1:
852 src.val = insn_fetch(s8, 1, _eip);
853 break;
854 case 2:
855 src.val = insn_fetch(s16, 2, _eip);
856 break;
857 case 4:
858 src.val = insn_fetch(s32, 4, _eip);
859 break;
860 }
861 break;
862 case SrcImmByte:
863 src.type = OP_IMM;
864 src.ptr = (unsigned long *)_eip;
865 src.bytes = 1;
866 src.val = insn_fetch(s8, 1, _eip);
867 break;
868 }
869
870 /* Decode and fetch the destination operand: register or memory. */
871 switch (d & DstMask) {
872 case ImplicitOps:
873 /* Special instructions do their own operand decoding. */
874 goto special_insn;
875 case DstReg:
876 dst.type = OP_REG;
877 if ((d & ByteOp)
878 && !(twobyte && (b == 0xb6 || b == 0xb7))) {
879 dst.ptr = decode_register(modrm_reg, _regs,
880 (rex_prefix == 0));
881 dst.val = *(u8 *) dst.ptr;
882 dst.bytes = 1;
883 } else {
884 dst.ptr = decode_register(modrm_reg, _regs, 0);
885 switch ((dst.bytes = op_bytes)) {
886 case 2:
887 dst.val = *(u16 *)dst.ptr;
888 break;
889 case 4:
890 dst.val = *(u32 *)dst.ptr;
891 break;
892 case 8:
893 dst.val = *(u64 *)dst.ptr;
894 break;
895 }
896 }
897 break;
898 case DstMem:
899 dst.type = OP_MEM;
900 dst.ptr = (unsigned long *)cr2;
901 dst.bytes = (d & ByteOp) ? 1 : op_bytes;
902 dst.val = 0;
903 /*
904 * For instructions with a ModR/M byte, switch to register
905 * access if Mod = 3.
906 */
907 if ((d & ModRM) && modrm_mod == 3) {
908 dst.type = OP_REG;
909 break;
910 }
911 if (d & BitOp) {
912 unsigned long mask = ~(dst.bytes * 8 - 1);
913
914 dst.ptr = (void *)dst.ptr + (src.val & mask) / 8;
915 }
916 if (!(d & Mov) && /* optimisation - avoid slow emulated read */
917 ((rc = ops->read_emulated((unsigned long)dst.ptr,
918 &dst.val, dst.bytes, ctxt->vcpu)) != 0))
919 goto done;
920 break;
921 }
922 dst.orig_val = dst.val;
923
924 if (twobyte)
925 goto twobyte_insn;
926
927 switch (b) {
928 case 0x00 ... 0x05:
929 add: /* add */
930 emulate_2op_SrcV("add", src, dst, _eflags);
931 break;
932 case 0x08 ... 0x0d:
933 or: /* or */
934 emulate_2op_SrcV("or", src, dst, _eflags);
935 break;
936 case 0x10 ... 0x15:
937 adc: /* adc */
938 emulate_2op_SrcV("adc", src, dst, _eflags);
939 break;
940 case 0x18 ... 0x1d:
941 sbb: /* sbb */
942 emulate_2op_SrcV("sbb", src, dst, _eflags);
943 break;
944 case 0x20 ... 0x23:
945 and: /* and */
946 emulate_2op_SrcV("and", src, dst, _eflags);
947 break;
948 case 0x24: /* and al imm8 */
949 dst.type = OP_REG;
950 dst.ptr = &_regs[VCPU_REGS_RAX];
951 dst.val = *(u8 *)dst.ptr;
952 dst.bytes = 1;
953 dst.orig_val = dst.val;
954 goto and;
955 case 0x25: /* and ax imm16, or eax imm32 */
956 dst.type = OP_REG;
957 dst.bytes = op_bytes;
958 dst.ptr = &_regs[VCPU_REGS_RAX];
959 if (op_bytes == 2)
960 dst.val = *(u16 *)dst.ptr;
961 else
962 dst.val = *(u32 *)dst.ptr;
963 dst.orig_val = dst.val;
964 goto and;
965 case 0x28 ... 0x2d:
966 sub: /* sub */
967 emulate_2op_SrcV("sub", src, dst, _eflags);
968 break;
969 case 0x30 ... 0x35:
970 xor: /* xor */
971 emulate_2op_SrcV("xor", src, dst, _eflags);
972 break;
973 case 0x38 ... 0x3d:
974 cmp: /* cmp */
975 emulate_2op_SrcV("cmp", src, dst, _eflags);
976 break;
977 case 0x63: /* movsxd */
978 if (mode != X86EMUL_MODE_PROT64)
979 goto cannot_emulate;
980 dst.val = (s32) src.val;
981 break;
982 case 0x80 ... 0x83: /* Grp1 */
983 switch (modrm_reg) {
984 case 0:
985 goto add;
986 case 1:
987 goto or;
988 case 2:
989 goto adc;
990 case 3:
991 goto sbb;
992 case 4:
993 goto and;
994 case 5:
995 goto sub;
996 case 6:
997 goto xor;
998 case 7:
999 goto cmp;
1000 }
1001 break;
1002 case 0x84 ... 0x85:
1003 test: /* test */
1004 emulate_2op_SrcV("test", src, dst, _eflags);
1005 break;
1006 case 0x86 ... 0x87: /* xchg */
1007 /* Write back the register source. */
1008 switch (dst.bytes) {
1009 case 1:
1010 *(u8 *) src.ptr = (u8) dst.val;
1011 break;
1012 case 2:
1013 *(u16 *) src.ptr = (u16) dst.val;
1014 break;
1015 case 4:
1016 *src.ptr = (u32) dst.val;
1017 break; /* 64b reg: zero-extend */
1018 case 8:
1019 *src.ptr = dst.val;
1020 break;
1021 }
1022 /*
1023 * Write back the memory destination with implicit LOCK
1024 * prefix.
1025 */
1026 dst.val = src.val;
1027 lock_prefix = 1;
1028 break;
1029 case 0x88 ... 0x8b: /* mov */
1030 goto mov;
1031 case 0x8d: /* lea r16/r32, m */
1032 dst.val = modrm_val;
1033 break;
1034 case 0x8f: /* pop (sole member of Grp1a) */
1035 /* 64-bit mode: POP always pops a 64-bit operand. */
1036 if (mode == X86EMUL_MODE_PROT64)
1037 dst.bytes = 8;
1038 if ((rc = ops->read_std(register_address(ctxt->ss_base,
1039 _regs[VCPU_REGS_RSP]),
1040 &dst.val, dst.bytes, ctxt->vcpu)) != 0)
1041 goto done;
1042 register_address_increment(_regs[VCPU_REGS_RSP], dst.bytes);
1043 break;
1044 case 0xa0 ... 0xa1: /* mov */
1045 dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
1046 dst.val = src.val;
1047 _eip += ad_bytes; /* skip src displacement */
1048 break;
1049 case 0xa2 ... 0xa3: /* mov */
1050 dst.val = (unsigned long)_regs[VCPU_REGS_RAX];
1051 _eip += ad_bytes; /* skip dst displacement */
1052 break;
1053 case 0xc0 ... 0xc1:
1054 grp2: /* Grp2 */
1055 switch (modrm_reg) {
1056 case 0: /* rol */
1057 emulate_2op_SrcB("rol", src, dst, _eflags);
1058 break;
1059 case 1: /* ror */
1060 emulate_2op_SrcB("ror", src, dst, _eflags);
1061 break;
1062 case 2: /* rcl */
1063 emulate_2op_SrcB("rcl", src, dst, _eflags);
1064 break;
1065 case 3: /* rcr */
1066 emulate_2op_SrcB("rcr", src, dst, _eflags);
1067 break;
1068 case 4: /* sal/shl */
1069 case 6: /* sal/shl */
1070 emulate_2op_SrcB("sal", src, dst, _eflags);
1071 break;
1072 case 5: /* shr */
1073 emulate_2op_SrcB("shr", src, dst, _eflags);
1074 break;
1075 case 7: /* sar */
1076 emulate_2op_SrcB("sar", src, dst, _eflags);
1077 break;
1078 }
1079 break;
1080 case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
1081 mov:
1082 dst.val = src.val;
1083 break;
1084 case 0xd0 ... 0xd1: /* Grp2 */
1085 src.val = 1;
1086 goto grp2;
1087 case 0xd2 ... 0xd3: /* Grp2 */
1088 src.val = _regs[VCPU_REGS_RCX];
1089 goto grp2;
1090 case 0xf6 ... 0xf7: /* Grp3 */
1091 switch (modrm_reg) {
1092 case 0 ... 1: /* test */
1093 /*
1094 * Special case in Grp3: test has an immediate
1095 * source operand.
1096 */
1097 src.type = OP_IMM;
1098 src.ptr = (unsigned long *)_eip;
1099 src.bytes = (d & ByteOp) ? 1 : op_bytes;
1100 if (src.bytes == 8)
1101 src.bytes = 4;
1102 switch (src.bytes) {
1103 case 1:
1104 src.val = insn_fetch(s8, 1, _eip);
1105 break;
1106 case 2:
1107 src.val = insn_fetch(s16, 2, _eip);
1108 break;
1109 case 4:
1110 src.val = insn_fetch(s32, 4, _eip);
1111 break;
1112 }
1113 goto test;
1114 case 2: /* not */
1115 dst.val = ~dst.val;
1116 break;
1117 case 3: /* neg */
1118 emulate_1op("neg", dst, _eflags);
1119 break;
1120 default:
1121 goto cannot_emulate;
1122 }
1123 break;
1124 case 0xfe ... 0xff: /* Grp4/Grp5 */
1125 switch (modrm_reg) {
1126 case 0: /* inc */
1127 emulate_1op("inc", dst, _eflags);
1128 break;
1129 case 1: /* dec */
1130 emulate_1op("dec", dst, _eflags);
1131 break;
1132 case 4: /* jmp abs */
1133 if (b == 0xff)
1134 _eip = dst.val;
1135 else
1136 goto cannot_emulate;
1137 break;
1138 case 6: /* push */
1139 /* 64-bit mode: PUSH always pushes a 64-bit operand. */
1140 if (mode == X86EMUL_MODE_PROT64) {
1141 dst.bytes = 8;
1142 if ((rc = ops->read_std((unsigned long)dst.ptr,
1143 &dst.val, 8,
1144 ctxt->vcpu)) != 0)
1145 goto done;
1146 }
1147 register_address_increment(_regs[VCPU_REGS_RSP],
1148 -dst.bytes);
1149 if ((rc = ops->write_emulated(
1150 register_address(ctxt->ss_base,
1151 _regs[VCPU_REGS_RSP]),
1152 &dst.val, dst.bytes, ctxt->vcpu)) != 0)
1153 goto done;
1154 no_wb = 1;
1155 break;
1156 default:
1157 goto cannot_emulate;
1158 }
1159 break;
1160 }
1161
1162writeback:
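	/*
	 * Commit the decoded destination: register results go through
	 * dst.ptr into the shadow register copy, memory results through
	 * ->write_emulated(), and LOCKed read-modify-write results are
	 * handed to ->cmpxchg_emulated() with dst.orig_val as the expected
	 * old value.
	 */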
1163 if (!no_wb) {
1164 switch (dst.type) {
1165 case OP_REG:
1166 /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
1167 switch (dst.bytes) {
1168 case 1:
1169 *(u8 *)dst.ptr = (u8)dst.val;
1170 break;
1171 case 2:
1172 *(u16 *)dst.ptr = (u16)dst.val;
1173 break;
1174 case 4:
1175 *dst.ptr = (u32)dst.val;
1176 break; /* 64b: zero-ext */
1177 case 8:
1178 *dst.ptr = dst.val;
1179 break;
1180 }
1181 break;
1182 case OP_MEM:
1183 if (lock_prefix)
1184 rc = ops->cmpxchg_emulated((unsigned long)dst.ptr,
1185 &dst.orig_val,
1186 &dst.val, dst.bytes,
1187 ctxt->vcpu);
1188 else
1189 rc = ops->write_emulated((unsigned long)dst.ptr,
1190 &dst.val, dst.bytes,
1191 ctxt->vcpu);
1192 if (rc != 0)
1193 goto done;
1194 default:
1195 break;
1196 }
1197 }
1198
1199 /* Commit shadow register state. */
1200 memcpy(ctxt->vcpu->regs, _regs, sizeof _regs);
1201 ctxt->eflags = _eflags;
1202 ctxt->vcpu->rip = _eip;
1203
1204done:
1205 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
1206
1207special_insn:
1208 if (twobyte)
1209 goto twobyte_special_insn;
1210 switch(b) {
1211 case 0x50 ... 0x57: /* push reg */
1212 if (op_bytes == 2)
1213 src.val = (u16) _regs[b & 0x7];
1214 else
1215 src.val = (u32) _regs[b & 0x7];
1216 dst.type = OP_MEM;
1217 dst.bytes = op_bytes;
1218 dst.val = src.val;
1219 register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes);
1220 dst.ptr = (void *) register_address(
1221 ctxt->ss_base, _regs[VCPU_REGS_RSP]);
1222 break;
1223 case 0x58 ... 0x5f: /* pop reg */
1224 dst.ptr = (unsigned long *)&_regs[b & 0x7];
1225 pop_instruction:
1226 if ((rc = ops->read_std(register_address(ctxt->ss_base,
1227 _regs[VCPU_REGS_RSP]), dst.ptr, op_bytes, ctxt->vcpu))
1228 != 0)
1229 goto done;
1230
1231 register_address_increment(_regs[VCPU_REGS_RSP], op_bytes);
1232 no_wb = 1; /* Disable writeback. */
1233 break;
1234 case 0x6a: /* push imm8 */
1235 src.val = 0L;
1236 src.val = insn_fetch(s8, 1, _eip);
1237 push:
1238 dst.type = OP_MEM;
1239 dst.bytes = op_bytes;
1240 dst.val = src.val;
1241 register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes);
1242 dst.ptr = (void *) register_address(ctxt->ss_base,
1243 _regs[VCPU_REGS_RSP]);
1244 break;
1245 case 0x6c: /* insb */
1246 case 0x6d: /* insw/insd */
1247 if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
1248 1, /* in */
1249 (d & ByteOp) ? 1 : op_bytes, /* size */
1250 rep_prefix ?
1251 address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */
1252 (_eflags & EFLG_DF), /* down */
1253 register_address(ctxt->es_base,
1254 _regs[VCPU_REGS_RDI]), /* address */
1255 rep_prefix,
1256 _regs[VCPU_REGS_RDX] /* port */
1257 ) == 0)
1258 return -1;
1259 return 0;
1260 case 0x6e: /* outsb */
1261 case 0x6f: /* outsw/outsd */
1262 if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
1263 0, /* in */
1264 (d & ByteOp) ? 1 : op_bytes, /* size */
1265 rep_prefix ?
1266 address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */
1267 (_eflags & EFLG_DF), /* down */
1268 register_address(override_base ?
1269 *override_base : ctxt->ds_base,
1270 _regs[VCPU_REGS_RSI]), /* address */
1271 rep_prefix,
1272 _regs[VCPU_REGS_RDX] /* port */
1273 ) == 0)
1274 return -1;
1275 return 0;
1276 case 0x70 ... 0x7f: /* jcc (short) */ {
1277 int rel = insn_fetch(s8, 1, _eip);
1278
1279 if (test_cc(b, _eflags))
1280 JMP_REL(rel);
1281 break;
1282 }
1283 case 0x9c: /* pushf */
1284 src.val = (unsigned long) _eflags;
1285 goto push;
1286 case 0x9d: /* popf */
1287 dst.ptr = (unsigned long *) &_eflags;
1288 goto pop_instruction;
1289 case 0xc3: /* ret */
1290 dst.ptr = &_eip;
1291 goto pop_instruction;
1292 case 0xf4: /* hlt */
1293 ctxt->vcpu->halt_request = 1;
1294 goto done;
1295 }
1296 if (rep_prefix) {
1297 if (_regs[VCPU_REGS_RCX] == 0) {
1298 ctxt->vcpu->rip = _eip;
1299 goto done;
1300 }
1301 _regs[VCPU_REGS_RCX]--;
1302 _eip = ctxt->vcpu->rip;
1303 }
1304 switch (b) {
1305 case 0xa4 ... 0xa5: /* movs */
1306 dst.type = OP_MEM;
1307 dst.bytes = (d & ByteOp) ? 1 : op_bytes;
1308 dst.ptr = (unsigned long *)register_address(ctxt->es_base,
1309 _regs[VCPU_REGS_RDI]);
1310 if ((rc = ops->read_emulated(register_address(
1311 override_base ? *override_base : ctxt->ds_base,
1312 _regs[VCPU_REGS_RSI]), &dst.val, dst.bytes, ctxt->vcpu)) != 0)
1313 goto done;
1314 register_address_increment(_regs[VCPU_REGS_RSI],
1315 (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
1316 register_address_increment(_regs[VCPU_REGS_RDI],
1317 (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
1318 break;
1319 case 0xa6 ... 0xa7: /* cmps */
1320 DPRINTF("Urk! I don't handle CMPS.\n");
1321 goto cannot_emulate;
1322 case 0xaa ... 0xab: /* stos */
1323 dst.type = OP_MEM;
1324 dst.bytes = (d & ByteOp) ? 1 : op_bytes;
1325 dst.ptr = (unsigned long *)cr2;
1326 dst.val = _regs[VCPU_REGS_RAX];
1327 register_address_increment(_regs[VCPU_REGS_RDI],
1328 (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
1329 break;
1330 case 0xac ... 0xad: /* lods */
1331 dst.type = OP_REG;
1332 dst.bytes = (d & ByteOp) ? 1 : op_bytes;
1333 dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
1334 if ((rc = ops->read_emulated(cr2, &dst.val, dst.bytes,
1335 ctxt->vcpu)) != 0)
1336 goto done;
1337 register_address_increment(_regs[VCPU_REGS_RSI],
1338 (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
1339 break;
1340 case 0xae ... 0xaf: /* scas */
1341 DPRINTF("Urk! I don't handle SCAS.\n");
1342 goto cannot_emulate;
1343 case 0xe8: /* call (near) */ {
1344 long int rel;
1345 switch (op_bytes) {
1346 case 2:
1347 rel = insn_fetch(s16, 2, _eip);
1348 break;
1349 case 4:
1350 rel = insn_fetch(s32, 4, _eip);
1351 break;
1352 case 8:
1353 rel = insn_fetch(s64, 8, _eip);
1354 break;
1355 default:
1356 DPRINTF("Call: Invalid op_bytes\n");
1357 goto cannot_emulate;
1358 }
1359 src.val = (unsigned long) _eip;
1360 JMP_REL(rel);
1361 op_bytes = ad_bytes;
1362 goto push;
1363 }
1364 case 0xe9: /* jmp rel */
1365 case 0xeb: /* jmp rel short */
1366 JMP_REL(src.val);
1367 no_wb = 1; /* Disable writeback. */
1368 break;
1369
1370
1371 }
1372 goto writeback;
1373
1374twobyte_insn:
1375 switch (b) {
1376 case 0x01: /* lgdt, lidt, lmsw */
1377 /* Disable writeback. */
1378 no_wb = 1;
1379 switch (modrm_reg) {
1380 u16 size;
1381 unsigned long address;
1382
1383 case 2: /* lgdt */
1384 rc = read_descriptor(ctxt, ops, src.ptr,
1385 &size, &address, op_bytes);
1386 if (rc)
1387 goto done;
1388 realmode_lgdt(ctxt->vcpu, size, address);
1389 break;
1390 case 3: /* lidt */
1391 rc = read_descriptor(ctxt, ops, src.ptr,
1392 &size, &address, op_bytes);
1393 if (rc)
1394 goto done;
1395 realmode_lidt(ctxt->vcpu, size, address);
1396 break;
1397 case 4: /* smsw */
1398 if (modrm_mod != 3)
1399 goto cannot_emulate;
1400 *(u16 *)&_regs[modrm_rm]
1401 = realmode_get_cr(ctxt->vcpu, 0);
1402 break;
1403 case 6: /* lmsw */
1404 if (modrm_mod != 3)
1405 goto cannot_emulate;
1406 realmode_lmsw(ctxt->vcpu, (u16)modrm_val, &_eflags);
1407 break;
1408 case 7: /* invlpg */
1409 emulate_invlpg(ctxt->vcpu, cr2);
1410 break;
1411 default:
1412 goto cannot_emulate;
1413 }
1414 break;
1415 case 0x21: /* mov from dr to reg */
1416 no_wb = 1;
1417 if (modrm_mod != 3)
1418 goto cannot_emulate;
1419 rc = emulator_get_dr(ctxt, modrm_reg, &_regs[modrm_rm]);
1420 break;
1421 case 0x23: /* mov from reg to dr */
1422 no_wb = 1;
1423 if (modrm_mod != 3)
1424 goto cannot_emulate;
1425 rc = emulator_set_dr(ctxt, modrm_reg, _regs[modrm_rm]);
1426 break;
1427 case 0x40 ... 0x4f: /* cmov */
1428 dst.val = dst.orig_val = src.val;
1429 no_wb = 1;
1430 /*
1431 * First, assume we're decoding an even cmov opcode
1432 * (lsb == 0).
1433 */
1434 switch ((b & 15) >> 1) {
1435 case 0: /* cmovo */
1436 no_wb = (_eflags & EFLG_OF) ? 0 : 1;
1437 break;
1438 case 1: /* cmovb/cmovc/cmovnae */
1439 no_wb = (_eflags & EFLG_CF) ? 0 : 1;
1440 break;
1441 case 2: /* cmovz/cmove */
1442 no_wb = (_eflags & EFLG_ZF) ? 0 : 1;
1443 break;
1444 case 3: /* cmovbe/cmovna */
1445 no_wb = (_eflags & (EFLG_CF | EFLG_ZF)) ? 0 : 1;
1446 break;
1447 case 4: /* cmovs */
1448 no_wb = (_eflags & EFLG_SF) ? 0 : 1;
1449 break;
1450 case 5: /* cmovp/cmovpe */
1451 no_wb = (_eflags & EFLG_PF) ? 0 : 1;
1452 break;
1453 case 7: /* cmovle/cmovng */
1454 no_wb = (_eflags & EFLG_ZF) ? 0 : 1;
1455 /* fall through */
1456 case 6: /* cmovl/cmovnge */
1457 no_wb &= (!(_eflags & EFLG_SF) !=
1458 !(_eflags & EFLG_OF)) ? 0 : 1;
1459 break;
1460 }
1461 /* Odd cmov opcodes (lsb == 1) have inverted sense. */
1462 no_wb ^= b & 1;
1463 break;
1464 case 0xa3:
1465 bt: /* bt */
1466 src.val &= (dst.bytes << 3) - 1; /* only subword offset */
1467 emulate_2op_SrcV_nobyte("bt", src, dst, _eflags);
1468 break;
1469 case 0xab:
1470 bts: /* bts */
1471 src.val &= (dst.bytes << 3) - 1; /* only subword offset */
1472 emulate_2op_SrcV_nobyte("bts", src, dst, _eflags);
1473 break;
1474 case 0xb0 ... 0xb1: /* cmpxchg */
1475 /*
1476 * Save real source value, then compare EAX against
1477 * destination.
1478 */
1479 src.orig_val = src.val;
1480 src.val = _regs[VCPU_REGS_RAX];
1481 emulate_2op_SrcV("cmp", src, dst, _eflags);
1482 if (_eflags & EFLG_ZF) {
1483 /* Success: write back to memory. */
1484 dst.val = src.orig_val;
1485 } else {
1486 /* Failure: write the value we saw to EAX. */
1487 dst.type = OP_REG;
1488 dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
1489 }
1490 break;
1491 case 0xb3:
1492 btr: /* btr */
1493 src.val &= (dst.bytes << 3) - 1; /* only subword offset */
1494 emulate_2op_SrcV_nobyte("btr", src, dst, _eflags);
1495 break;
1496 case 0xb6 ... 0xb7: /* movzx */
1497 dst.bytes = op_bytes;
1498 dst.val = (d & ByteOp) ? (u8) src.val : (u16) src.val;
1499 break;
1500 case 0xba: /* Grp8 */
1501 switch (modrm_reg & 3) {
1502 case 0:
1503 goto bt;
1504 case 1:
1505 goto bts;
1506 case 2:
1507 goto btr;
1508 case 3:
1509 goto btc;
1510 }
1511 break;
1512 case 0xbb:
1513 btc: /* btc */
1514 src.val &= (dst.bytes << 3) - 1; /* only subword offset */
1515 emulate_2op_SrcV_nobyte("btc", src, dst, _eflags);
1516 break;
1517 case 0xbe ... 0xbf: /* movsx */
1518 dst.bytes = op_bytes;
1519 dst.val = (d & ByteOp) ? (s8) src.val : (s16) src.val;
1520 break;
1521 case 0xc3: /* movnti */
1522 dst.bytes = op_bytes;
1523 dst.val = (op_bytes == 4) ? (u32) src.val : (u64) src.val;
1524 break;
1525 }
1526 goto writeback;
1527
1528twobyte_special_insn:
1529 /* Disable writeback. */
1530 no_wb = 1;
1531 switch (b) {
1532 case 0x06:
1533 emulate_clts(ctxt->vcpu);
1534 break;
1535 case 0x08: /* invd */
1536 break;
1537 case 0x09: /* wbinvd */
1538 break;
1539 case 0x0d: /* GrpP (prefetch) */
1540 case 0x18: /* Grp16 (prefetch/nop) */
1541 break;
1542 case 0x20: /* mov cr, reg */
1543 if (modrm_mod != 3)
1544 goto cannot_emulate;
1545 _regs[modrm_rm] = realmode_get_cr(ctxt->vcpu, modrm_reg);
1546 break;
1547 case 0x22: /* mov reg, cr */
1548 if (modrm_mod != 3)
1549 goto cannot_emulate;
1550 realmode_set_cr(ctxt->vcpu, modrm_reg, modrm_val, &_eflags);
1551 break;
1552 case 0x30:
1553 /* wrmsr */
1554 msr_data = (u32)_regs[VCPU_REGS_RAX]
1555 | ((u64)_regs[VCPU_REGS_RDX] << 32);
1556 rc = kvm_set_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], msr_data);
1557 if (rc) {
1558 kvm_x86_ops->inject_gp(ctxt->vcpu, 0);
1559 _eip = ctxt->vcpu->rip;
1560 }
1561 rc = X86EMUL_CONTINUE;
1562 break;
1563 case 0x32:
1564 /* rdmsr */
1565 rc = kvm_get_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], &msr_data);
1566 if (rc) {
1567 kvm_x86_ops->inject_gp(ctxt->vcpu, 0);
1568 _eip = ctxt->vcpu->rip;
1569 } else {
1570 _regs[VCPU_REGS_RAX] = (u32)msr_data;
1571 _regs[VCPU_REGS_RDX] = msr_data >> 32;
1572 }
1573 rc = X86EMUL_CONTINUE;
1574 break;
1575 case 0x80 ... 0x8f: /* jnz rel, etc. */ {
1576 long int rel;
1577
1578 switch (op_bytes) {
1579 case 2:
1580 rel = insn_fetch(s16, 2, _eip);
1581 break;
1582 case 4:
1583 rel = insn_fetch(s32, 4, _eip);
1584 break;
1585 case 8:
1586 rel = insn_fetch(s64, 8, _eip);
1587 break;
1588 default:
1589 DPRINTF("jnz: Invalid op_bytes\n");
1590 goto cannot_emulate;
1591 }
1592 if (test_cc(b, _eflags))
1593 JMP_REL(rel);
1594 break;
1595 }
1596 case 0xc7: /* Grp9 (cmpxchg8b) */
1597 {
1598 u64 old, new;
1599 if ((rc = ops->read_emulated(cr2, &old, 8, ctxt->vcpu))
1600 != 0)
1601 goto done;
1602 if (((u32) (old >> 0) != (u32) _regs[VCPU_REGS_RAX]) ||
1603 ((u32) (old >> 32) != (u32) _regs[VCPU_REGS_RDX])) {
1604 _regs[VCPU_REGS_RAX] = (u32) (old >> 0);
1605 _regs[VCPU_REGS_RDX] = (u32) (old >> 32);
1606 _eflags &= ~EFLG_ZF;
1607 } else {
1608 new = ((u64)_regs[VCPU_REGS_RCX] << 32)
1609 | (u32) _regs[VCPU_REGS_RBX];
1610 if ((rc = ops->cmpxchg_emulated(cr2, &old,
1611 &new, 8, ctxt->vcpu)) != 0)
1612 goto done;
1613 _eflags |= EFLG_ZF;
1614 }
1615 break;
1616 }
1617 }
1618 goto writeback;
1619
1620cannot_emulate:
1621 DPRINTF("Cannot emulate %02x\n", b);
1622 return -1;
1623}
1624
1625#ifdef __XEN__
1626
1627#include <asm/mm.h>
1628#include <asm/uaccess.h>
1629
1630int
1631x86_emulate_read_std(unsigned long addr,
1632 unsigned long *val,
1633 unsigned int bytes, struct x86_emulate_ctxt *ctxt)
1634{
1635 unsigned int rc;
1636
1637 *val = 0;
1638
1639 if ((rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0) {
1640 propagate_page_fault(addr + bytes - rc, 0); /* read fault */
1641 return X86EMUL_PROPAGATE_FAULT;
1642 }
1643
1644 return X86EMUL_CONTINUE;
1645}
1646
1647int
1648x86_emulate_write_std(unsigned long addr,
1649 unsigned long val,
1650 unsigned int bytes, struct x86_emulate_ctxt *ctxt)
1651{
1652 unsigned int rc;
1653
1654 if ((rc = copy_to_user((void *)addr, (void *)&val, bytes)) != 0) {
1655 propagate_page_fault(addr + bytes - rc, PGERR_write_access);
1656 return X86EMUL_PROPAGATE_FAULT;
1657 }
1658
1659 return X86EMUL_CONTINUE;
1660}
1661
1662#endif
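
The propagate_page_fault() calls in the two helpers above compute the faulting address from the copy_from_user()/copy_to_user() return value, which is the number of bytes left uncopied. A trivial sketch of that arithmetic, with made-up numbers, assuming the copy stops at the first byte that faults:

#include <stdio.h>

int main(void)
{
	unsigned long addr = 0x1000;
	unsigned int bytes = 8, rc = 3;		/* 5 bytes copied, 3 left */

	printf("fault address: %#lx\n", addr + bytes - rc);	/* 0x1005 */
	return 0;
}
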
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index e9c3ba8aa1ec..61f2f8eb8cad 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -418,7 +418,7 @@ void __init lguest_arch_host_init(void)
418 /* We know where we want the stack to be when the Guest enters 418 /* We know where we want the stack to be when the Guest enters
419 * the switcher: in pages->regs. The stack grows upwards, so 419 * the switcher: in pages->regs. The stack grows upwards, so
420 * we start it at the end of that structure. */ 420 * we start it at the end of that structure. */
421 state->guest_tss.esp0 = (long)(&pages->regs + 1); 421 state->guest_tss.sp0 = (long)(&pages->regs + 1);
422 /* And this is the GDT entry to use for the stack: we keep a 422 /* And this is the GDT entry to use for the stack: we keep a
423 * couple of special LGUEST entries. */ 423 * couple of special LGUEST entries. */
424 state->guest_tss.ss0 = LGUEST_DS; 424 state->guest_tss.ss0 = LGUEST_DS;
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 5a2d1dd487f6..6c575403bd39 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -1992,7 +1992,7 @@ config E1000_DISABLE_PACKET_SPLIT
1992 1992
1993config E1000E 1993config E1000E
1994 tristate "Intel(R) PRO/1000 PCI-Express Gigabit Ethernet support" 1994 tristate "Intel(R) PRO/1000 PCI-Express Gigabit Ethernet support"
1995 depends on PCI && EXPERIMENTAL 1995 depends on PCI
1996 ---help--- 1996 ---help---
1997 This driver supports the PCI-Express Intel(R) PRO/1000 gigabit 1997 This driver supports the PCI-Express Intel(R) PRO/1000 gigabit
1998 ethernet family of adapters. For PCI or PCI-X e1000 adapters, 1998 ethernet family of adapters. For PCI or PCI-X e1000 adapters,
@@ -2009,6 +2009,9 @@ config E1000E
2009 To compile this driver as a module, choose M here. The module 2009 To compile this driver as a module, choose M here. The module
2010 will be called e1000e. 2010 will be called e1000e.
2011 2011
2012config E1000E_ENABLED
2013 def_bool E1000E != n
2014
2012config IP1000 2015config IP1000
2013 tristate "IP1000 Gigabit Ethernet support" 2016 tristate "IP1000 Gigabit Ethernet support"
2014 depends on PCI && EXPERIMENTAL 2017 depends on PCI && EXPERIMENTAL
diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c
index 3111af6cdc8a..8c87940a9ce8 100644
--- a/drivers/net/e1000/e1000_main.c
+++ b/drivers/net/e1000/e1000_main.c
@@ -47,6 +47,12 @@ static const char e1000_copyright[] = "Copyright (c) 1999-2006 Intel Corporation
47 * Macro expands to... 47 * Macro expands to...
48 * {PCI_DEVICE(PCI_VENDOR_ID_INTEL, device_id)} 48 * {PCI_DEVICE(PCI_VENDOR_ID_INTEL, device_id)}
49 */ 49 */
50#ifdef CONFIG_E1000E_ENABLED
51 #define PCIE(x)
52#else
53 #define PCIE(x) x,
54#endif
55
50static struct pci_device_id e1000_pci_tbl[] = { 56static struct pci_device_id e1000_pci_tbl[] = {
51 INTEL_E1000_ETHERNET_DEVICE(0x1000), 57 INTEL_E1000_ETHERNET_DEVICE(0x1000),
52 INTEL_E1000_ETHERNET_DEVICE(0x1001), 58 INTEL_E1000_ETHERNET_DEVICE(0x1001),
@@ -73,14 +79,14 @@ static struct pci_device_id e1000_pci_tbl[] = {
73 INTEL_E1000_ETHERNET_DEVICE(0x1026), 79 INTEL_E1000_ETHERNET_DEVICE(0x1026),
74 INTEL_E1000_ETHERNET_DEVICE(0x1027), 80 INTEL_E1000_ETHERNET_DEVICE(0x1027),
75 INTEL_E1000_ETHERNET_DEVICE(0x1028), 81 INTEL_E1000_ETHERNET_DEVICE(0x1028),
76 INTEL_E1000_ETHERNET_DEVICE(0x1049), 82PCIE( INTEL_E1000_ETHERNET_DEVICE(0x1049))
77 INTEL_E1000_ETHERNET_DEVICE(0x104A), 83PCIE( INTEL_E1000_ETHERNET_DEVICE(0x104A))
78 INTEL_E1000_ETHERNET_DEVICE(0x104B), 84PCIE( INTEL_E1000_ETHERNET_DEVICE(0x104B))
79 INTEL_E1000_ETHERNET_DEVICE(0x104C), 85PCIE( INTEL_E1000_ETHERNET_DEVICE(0x104C))
80 INTEL_E1000_ETHERNET_DEVICE(0x104D), 86PCIE( INTEL_E1000_ETHERNET_DEVICE(0x104D))
81 INTEL_E1000_ETHERNET_DEVICE(0x105E), 87PCIE( INTEL_E1000_ETHERNET_DEVICE(0x105E))
82 INTEL_E1000_ETHERNET_DEVICE(0x105F), 88PCIE( INTEL_E1000_ETHERNET_DEVICE(0x105F))
83 INTEL_E1000_ETHERNET_DEVICE(0x1060), 89PCIE( INTEL_E1000_ETHERNET_DEVICE(0x1060))
84 INTEL_E1000_ETHERNET_DEVICE(0x1075), 90 INTEL_E1000_ETHERNET_DEVICE(0x1075),
85 INTEL_E1000_ETHERNET_DEVICE(0x1076), 91 INTEL_E1000_ETHERNET_DEVICE(0x1076),
86 INTEL_E1000_ETHERNET_DEVICE(0x1077), 92 INTEL_E1000_ETHERNET_DEVICE(0x1077),
@@ -89,28 +95,28 @@ static struct pci_device_id e1000_pci_tbl[] = {
89 INTEL_E1000_ETHERNET_DEVICE(0x107A), 95 INTEL_E1000_ETHERNET_DEVICE(0x107A),
90 INTEL_E1000_ETHERNET_DEVICE(0x107B), 96 INTEL_E1000_ETHERNET_DEVICE(0x107B),
91 INTEL_E1000_ETHERNET_DEVICE(0x107C), 97 INTEL_E1000_ETHERNET_DEVICE(0x107C),
92 INTEL_E1000_ETHERNET_DEVICE(0x107D), 98PCIE( INTEL_E1000_ETHERNET_DEVICE(0x107D))
93 INTEL_E1000_ETHERNET_DEVICE(0x107E), 99PCIE( INTEL_E1000_ETHERNET_DEVICE(0x107E))
94 INTEL_E1000_ETHERNET_DEVICE(0x107F), 100PCIE( INTEL_E1000_ETHERNET_DEVICE(0x107F))
95 INTEL_E1000_ETHERNET_DEVICE(0x108A), 101 INTEL_E1000_ETHERNET_DEVICE(0x108A),
96 INTEL_E1000_ETHERNET_DEVICE(0x108B), 102PCIE( INTEL_E1000_ETHERNET_DEVICE(0x108B))
97 INTEL_E1000_ETHERNET_DEVICE(0x108C), 103PCIE( INTEL_E1000_ETHERNET_DEVICE(0x108C))
98 INTEL_E1000_ETHERNET_DEVICE(0x1096), 104PCIE( INTEL_E1000_ETHERNET_DEVICE(0x1096))
99 INTEL_E1000_ETHERNET_DEVICE(0x1098), 105PCIE( INTEL_E1000_ETHERNET_DEVICE(0x1098))
100 INTEL_E1000_ETHERNET_DEVICE(0x1099), 106 INTEL_E1000_ETHERNET_DEVICE(0x1099),
101 INTEL_E1000_ETHERNET_DEVICE(0x109A), 107PCIE( INTEL_E1000_ETHERNET_DEVICE(0x109A))
102 INTEL_E1000_ETHERNET_DEVICE(0x10A4), 108PCIE( INTEL_E1000_ETHERNET_DEVICE(0x10A4))
103 INTEL_E1000_ETHERNET_DEVICE(0x10A5), 109PCIE( INTEL_E1000_ETHERNET_DEVICE(0x10A5))
104 INTEL_E1000_ETHERNET_DEVICE(0x10B5), 110 INTEL_E1000_ETHERNET_DEVICE(0x10B5),
105 INTEL_E1000_ETHERNET_DEVICE(0x10B9), 111PCIE( INTEL_E1000_ETHERNET_DEVICE(0x10B9))
106 INTEL_E1000_ETHERNET_DEVICE(0x10BA), 112PCIE( INTEL_E1000_ETHERNET_DEVICE(0x10BA))
107 INTEL_E1000_ETHERNET_DEVICE(0x10BB), 113PCIE( INTEL_E1000_ETHERNET_DEVICE(0x10BB))
108 INTEL_E1000_ETHERNET_DEVICE(0x10BC), 114PCIE( INTEL_E1000_ETHERNET_DEVICE(0x10BC))
109 INTEL_E1000_ETHERNET_DEVICE(0x10C4), 115PCIE( INTEL_E1000_ETHERNET_DEVICE(0x10C4))
110 INTEL_E1000_ETHERNET_DEVICE(0x10C5), 116PCIE( INTEL_E1000_ETHERNET_DEVICE(0x10C5))
111 INTEL_E1000_ETHERNET_DEVICE(0x10D5), 117PCIE( INTEL_E1000_ETHERNET_DEVICE(0x10D5))
112 INTEL_E1000_ETHERNET_DEVICE(0x10D9), 118PCIE( INTEL_E1000_ETHERNET_DEVICE(0x10D9))
113 INTEL_E1000_ETHERNET_DEVICE(0x10DA), 119PCIE( INTEL_E1000_ETHERNET_DEVICE(0x10DA))
114 /* required last entry */ 120 /* required last entry */
115 {0,} 121 {0,}
116}; 122};
diff --git a/drivers/pnp/pnpbios/bioscalls.c b/drivers/pnp/pnpbios/bioscalls.c
index 5dba68fe33f5..a8364d815222 100644
--- a/drivers/pnp/pnpbios/bioscalls.c
+++ b/drivers/pnp/pnpbios/bioscalls.c
@@ -61,7 +61,7 @@ set_base(gdt[(selname) >> 3], (u32)(address)); \
61set_limit(gdt[(selname) >> 3], size); \ 61set_limit(gdt[(selname) >> 3], size); \
62} while(0) 62} while(0)
63 63
64static struct desc_struct bad_bios_desc = { 0, 0x00409200 }; 64static struct desc_struct bad_bios_desc;
65 65
66/* 66/*
67 * At some point we want to use this stack frame pointer to unwind 67 * At some point we want to use this stack frame pointer to unwind
@@ -477,6 +477,9 @@ void pnpbios_calls_init(union pnp_bios_install_struct *header)
477 pnp_bios_callpoint.offset = header->fields.pm16offset; 477 pnp_bios_callpoint.offset = header->fields.pm16offset;
478 pnp_bios_callpoint.segment = PNP_CS16; 478 pnp_bios_callpoint.segment = PNP_CS16;
479 479
480 bad_bios_desc.a = 0;
481 bad_bios_desc.b = 0x00409200;
482
480 set_base(bad_bios_desc, __va((unsigned long)0x40 << 4)); 483 set_base(bad_bios_desc, __va((unsigned long)0x40 << 4));
481 _set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4)); 484 _set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4));
482 for (i = 0; i < NR_CPUS; i++) { 485 for (i = 0; i < NR_CPUS; i++) {
diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c
index e45f85f7c7ed..0dff05840ee2 100644
--- a/drivers/s390/scsi/zfcp_fsf.c
+++ b/drivers/s390/scsi/zfcp_fsf.c
@@ -4224,10 +4224,10 @@ zfcp_fsf_send_fcp_command_task_handler(struct zfcp_fsf_req *fsf_req)
4224 4224
4225 ZFCP_LOG_TRACE("%i bytes sense data provided by FCP\n", 4225 ZFCP_LOG_TRACE("%i bytes sense data provided by FCP\n",
4226 fcp_rsp_iu->fcp_sns_len); 4226 fcp_rsp_iu->fcp_sns_len);
4227 memcpy(&scpnt->sense_buffer, 4227 memcpy(scpnt->sense_buffer,
4228 zfcp_get_fcp_sns_info_ptr(fcp_rsp_iu), sns_len); 4228 zfcp_get_fcp_sns_info_ptr(fcp_rsp_iu), sns_len);
4229 ZFCP_HEX_DUMP(ZFCP_LOG_LEVEL_TRACE, 4229 ZFCP_HEX_DUMP(ZFCP_LOG_LEVEL_TRACE,
4230 (void *) &scpnt->sense_buffer, sns_len); 4230 (void *)scpnt->sense_buffer, sns_len);
4231 } 4231 }
4232 4232
4233 /* check for overrun */ 4233 /* check for overrun */
diff --git a/drivers/scsi/3w-9xxx.c b/drivers/scsi/3w-9xxx.c
index 1c244832c6c8..b4912d1cee2a 100644
--- a/drivers/scsi/3w-9xxx.c
+++ b/drivers/scsi/3w-9xxx.c
@@ -1990,7 +1990,6 @@ static struct scsi_host_template driver_template = {
1990 .max_sectors = TW_MAX_SECTORS, 1990 .max_sectors = TW_MAX_SECTORS,
1991 .cmd_per_lun = TW_MAX_CMDS_PER_LUN, 1991 .cmd_per_lun = TW_MAX_CMDS_PER_LUN,
1992 .use_clustering = ENABLE_CLUSTERING, 1992 .use_clustering = ENABLE_CLUSTERING,
1993 .use_sg_chaining = ENABLE_SG_CHAINING,
1994 .shost_attrs = twa_host_attrs, 1993 .shost_attrs = twa_host_attrs,
1995 .emulated = 1 1994 .emulated = 1
1996}; 1995};
diff --git a/drivers/scsi/3w-xxxx.c b/drivers/scsi/3w-xxxx.c
index 59716ebeb10c..d09532162217 100644
--- a/drivers/scsi/3w-xxxx.c
+++ b/drivers/scsi/3w-xxxx.c
@@ -2261,7 +2261,6 @@ static struct scsi_host_template driver_template = {
2261 .max_sectors = TW_MAX_SECTORS, 2261 .max_sectors = TW_MAX_SECTORS,
2262 .cmd_per_lun = TW_MAX_CMDS_PER_LUN, 2262 .cmd_per_lun = TW_MAX_CMDS_PER_LUN,
2263 .use_clustering = ENABLE_CLUSTERING, 2263 .use_clustering = ENABLE_CLUSTERING,
2264 .use_sg_chaining = ENABLE_SG_CHAINING,
2265 .shost_attrs = tw_host_attrs, 2264 .shost_attrs = tw_host_attrs,
2266 .emulated = 1 2265 .emulated = 1
2267}; 2266};
diff --git a/drivers/scsi/BusLogic.c b/drivers/scsi/BusLogic.c
index ead47c143ce0..4d3ebb1af490 100644
--- a/drivers/scsi/BusLogic.c
+++ b/drivers/scsi/BusLogic.c
@@ -3575,7 +3575,6 @@ static struct scsi_host_template Bus_Logic_template = {
3575 .unchecked_isa_dma = 1, 3575 .unchecked_isa_dma = 1,
3576 .max_sectors = 128, 3576 .max_sectors = 128,
3577 .use_clustering = ENABLE_CLUSTERING, 3577 .use_clustering = ENABLE_CLUSTERING,
3578 .use_sg_chaining = ENABLE_SG_CHAINING,
3579}; 3578};
3580 3579
3581/* 3580/*
diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index 3e161cd66463..14fc7f39e83e 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -345,7 +345,7 @@ config ISCSI_TCP
345 345
346config SGIWD93_SCSI 346config SGIWD93_SCSI
347 tristate "SGI WD93C93 SCSI Driver" 347 tristate "SGI WD93C93 SCSI Driver"
348 depends on SGI_IP22 && SCSI 348 depends on SGI_HAS_WD93 && SCSI
349 help 349 help
350 If you have a Western Digital WD93 SCSI controller on 350 If you have a Western Digital WD93 SCSI controller on
351 an SGI MIPS system, say Y. Otherwise, say N. 351 an SGI MIPS system, say Y. Otherwise, say N.
diff --git a/drivers/scsi/NCR53c406a.c b/drivers/scsi/NCR53c406a.c
index 137d065db3da..6961f78742ae 100644
--- a/drivers/scsi/NCR53c406a.c
+++ b/drivers/scsi/NCR53c406a.c
@@ -1065,7 +1065,6 @@ static struct scsi_host_template driver_template =
1065 .cmd_per_lun = 1 /* commands per lun */, 1065 .cmd_per_lun = 1 /* commands per lun */,
1066 .unchecked_isa_dma = 1 /* unchecked_isa_dma */, 1066 .unchecked_isa_dma = 1 /* unchecked_isa_dma */,
1067 .use_clustering = ENABLE_CLUSTERING, 1067 .use_clustering = ENABLE_CLUSTERING,
1068 .use_sg_chaining = ENABLE_SG_CHAINING,
1069}; 1068};
1070 1069
1071#include "scsi_module.c" 1070#include "scsi_module.c"
diff --git a/drivers/scsi/a100u2w.c b/drivers/scsi/a100u2w.c
index d3a6d15fb77a..f608d4a1d6da 100644
--- a/drivers/scsi/a100u2w.c
+++ b/drivers/scsi/a100u2w.c
@@ -1071,7 +1071,6 @@ static struct scsi_host_template inia100_template = {
1071 .sg_tablesize = SG_ALL, 1071 .sg_tablesize = SG_ALL,
1072 .cmd_per_lun = 1, 1072 .cmd_per_lun = 1,
1073 .use_clustering = ENABLE_CLUSTERING, 1073 .use_clustering = ENABLE_CLUSTERING,
1074 .use_sg_chaining = ENABLE_SG_CHAINING,
1075}; 1074};
1076 1075
1077static int __devinit inia100_probe_one(struct pci_dev *pdev, 1076static int __devinit inia100_probe_one(struct pci_dev *pdev,
diff --git a/drivers/scsi/aacraid/commctrl.c b/drivers/scsi/aacraid/commctrl.c
index 851a7e599c50..f8afa358b6b6 100644
--- a/drivers/scsi/aacraid/commctrl.c
+++ b/drivers/scsi/aacraid/commctrl.c
@@ -243,7 +243,6 @@ static int next_getadapter_fib(struct aac_dev * dev, void __user *arg)
243 * Search the list of AdapterFibContext addresses on the adapter 243 * Search the list of AdapterFibContext addresses on the adapter
244 * to be sure this is a valid address 244 * to be sure this is a valid address
245 */ 245 */
246 spin_lock_irqsave(&dev->fib_lock, flags);
247 entry = dev->fib_list.next; 246 entry = dev->fib_list.next;
248 fibctx = NULL; 247 fibctx = NULL;
249 248
@@ -252,25 +251,24 @@ static int next_getadapter_fib(struct aac_dev * dev, void __user *arg)
252 /* 251 /*
253 * Extract the AdapterFibContext from the Input parameters. 252 * Extract the AdapterFibContext from the Input parameters.
254 */ 253 */
255 if (fibctx->unique == f.fibctx) { /* We found a winner */ 254 if (fibctx->unique == f.fibctx) { /* We found a winner */
256 break; 255 break;
257 } 256 }
258 entry = entry->next; 257 entry = entry->next;
259 fibctx = NULL; 258 fibctx = NULL;
260 } 259 }
261 if (!fibctx) { 260 if (!fibctx) {
262 spin_unlock_irqrestore(&dev->fib_lock, flags);
263 dprintk ((KERN_INFO "Fib Context not found\n")); 261 dprintk ((KERN_INFO "Fib Context not found\n"));
264 return -EINVAL; 262 return -EINVAL;
265 } 263 }
266 264
267 if((fibctx->type != FSAFS_NTC_GET_ADAPTER_FIB_CONTEXT) || 265 if((fibctx->type != FSAFS_NTC_GET_ADAPTER_FIB_CONTEXT) ||
268 (fibctx->size != sizeof(struct aac_fib_context))) { 266 (fibctx->size != sizeof(struct aac_fib_context))) {
269 spin_unlock_irqrestore(&dev->fib_lock, flags);
270 dprintk ((KERN_INFO "Fib Context corrupt?\n")); 267 dprintk ((KERN_INFO "Fib Context corrupt?\n"));
271 return -EINVAL; 268 return -EINVAL;
272 } 269 }
273 status = 0; 270 status = 0;
271 spin_lock_irqsave(&dev->fib_lock, flags);
274 /* 272 /*
275 * If there are no fibs to send back, then either wait or return 273 * If there are no fibs to send back, then either wait or return
276 * -EAGAIN 274 * -EAGAIN
@@ -328,9 +326,7 @@ return_fib:
328int aac_close_fib_context(struct aac_dev * dev, struct aac_fib_context * fibctx) 326int aac_close_fib_context(struct aac_dev * dev, struct aac_fib_context * fibctx)
329{ 327{
330 struct fib *fib; 328 struct fib *fib;
331 unsigned long flags;
332 329
333 spin_lock_irqsave(&dev->fib_lock, flags);
334 /* 330 /*
335 * First free any FIBs that have not been consumed. 331 * First free any FIBs that have not been consumed.
336 */ 332 */
@@ -353,7 +349,6 @@ int aac_close_fib_context(struct aac_dev * dev, struct aac_fib_context * fibctx)
353 * Remove the Context from the AdapterFibContext List 349 * Remove the Context from the AdapterFibContext List
354 */ 350 */
355 list_del(&fibctx->next); 351 list_del(&fibctx->next);
356 spin_unlock_irqrestore(&dev->fib_lock, flags);
357 /* 352 /*
358 * Invalidate context 353 * Invalidate context
359 */ 354 */
@@ -419,8 +414,8 @@ static int close_getadapter_fib(struct aac_dev * dev, void __user *arg)
419 * @arg: ioctl arguments 414 * @arg: ioctl arguments
420 * 415 *
421 * This routine returns the driver version. 416 * This routine returns the driver version.
422 * Under Linux, there have been no version incompatibilities, so this is 417 * Under Linux, there have been no version incompatibilities, so this is
423 * simple! 418 * simple!
424 */ 419 */
425 420
426static int check_revision(struct aac_dev *dev, void __user *arg) 421static int check_revision(struct aac_dev *dev, void __user *arg)
@@ -468,7 +463,7 @@ static int aac_send_raw_srb(struct aac_dev* dev, void __user * arg)
468 u32 data_dir; 463 u32 data_dir;
469 void __user *sg_user[32]; 464 void __user *sg_user[32];
470 void *sg_list[32]; 465 void *sg_list[32];
471 u32 sg_indx = 0; 466 u32 sg_indx = 0;
472 u32 byte_count = 0; 467 u32 byte_count = 0;
473 u32 actual_fibsize64, actual_fibsize = 0; 468 u32 actual_fibsize64, actual_fibsize = 0;
474 int i; 469 int i;
@@ -522,11 +517,11 @@ static int aac_send_raw_srb(struct aac_dev* dev, void __user * arg)
522 // Fix up srb for endian and force some values 517 // Fix up srb for endian and force some values
523 518
524 srbcmd->function = cpu_to_le32(SRBF_ExecuteScsi); // Force this 519 srbcmd->function = cpu_to_le32(SRBF_ExecuteScsi); // Force this
525 srbcmd->channel = cpu_to_le32(user_srbcmd->channel); 520 srbcmd->channel = cpu_to_le32(user_srbcmd->channel);
526 srbcmd->id = cpu_to_le32(user_srbcmd->id); 521 srbcmd->id = cpu_to_le32(user_srbcmd->id);
527 srbcmd->lun = cpu_to_le32(user_srbcmd->lun); 522 srbcmd->lun = cpu_to_le32(user_srbcmd->lun);
528 srbcmd->timeout = cpu_to_le32(user_srbcmd->timeout); 523 srbcmd->timeout = cpu_to_le32(user_srbcmd->timeout);
529 srbcmd->flags = cpu_to_le32(flags); 524 srbcmd->flags = cpu_to_le32(flags);
530 srbcmd->retry_limit = 0; // Obsolete parameter 525 srbcmd->retry_limit = 0; // Obsolete parameter
531 srbcmd->cdb_size = cpu_to_le32(user_srbcmd->cdb_size); 526 srbcmd->cdb_size = cpu_to_le32(user_srbcmd->cdb_size);
532 memcpy(srbcmd->cdb, user_srbcmd->cdb, sizeof(srbcmd->cdb)); 527 memcpy(srbcmd->cdb, user_srbcmd->cdb, sizeof(srbcmd->cdb));
@@ -791,9 +786,9 @@ static int aac_get_pci_info(struct aac_dev* dev, void __user *arg)
791 pci_info.bus = dev->pdev->bus->number; 786 pci_info.bus = dev->pdev->bus->number;
792 pci_info.slot = PCI_SLOT(dev->pdev->devfn); 787 pci_info.slot = PCI_SLOT(dev->pdev->devfn);
793 788
794 if (copy_to_user(arg, &pci_info, sizeof(struct aac_pci_info))) { 789 if (copy_to_user(arg, &pci_info, sizeof(struct aac_pci_info))) {
795 dprintk((KERN_DEBUG "aacraid: Could not copy pci info\n")); 790 dprintk((KERN_DEBUG "aacraid: Could not copy pci info\n"));
796 return -EFAULT; 791 return -EFAULT;
797 } 792 }
798 return 0; 793 return 0;
799} 794}
diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c
index 61be22774e99..0e8267c1e915 100644
--- a/drivers/scsi/aacraid/linit.c
+++ b/drivers/scsi/aacraid/linit.c
@@ -1032,7 +1032,6 @@ static struct scsi_host_template aac_driver_template = {
1032 .cmd_per_lun = AAC_NUM_IO_FIB, 1032 .cmd_per_lun = AAC_NUM_IO_FIB,
1033#endif 1033#endif
1034 .use_clustering = ENABLE_CLUSTERING, 1034 .use_clustering = ENABLE_CLUSTERING,
1035 .use_sg_chaining = ENABLE_SG_CHAINING,
1036 .emulated = 1, 1035 .emulated = 1,
1037}; 1036};
1038 1037
diff --git a/drivers/scsi/aha1740.c b/drivers/scsi/aha1740.c
index be58a0b097c7..7c45d88a205b 100644
--- a/drivers/scsi/aha1740.c
+++ b/drivers/scsi/aha1740.c
@@ -563,7 +563,6 @@ static struct scsi_host_template aha1740_template = {
563 .sg_tablesize = AHA1740_SCATTER, 563 .sg_tablesize = AHA1740_SCATTER,
564 .cmd_per_lun = AHA1740_CMDLUN, 564 .cmd_per_lun = AHA1740_CMDLUN,
565 .use_clustering = ENABLE_CLUSTERING, 565 .use_clustering = ENABLE_CLUSTERING,
566 .use_sg_chaining = ENABLE_SG_CHAINING,
567 .eh_abort_handler = aha1740_eh_abort_handler, 566 .eh_abort_handler = aha1740_eh_abort_handler,
568}; 567};
569 568
diff --git a/drivers/scsi/aic7xxx/aic79xx.h b/drivers/scsi/aic7xxx/aic79xx.h
index ce638aa6005a..2f00467b6b8c 100644
--- a/drivers/scsi/aic7xxx/aic79xx.h
+++ b/drivers/scsi/aic7xxx/aic79xx.h
@@ -1340,8 +1340,10 @@ struct ahd_pci_identity *ahd_find_pci_device(ahd_dev_softc_t);
1340int ahd_pci_config(struct ahd_softc *, 1340int ahd_pci_config(struct ahd_softc *,
1341 struct ahd_pci_identity *); 1341 struct ahd_pci_identity *);
1342int ahd_pci_test_register_access(struct ahd_softc *); 1342int ahd_pci_test_register_access(struct ahd_softc *);
1343#ifdef CONFIG_PM
1343void ahd_pci_suspend(struct ahd_softc *); 1344void ahd_pci_suspend(struct ahd_softc *);
1344void ahd_pci_resume(struct ahd_softc *); 1345void ahd_pci_resume(struct ahd_softc *);
1346#endif
1345 1347
1346/************************** SCB and SCB queue management **********************/ 1348/************************** SCB and SCB queue management **********************/
1347void ahd_qinfifo_requeue_tail(struct ahd_softc *ahd, 1349void ahd_qinfifo_requeue_tail(struct ahd_softc *ahd,
@@ -1352,8 +1354,10 @@ struct ahd_softc *ahd_alloc(void *platform_arg, char *name);
1352int ahd_softc_init(struct ahd_softc *); 1354int ahd_softc_init(struct ahd_softc *);
1353void ahd_controller_info(struct ahd_softc *ahd, char *buf); 1355void ahd_controller_info(struct ahd_softc *ahd, char *buf);
1354int ahd_init(struct ahd_softc *ahd); 1356int ahd_init(struct ahd_softc *ahd);
1357#ifdef CONFIG_PM
1355int ahd_suspend(struct ahd_softc *ahd); 1358int ahd_suspend(struct ahd_softc *ahd);
1356void ahd_resume(struct ahd_softc *ahd); 1359void ahd_resume(struct ahd_softc *ahd);
1360#endif
1357int ahd_default_config(struct ahd_softc *ahd); 1361int ahd_default_config(struct ahd_softc *ahd);
1358int ahd_parse_vpddata(struct ahd_softc *ahd, 1362int ahd_parse_vpddata(struct ahd_softc *ahd,
1359 struct vpd_config *vpd); 1363 struct vpd_config *vpd);
@@ -1361,7 +1365,6 @@ int ahd_parse_cfgdata(struct ahd_softc *ahd,
1361 struct seeprom_config *sc); 1365 struct seeprom_config *sc);
1362void ahd_intr_enable(struct ahd_softc *ahd, int enable); 1366void ahd_intr_enable(struct ahd_softc *ahd, int enable);
1363void ahd_pause_and_flushwork(struct ahd_softc *ahd); 1367void ahd_pause_and_flushwork(struct ahd_softc *ahd);
1364int ahd_suspend(struct ahd_softc *ahd);
1365void ahd_set_unit(struct ahd_softc *, int); 1368void ahd_set_unit(struct ahd_softc *, int);
1366void ahd_set_name(struct ahd_softc *, char *); 1369void ahd_set_name(struct ahd_softc *, char *);
1367struct scb *ahd_get_scb(struct ahd_softc *ahd, u_int col_idx); 1370struct scb *ahd_get_scb(struct ahd_softc *ahd, u_int col_idx);
diff --git a/drivers/scsi/aic7xxx/aic79xx_core.c b/drivers/scsi/aic7xxx/aic79xx_core.c
index a7dd8cdda472..ade0fb8fbdb2 100644
--- a/drivers/scsi/aic7xxx/aic79xx_core.c
+++ b/drivers/scsi/aic7xxx/aic79xx_core.c
@@ -7175,6 +7175,7 @@ ahd_pause_and_flushwork(struct ahd_softc *ahd)
7175 ahd->flags &= ~AHD_ALL_INTERRUPTS; 7175 ahd->flags &= ~AHD_ALL_INTERRUPTS;
7176} 7176}
7177 7177
7178#ifdef CONFIG_PM
7178int 7179int
7179ahd_suspend(struct ahd_softc *ahd) 7180ahd_suspend(struct ahd_softc *ahd)
7180{ 7181{
@@ -7197,6 +7198,7 @@ ahd_resume(struct ahd_softc *ahd)
7197 ahd_intr_enable(ahd, TRUE); 7198 ahd_intr_enable(ahd, TRUE);
7198 ahd_restart(ahd); 7199 ahd_restart(ahd);
7199} 7200}
7201#endif
7200 7202
7201/************************** Busy Target Table *********************************/ 7203/************************** Busy Target Table *********************************/
7202/* 7204/*
diff --git a/drivers/scsi/aic7xxx/aic79xx_osm.c b/drivers/scsi/aic7xxx/aic79xx_osm.c
index 0e4708fd43c8..014654792901 100644
--- a/drivers/scsi/aic7xxx/aic79xx_osm.c
+++ b/drivers/scsi/aic7xxx/aic79xx_osm.c
@@ -766,7 +766,6 @@ struct scsi_host_template aic79xx_driver_template = {
766 .max_sectors = 8192, 766 .max_sectors = 8192,
767 .cmd_per_lun = 2, 767 .cmd_per_lun = 2,
768 .use_clustering = ENABLE_CLUSTERING, 768 .use_clustering = ENABLE_CLUSTERING,
769 .use_sg_chaining = ENABLE_SG_CHAINING,
770 .slave_alloc = ahd_linux_slave_alloc, 769 .slave_alloc = ahd_linux_slave_alloc,
771 .slave_configure = ahd_linux_slave_configure, 770 .slave_configure = ahd_linux_slave_configure,
772 .target_alloc = ahd_linux_target_alloc, 771 .target_alloc = ahd_linux_target_alloc,
@@ -1922,7 +1921,7 @@ ahd_linux_queue_cmd_complete(struct ahd_softc *ahd, struct scsi_cmnd *cmd)
1922 struct scsi_sense_data *sense; 1921 struct scsi_sense_data *sense;
1923 1922
1924 sense = (struct scsi_sense_data *) 1923 sense = (struct scsi_sense_data *)
1925 &cmd->sense_buffer; 1924 cmd->sense_buffer;
1926 if (sense->extra_len >= 5 && 1925 if (sense->extra_len >= 5 &&
1927 (sense->add_sense_code == 0x47 1926 (sense->add_sense_code == 0x47
1928 || sense->add_sense_code == 0x48)) 1927 || sense->add_sense_code == 0x48))
diff --git a/drivers/scsi/aic7xxx/aic79xx_osm_pci.c b/drivers/scsi/aic7xxx/aic79xx_osm_pci.c
index 66f0259edb69..4150c8a8fdc2 100644
--- a/drivers/scsi/aic7xxx/aic79xx_osm_pci.c
+++ b/drivers/scsi/aic7xxx/aic79xx_osm_pci.c
@@ -43,17 +43,6 @@
43#include "aic79xx_inline.h" 43#include "aic79xx_inline.h"
44#include "aic79xx_pci.h" 44#include "aic79xx_pci.h"
45 45
46static int ahd_linux_pci_dev_probe(struct pci_dev *pdev,
47 const struct pci_device_id *ent);
48static int ahd_linux_pci_reserve_io_regions(struct ahd_softc *ahd,
49 u_long *base, u_long *base2);
50static int ahd_linux_pci_reserve_mem_region(struct ahd_softc *ahd,
51 u_long *bus_addr,
52 uint8_t __iomem **maddr);
53static int ahd_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg);
54static int ahd_linux_pci_dev_resume(struct pci_dev *pdev);
55static void ahd_linux_pci_dev_remove(struct pci_dev *pdev);
56
57/* Define the macro locally since it's different for different class of chips. 46/* Define the macro locally since it's different for different class of chips.
58 */ 47 */
59#define ID(x) \ 48#define ID(x) \
@@ -85,17 +74,7 @@ static struct pci_device_id ahd_linux_pci_id_table[] = {
85 74
86MODULE_DEVICE_TABLE(pci, ahd_linux_pci_id_table); 75MODULE_DEVICE_TABLE(pci, ahd_linux_pci_id_table);
87 76
88static struct pci_driver aic79xx_pci_driver = {
89 .name = "aic79xx",
90 .probe = ahd_linux_pci_dev_probe,
91#ifdef CONFIG_PM 77#ifdef CONFIG_PM
92 .suspend = ahd_linux_pci_dev_suspend,
93 .resume = ahd_linux_pci_dev_resume,
94#endif
95 .remove = ahd_linux_pci_dev_remove,
96 .id_table = ahd_linux_pci_id_table
97};
98
99static int 78static int
100ahd_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg) 79ahd_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg)
101{ 80{
@@ -139,6 +118,7 @@ ahd_linux_pci_dev_resume(struct pci_dev *pdev)
139 118
140 return rc; 119 return rc;
141} 120}
121#endif
142 122
143static void 123static void
144ahd_linux_pci_dev_remove(struct pci_dev *pdev) 124ahd_linux_pci_dev_remove(struct pci_dev *pdev)
@@ -245,6 +225,17 @@ ahd_linux_pci_dev_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
245 return (0); 225 return (0);
246} 226}
247 227
228static struct pci_driver aic79xx_pci_driver = {
229 .name = "aic79xx",
230 .probe = ahd_linux_pci_dev_probe,
231#ifdef CONFIG_PM
232 .suspend = ahd_linux_pci_dev_suspend,
233 .resume = ahd_linux_pci_dev_resume,
234#endif
235 .remove = ahd_linux_pci_dev_remove,
236 .id_table = ahd_linux_pci_id_table
237};
238
248int 239int
249ahd_linux_pci_init(void) 240ahd_linux_pci_init(void)
250{ 241{
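
The hunks in this file follow one pattern: the struct pci_driver definition moves below the probe/suspend/resume/remove implementations, which removes the forward declarations, and the PM entry points are compiled only under CONFIG_PM. A condensed sketch of that ordering (the foo_* names are illustrative, not part of this driver):

static struct pci_device_id foo_pci_id_table[] = { { /* ... */ } };

#ifdef CONFIG_PM
static int foo_pci_suspend(struct pci_dev *pdev, pm_message_t mesg) { /* ... */ return 0; }
static int foo_pci_resume(struct pci_dev *pdev) { /* ... */ return 0; }
#endif
static void foo_pci_remove(struct pci_dev *pdev) { /* ... */ }
static int foo_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { /* ... */ return 0; }

/* Defined after the callbacks, so no forward declarations are needed and
 * the suspend/resume members simply disappear when CONFIG_PM is not set. */
static struct pci_driver foo_pci_driver = {
	.name		= "foo",
	.probe		= foo_pci_probe,
#ifdef CONFIG_PM
	.suspend	= foo_pci_suspend,
	.resume		= foo_pci_resume,
#endif
	.remove		= foo_pci_remove,
	.id_table	= foo_pci_id_table,
};
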
diff --git a/drivers/scsi/aic7xxx/aic79xx_pci.c b/drivers/scsi/aic7xxx/aic79xx_pci.c
index 7a203a90601a..df853676e66a 100644
--- a/drivers/scsi/aic7xxx/aic79xx_pci.c
+++ b/drivers/scsi/aic7xxx/aic79xx_pci.c
@@ -389,6 +389,7 @@ ahd_pci_config(struct ahd_softc *ahd, struct ahd_pci_identity *entry)
389 return error; 389 return error;
390} 390}
391 391
392#ifdef CONFIG_PM
392void 393void
393ahd_pci_suspend(struct ahd_softc *ahd) 394ahd_pci_suspend(struct ahd_softc *ahd)
394{ 395{
@@ -415,6 +416,7 @@ ahd_pci_resume(struct ahd_softc *ahd)
415 ahd_pci_write_config(ahd->dev_softc, CSIZE_LATTIME, 416 ahd_pci_write_config(ahd->dev_softc, CSIZE_LATTIME,
416 ahd->suspend_state.pci_state.csize_lattime, /*bytes*/1); 417 ahd->suspend_state.pci_state.csize_lattime, /*bytes*/1);
417} 418}
419#endif
418 420
419/* 421/*
420 * Perform some simple tests that should catch situations where 422 * Perform some simple tests that should catch situations where
diff --git a/drivers/scsi/aic7xxx/aic7xxx.h b/drivers/scsi/aic7xxx/aic7xxx.h
index 3d4e42d90452..c0344e617651 100644
--- a/drivers/scsi/aic7xxx/aic7xxx.h
+++ b/drivers/scsi/aic7xxx/aic7xxx.h
@@ -1143,7 +1143,9 @@ struct ahc_pci_identity *ahc_find_pci_device(ahc_dev_softc_t);
1143int ahc_pci_config(struct ahc_softc *, 1143int ahc_pci_config(struct ahc_softc *,
1144 struct ahc_pci_identity *); 1144 struct ahc_pci_identity *);
1145int ahc_pci_test_register_access(struct ahc_softc *); 1145int ahc_pci_test_register_access(struct ahc_softc *);
1146#ifdef CONFIG_PM
1146void ahc_pci_resume(struct ahc_softc *ahc); 1147void ahc_pci_resume(struct ahc_softc *ahc);
1148#endif
1147 1149
1148/*************************** EISA/VL Front End ********************************/ 1150/*************************** EISA/VL Front End ********************************/
1149struct aic7770_identity *aic7770_find_device(uint32_t); 1151struct aic7770_identity *aic7770_find_device(uint32_t);
@@ -1170,8 +1172,10 @@ int ahc_chip_init(struct ahc_softc *ahc);
1170int ahc_init(struct ahc_softc *ahc); 1172int ahc_init(struct ahc_softc *ahc);
1171void ahc_intr_enable(struct ahc_softc *ahc, int enable); 1173void ahc_intr_enable(struct ahc_softc *ahc, int enable);
1172void ahc_pause_and_flushwork(struct ahc_softc *ahc); 1174void ahc_pause_and_flushwork(struct ahc_softc *ahc);
1175#ifdef CONFIG_PM
1173int ahc_suspend(struct ahc_softc *ahc); 1176int ahc_suspend(struct ahc_softc *ahc);
1174int ahc_resume(struct ahc_softc *ahc); 1177int ahc_resume(struct ahc_softc *ahc);
1178#endif
1175void ahc_set_unit(struct ahc_softc *, int); 1179void ahc_set_unit(struct ahc_softc *, int);
1176void ahc_set_name(struct ahc_softc *, char *); 1180void ahc_set_name(struct ahc_softc *, char *);
1177void ahc_alloc_scbs(struct ahc_softc *ahc); 1181void ahc_alloc_scbs(struct ahc_softc *ahc);
diff --git a/drivers/scsi/aic7xxx/aic7xxx_core.c b/drivers/scsi/aic7xxx/aic7xxx_core.c
index f350b5e89e76..6d2ae641273c 100644
--- a/drivers/scsi/aic7xxx/aic7xxx_core.c
+++ b/drivers/scsi/aic7xxx/aic7xxx_core.c
@@ -5078,6 +5078,7 @@ ahc_pause_and_flushwork(struct ahc_softc *ahc)
5078 ahc->flags &= ~AHC_ALL_INTERRUPTS; 5078 ahc->flags &= ~AHC_ALL_INTERRUPTS;
5079} 5079}
5080 5080
5081#ifdef CONFIG_PM
5081int 5082int
5082ahc_suspend(struct ahc_softc *ahc) 5083ahc_suspend(struct ahc_softc *ahc)
5083{ 5084{
@@ -5113,7 +5114,7 @@ ahc_resume(struct ahc_softc *ahc)
5113 ahc_restart(ahc); 5114 ahc_restart(ahc);
5114 return (0); 5115 return (0);
5115} 5116}
5116 5117#endif
5117/************************** Busy Target Table *********************************/ 5118/************************** Busy Target Table *********************************/
5118/* 5119/*
5119 * Return the untagged transaction id for a given target/channel lun. 5120 * Return the untagged transaction id for a given target/channel lun.
diff --git a/drivers/scsi/aic7xxx/aic7xxx_osm.c b/drivers/scsi/aic7xxx/aic7xxx_osm.c
index e310e414067f..99a3b33a3233 100644
--- a/drivers/scsi/aic7xxx/aic7xxx_osm.c
+++ b/drivers/scsi/aic7xxx/aic7xxx_osm.c
@@ -747,7 +747,6 @@ struct scsi_host_template aic7xxx_driver_template = {
747 .max_sectors = 8192, 747 .max_sectors = 8192,
748 .cmd_per_lun = 2, 748 .cmd_per_lun = 2,
749 .use_clustering = ENABLE_CLUSTERING, 749 .use_clustering = ENABLE_CLUSTERING,
750 .use_sg_chaining = ENABLE_SG_CHAINING,
751 .slave_alloc = ahc_linux_slave_alloc, 750 .slave_alloc = ahc_linux_slave_alloc,
752 .slave_configure = ahc_linux_slave_configure, 751 .slave_configure = ahc_linux_slave_configure,
753 .target_alloc = ahc_linux_target_alloc, 752 .target_alloc = ahc_linux_target_alloc,
@@ -1658,9 +1657,12 @@ ahc_done(struct ahc_softc *ahc, struct scb *scb)
1658 untagged_q = &(ahc->untagged_queues[target_offset]); 1657 untagged_q = &(ahc->untagged_queues[target_offset]);
1659 TAILQ_REMOVE(untagged_q, scb, links.tqe); 1658 TAILQ_REMOVE(untagged_q, scb, links.tqe);
1660 BUG_ON(!TAILQ_EMPTY(untagged_q)); 1659 BUG_ON(!TAILQ_EMPTY(untagged_q));
1661 } 1660 } else if ((scb->flags & SCB_ACTIVE) == 0) {
1662 1661 /*
1663 if ((scb->flags & SCB_ACTIVE) == 0) { 1662 * Transactions aborted from the untagged queue may
1663 * not have been dispatched to the controller, so
1664 * only check the SCB_ACTIVE flag for tagged transactions.
1665 */
1664 printf("SCB %d done'd twice\n", scb->hscb->tag); 1666 printf("SCB %d done'd twice\n", scb->hscb->tag);
1665 ahc_dump_card_state(ahc); 1667 ahc_dump_card_state(ahc);
1666 panic("Stopping for safety"); 1668 panic("Stopping for safety");
diff --git a/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c b/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c
index 4488946cff2e..dd6e21d6f1dd 100644
--- a/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c
+++ b/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c
@@ -42,17 +42,6 @@
42#include "aic7xxx_osm.h" 42#include "aic7xxx_osm.h"
43#include "aic7xxx_pci.h" 43#include "aic7xxx_pci.h"
44 44
45static int ahc_linux_pci_dev_probe(struct pci_dev *pdev,
46 const struct pci_device_id *ent);
47static int ahc_linux_pci_reserve_io_region(struct ahc_softc *ahc,
48 u_long *base);
49static int ahc_linux_pci_reserve_mem_region(struct ahc_softc *ahc,
50 u_long *bus_addr,
51 uint8_t __iomem **maddr);
52static int ahc_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg);
53static int ahc_linux_pci_dev_resume(struct pci_dev *pdev);
54static void ahc_linux_pci_dev_remove(struct pci_dev *pdev);
55
56/* Define the macro locally since it's different for different class of chips. 45/* Define the macro locally since it's different for different class of chips.
57*/ 46*/
58#define ID(x) ID_C(x, PCI_CLASS_STORAGE_SCSI) 47#define ID(x) ID_C(x, PCI_CLASS_STORAGE_SCSI)
@@ -132,17 +121,7 @@ static struct pci_device_id ahc_linux_pci_id_table[] = {
132 121
133MODULE_DEVICE_TABLE(pci, ahc_linux_pci_id_table); 122MODULE_DEVICE_TABLE(pci, ahc_linux_pci_id_table);
134 123
135static struct pci_driver aic7xxx_pci_driver = {
136 .name = "aic7xxx",
137 .probe = ahc_linux_pci_dev_probe,
138#ifdef CONFIG_PM 124#ifdef CONFIG_PM
139 .suspend = ahc_linux_pci_dev_suspend,
140 .resume = ahc_linux_pci_dev_resume,
141#endif
142 .remove = ahc_linux_pci_dev_remove,
143 .id_table = ahc_linux_pci_id_table
144};
145
146static int 125static int
147ahc_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg) 126ahc_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg)
148{ 127{
@@ -182,6 +161,7 @@ ahc_linux_pci_dev_resume(struct pci_dev *pdev)
182 161
183 return (ahc_resume(ahc)); 162 return (ahc_resume(ahc));
184} 163}
164#endif
185 165
186static void 166static void
187ahc_linux_pci_dev_remove(struct pci_dev *pdev) 167ahc_linux_pci_dev_remove(struct pci_dev *pdev)
@@ -289,6 +269,17 @@ ahc_linux_pci_dev_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
289 return (0); 269 return (0);
290} 270}
291 271
272static struct pci_driver aic7xxx_pci_driver = {
273 .name = "aic7xxx",
274 .probe = ahc_linux_pci_dev_probe,
275#ifdef CONFIG_PM
276 .suspend = ahc_linux_pci_dev_suspend,
277 .resume = ahc_linux_pci_dev_resume,
278#endif
279 .remove = ahc_linux_pci_dev_remove,
280 .id_table = ahc_linux_pci_id_table
281};
282
292int 283int
293ahc_linux_pci_init(void) 284ahc_linux_pci_init(void)
294{ 285{
diff --git a/drivers/scsi/aic7xxx/aic7xxx_pci.c b/drivers/scsi/aic7xxx/aic7xxx_pci.c
index ae35937b8055..56848f41e4f9 100644
--- a/drivers/scsi/aic7xxx/aic7xxx_pci.c
+++ b/drivers/scsi/aic7xxx/aic7xxx_pci.c
@@ -2020,6 +2020,7 @@ ahc_pci_chip_init(struct ahc_softc *ahc)
2020 return (ahc_chip_init(ahc)); 2020 return (ahc_chip_init(ahc));
2021} 2021}
2022 2022
2023#ifdef CONFIG_PM
2023void 2024void
2024ahc_pci_resume(struct ahc_softc *ahc) 2025ahc_pci_resume(struct ahc_softc *ahc)
2025{ 2026{
@@ -2051,6 +2052,7 @@ ahc_pci_resume(struct ahc_softc *ahc)
2051 ahc_release_seeprom(&sd); 2052 ahc_release_seeprom(&sd);
2052 } 2053 }
2053} 2054}
2055#endif
2054 2056
2055static int 2057static int
2056ahc_aic785X_setup(struct ahc_softc *ahc) 2058ahc_aic785X_setup(struct ahc_softc *ahc)
diff --git a/drivers/scsi/aic7xxx_old.c b/drivers/scsi/aic7xxx_old.c
index bcb0b870320c..3bfd9296bbfa 100644
--- a/drivers/scsi/aic7xxx_old.c
+++ b/drivers/scsi/aic7xxx_old.c
@@ -11141,7 +11141,6 @@ static struct scsi_host_template driver_template = {
11141 .max_sectors = 2048, 11141 .max_sectors = 2048,
11142 .cmd_per_lun = 3, 11142 .cmd_per_lun = 3,
11143 .use_clustering = ENABLE_CLUSTERING, 11143 .use_clustering = ENABLE_CLUSTERING,
11144 .use_sg_chaining = ENABLE_SG_CHAINING,
11145}; 11144};
11146 11145
11147#include "scsi_module.c" 11146#include "scsi_module.c"
diff --git a/drivers/scsi/arcmsr/arcmsr_hba.c b/drivers/scsi/arcmsr/arcmsr_hba.c
index d80dba913a75..f4a202e8df26 100644
--- a/drivers/scsi/arcmsr/arcmsr_hba.c
+++ b/drivers/scsi/arcmsr/arcmsr_hba.c
@@ -122,7 +122,6 @@ static struct scsi_host_template arcmsr_scsi_host_template = {
122 .max_sectors = ARCMSR_MAX_XFER_SECTORS, 122 .max_sectors = ARCMSR_MAX_XFER_SECTORS,
123 .cmd_per_lun = ARCMSR_MAX_CMD_PERLUN, 123 .cmd_per_lun = ARCMSR_MAX_CMD_PERLUN,
124 .use_clustering = ENABLE_CLUSTERING, 124 .use_clustering = ENABLE_CLUSTERING,
125 .use_sg_chaining = ENABLE_SG_CHAINING,
126 .shost_attrs = arcmsr_host_attrs, 125 .shost_attrs = arcmsr_host_attrs,
127}; 126};
128#ifdef CONFIG_SCSI_ARCMSR_AER 127#ifdef CONFIG_SCSI_ARCMSR_AER
diff --git a/drivers/scsi/dc395x.c b/drivers/scsi/dc395x.c
index f93c73c0ba53..22ef3716e786 100644
--- a/drivers/scsi/dc395x.c
+++ b/drivers/scsi/dc395x.c
@@ -4763,7 +4763,6 @@ static struct scsi_host_template dc395x_driver_template = {
4763 .eh_bus_reset_handler = dc395x_eh_bus_reset, 4763 .eh_bus_reset_handler = dc395x_eh_bus_reset,
4764 .unchecked_isa_dma = 0, 4764 .unchecked_isa_dma = 0,
4765 .use_clustering = DISABLE_CLUSTERING, 4765 .use_clustering = DISABLE_CLUSTERING,
4766 .use_sg_chaining = ENABLE_SG_CHAINING,
4767}; 4766};
4768 4767
4769 4768
diff --git a/drivers/scsi/dpt_i2o.c b/drivers/scsi/dpt_i2o.c
index 19cce125124c..c9dd8392aab2 100644
--- a/drivers/scsi/dpt_i2o.c
+++ b/drivers/scsi/dpt_i2o.c
@@ -3340,7 +3340,6 @@ static struct scsi_host_template driver_template = {
3340 .this_id = 7, 3340 .this_id = 7,
3341 .cmd_per_lun = 1, 3341 .cmd_per_lun = 1,
3342 .use_clustering = ENABLE_CLUSTERING, 3342 .use_clustering = ENABLE_CLUSTERING,
3343 .use_sg_chaining = ENABLE_SG_CHAINING,
3344}; 3343};
3345#include "scsi_module.c" 3344#include "scsi_module.c"
3346MODULE_LICENSE("GPL"); 3345MODULE_LICENSE("GPL");
diff --git a/drivers/scsi/eata.c b/drivers/scsi/eata.c
index 05163cefec12..8be3d76656fa 100644
--- a/drivers/scsi/eata.c
+++ b/drivers/scsi/eata.c
@@ -524,7 +524,6 @@ static struct scsi_host_template driver_template = {
524 .this_id = 7, 524 .this_id = 7,
525 .unchecked_isa_dma = 1, 525 .unchecked_isa_dma = 1,
526 .use_clustering = ENABLE_CLUSTERING, 526 .use_clustering = ENABLE_CLUSTERING,
527 .use_sg_chaining = ENABLE_SG_CHAINING,
528}; 527};
529 528
530#if !defined(__BIG_ENDIAN_BITFIELD) && !defined(__LITTLE_ENDIAN_BITFIELD) 529#if !defined(__BIG_ENDIAN_BITFIELD) && !defined(__LITTLE_ENDIAN_BITFIELD)
diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 5ea1f986220c..880c78bff0e1 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -342,7 +342,6 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
342 shost->use_clustering = sht->use_clustering; 342 shost->use_clustering = sht->use_clustering;
343 shost->ordered_tag = sht->ordered_tag; 343 shost->ordered_tag = sht->ordered_tag;
344 shost->active_mode = sht->supported_mode; 344 shost->active_mode = sht->supported_mode;
345 shost->use_sg_chaining = sht->use_sg_chaining;
346 345
347 if (sht->supported_mode == MODE_UNKNOWN) 346 if (sht->supported_mode == MODE_UNKNOWN)
348 /* means we didn't set it ... default to INITIATOR */ 347 /* means we didn't set it ... default to INITIATOR */
diff --git a/drivers/scsi/hptiop.c b/drivers/scsi/hptiop.c
index e7b2f3575ce9..ff149ad6bc4e 100644
--- a/drivers/scsi/hptiop.c
+++ b/drivers/scsi/hptiop.c
@@ -573,7 +573,7 @@ static void hptiop_finish_scsi_req(struct hptiop_hba *hba, u32 tag,
573 scsi_set_resid(scp, 573 scsi_set_resid(scp,
574 scsi_bufflen(scp) - le32_to_cpu(req->dataxfer_length)); 574 scsi_bufflen(scp) - le32_to_cpu(req->dataxfer_length));
575 scp->result = SAM_STAT_CHECK_CONDITION; 575 scp->result = SAM_STAT_CHECK_CONDITION;
576 memcpy(&scp->sense_buffer, &req->sg_list, 576 memcpy(scp->sense_buffer, &req->sg_list,
577 min_t(size_t, SCSI_SENSE_BUFFERSIZE, 577 min_t(size_t, SCSI_SENSE_BUFFERSIZE,
578 le32_to_cpu(req->dataxfer_length))); 578 le32_to_cpu(req->dataxfer_length)));
579 break; 579 break;
@@ -906,7 +906,6 @@ static struct scsi_host_template driver_template = {
906 .unchecked_isa_dma = 0, 906 .unchecked_isa_dma = 0,
907 .emulated = 0, 907 .emulated = 0,
908 .use_clustering = ENABLE_CLUSTERING, 908 .use_clustering = ENABLE_CLUSTERING,
909 .use_sg_chaining = ENABLE_SG_CHAINING,
910 .proc_name = driver_name, 909 .proc_name = driver_name,
911 .shost_attrs = hptiop_attrs, 910 .shost_attrs = hptiop_attrs,
912 .this_id = -1, 911 .this_id = -1,
diff --git a/drivers/scsi/ibmmca.c b/drivers/scsi/ibmmca.c
index db004a450732..4d15a62914e9 100644
--- a/drivers/scsi/ibmmca.c
+++ b/drivers/scsi/ibmmca.c
@@ -1501,7 +1501,6 @@ static struct scsi_host_template ibmmca_driver_template = {
1501 .sg_tablesize = 16, 1501 .sg_tablesize = 16,
1502 .cmd_per_lun = 1, 1502 .cmd_per_lun = 1,
1503 .use_clustering = ENABLE_CLUSTERING, 1503 .use_clustering = ENABLE_CLUSTERING,
1504 .use_sg_chaining = ENABLE_SG_CHAINING,
1505}; 1504};
1506 1505
1507static int ibmmca_probe(struct device *dev) 1506static int ibmmca_probe(struct device *dev)
diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c
index 30819012898f..78d46a900bb5 100644
--- a/drivers/scsi/ibmvscsi/ibmvscsi.c
+++ b/drivers/scsi/ibmvscsi/ibmvscsi.c
@@ -1600,7 +1600,6 @@ static struct scsi_host_template driver_template = {
1600 .this_id = -1, 1600 .this_id = -1,
1601 .sg_tablesize = SG_ALL, 1601 .sg_tablesize = SG_ALL,
1602 .use_clustering = ENABLE_CLUSTERING, 1602 .use_clustering = ENABLE_CLUSTERING,
1603 .use_sg_chaining = ENABLE_SG_CHAINING,
1604 .shost_attrs = ibmvscsi_attrs, 1603 .shost_attrs = ibmvscsi_attrs,
1605}; 1604};
1606 1605
diff --git a/drivers/scsi/initio.c b/drivers/scsi/initio.c
index a10a5c74b48d..0cc8868ea35d 100644
--- a/drivers/scsi/initio.c
+++ b/drivers/scsi/initio.c
@@ -2833,7 +2833,6 @@ static struct scsi_host_template initio_template = {
2833 .sg_tablesize = SG_ALL, 2833 .sg_tablesize = SG_ALL,
2834 .cmd_per_lun = 1, 2834 .cmd_per_lun = 1,
2835 .use_clustering = ENABLE_CLUSTERING, 2835 .use_clustering = ENABLE_CLUSTERING,
2836 .use_sg_chaining = ENABLE_SG_CHAINING,
2837}; 2836};
2838 2837
2839static int initio_probe_one(struct pci_dev *pdev, 2838static int initio_probe_one(struct pci_dev *pdev,
diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index e5be5fd4ef58..b6f99dfbb038 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -1933,7 +1933,6 @@ static struct scsi_host_template iscsi_sht = {
1933 .eh_device_reset_handler= iscsi_eh_device_reset, 1933 .eh_device_reset_handler= iscsi_eh_device_reset,
1934 .eh_host_reset_handler = iscsi_eh_host_reset, 1934 .eh_host_reset_handler = iscsi_eh_host_reset,
1935 .use_clustering = DISABLE_CLUSTERING, 1935 .use_clustering = DISABLE_CLUSTERING,
1936 .use_sg_chaining = ENABLE_SG_CHAINING,
1937 .slave_configure = iscsi_tcp_slave_configure, 1936 .slave_configure = iscsi_tcp_slave_configure,
1938 .proc_name = "iscsi_tcp", 1937 .proc_name = "iscsi_tcp",
1939 .this_id = -1, 1938 .this_id = -1,
diff --git a/drivers/scsi/libsrp.c b/drivers/scsi/libsrp.c
index 5cff0204227d..6d6a76e65a6c 100644
--- a/drivers/scsi/libsrp.c
+++ b/drivers/scsi/libsrp.c
@@ -426,8 +426,8 @@ int srp_cmd_queue(struct Scsi_Host *shost, struct srp_cmd *cmd, void *info,
426 426
427 sc->SCp.ptr = info; 427 sc->SCp.ptr = info;
428 memcpy(sc->cmnd, cmd->cdb, MAX_COMMAND_SIZE); 428 memcpy(sc->cmnd, cmd->cdb, MAX_COMMAND_SIZE);
429 sc->request_bufflen = len; 429 sc->sdb.length = len;
430 sc->request_buffer = (void *) (unsigned long) addr; 430 sc->sdb.table.sgl = (void *) (unsigned long) addr;
431 sc->tag = tag; 431 sc->tag = tag;
432 err = scsi_tgt_queue_command(sc, itn_id, (struct scsi_lun *)&cmd->lun, 432 err = scsi_tgt_queue_command(sc, itn_id, (struct scsi_lun *)&cmd->lun,
433 cmd->tag); 433 cmd->tag);
diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c
index 6483c62730b3..fc5c3a42b05a 100644
--- a/drivers/scsi/lpfc/lpfc_scsi.c
+++ b/drivers/scsi/lpfc/lpfc_scsi.c
@@ -1459,7 +1459,6 @@ struct scsi_host_template lpfc_template = {
1459 .scan_finished = lpfc_scan_finished, 1459 .scan_finished = lpfc_scan_finished,
1460 .this_id = -1, 1460 .this_id = -1,
1461 .sg_tablesize = LPFC_DEFAULT_SG_SEG_CNT, 1461 .sg_tablesize = LPFC_DEFAULT_SG_SEG_CNT,
1462 .use_sg_chaining = ENABLE_SG_CHAINING,
1463 .cmd_per_lun = LPFC_CMD_PER_LUN, 1462 .cmd_per_lun = LPFC_CMD_PER_LUN,
1464 .use_clustering = ENABLE_CLUSTERING, 1463 .use_clustering = ENABLE_CLUSTERING,
1465 .shost_attrs = lpfc_hba_attrs, 1464 .shost_attrs = lpfc_hba_attrs,
@@ -1482,7 +1481,6 @@ struct scsi_host_template lpfc_vport_template = {
1482 .sg_tablesize = LPFC_DEFAULT_SG_SEG_CNT, 1481 .sg_tablesize = LPFC_DEFAULT_SG_SEG_CNT,
1483 .cmd_per_lun = LPFC_CMD_PER_LUN, 1482 .cmd_per_lun = LPFC_CMD_PER_LUN,
1484 .use_clustering = ENABLE_CLUSTERING, 1483 .use_clustering = ENABLE_CLUSTERING,
1485 .use_sg_chaining = ENABLE_SG_CHAINING,
1486 .shost_attrs = lpfc_vport_attrs, 1484 .shost_attrs = lpfc_vport_attrs,
1487 .max_sectors = 0xFFFF, 1485 .max_sectors = 0xFFFF,
1488}; 1486};
diff --git a/drivers/scsi/mac53c94.c b/drivers/scsi/mac53c94.c
index a035001f4438..b12ad7c7c673 100644
--- a/drivers/scsi/mac53c94.c
+++ b/drivers/scsi/mac53c94.c
@@ -402,7 +402,6 @@ static struct scsi_host_template mac53c94_template = {
402 .sg_tablesize = SG_ALL, 402 .sg_tablesize = SG_ALL,
403 .cmd_per_lun = 1, 403 .cmd_per_lun = 1,
404 .use_clustering = DISABLE_CLUSTERING, 404 .use_clustering = DISABLE_CLUSTERING,
405 .use_sg_chaining = ENABLE_SG_CHAINING,
406}; 405};
407 406
408static int mac53c94_probe(struct macio_dev *mdev, const struct of_device_id *match) 407static int mac53c94_probe(struct macio_dev *mdev, const struct of_device_id *match)
diff --git a/drivers/scsi/megaraid.c b/drivers/scsi/megaraid.c
index 765c24d2bc38..4d59ae8491a4 100644
--- a/drivers/scsi/megaraid.c
+++ b/drivers/scsi/megaraid.c
@@ -4490,7 +4490,6 @@ static struct scsi_host_template megaraid_template = {
4490 .sg_tablesize = MAX_SGLIST, 4490 .sg_tablesize = MAX_SGLIST,
4491 .cmd_per_lun = DEF_CMD_PER_LUN, 4491 .cmd_per_lun = DEF_CMD_PER_LUN,
4492 .use_clustering = ENABLE_CLUSTERING, 4492 .use_clustering = ENABLE_CLUSTERING,
4493 .use_sg_chaining = ENABLE_SG_CHAINING,
4494 .eh_abort_handler = megaraid_abort, 4493 .eh_abort_handler = megaraid_abort,
4495 .eh_device_reset_handler = megaraid_reset, 4494 .eh_device_reset_handler = megaraid_reset,
4496 .eh_bus_reset_handler = megaraid_reset, 4495 .eh_bus_reset_handler = megaraid_reset,
diff --git a/drivers/scsi/megaraid/megaraid_mbox.c b/drivers/scsi/megaraid/megaraid_mbox.c
index 24e32e446e76..6db77c00e3ee 100644
--- a/drivers/scsi/megaraid/megaraid_mbox.c
+++ b/drivers/scsi/megaraid/megaraid_mbox.c
@@ -361,7 +361,6 @@ static struct scsi_host_template megaraid_template_g = {
361 .eh_host_reset_handler = megaraid_reset_handler, 361 .eh_host_reset_handler = megaraid_reset_handler,
362 .change_queue_depth = megaraid_change_queue_depth, 362 .change_queue_depth = megaraid_change_queue_depth,
363 .use_clustering = ENABLE_CLUSTERING, 363 .use_clustering = ENABLE_CLUSTERING,
364 .use_sg_chaining = ENABLE_SG_CHAINING,
365 .sdev_attrs = megaraid_sdev_attrs, 364 .sdev_attrs = megaraid_sdev_attrs,
366 .shost_attrs = megaraid_shost_attrs, 365 .shost_attrs = megaraid_shost_attrs,
367}; 366};
diff --git a/drivers/scsi/megaraid/megaraid_sas.c b/drivers/scsi/megaraid/megaraid_sas.c
index d7ec921865c4..672c759ac24d 100644
--- a/drivers/scsi/megaraid/megaraid_sas.c
+++ b/drivers/scsi/megaraid/megaraid_sas.c
@@ -1192,7 +1192,6 @@ static struct scsi_host_template megasas_template = {
1192 .eh_timed_out = megasas_reset_timer, 1192 .eh_timed_out = megasas_reset_timer,
1193 .bios_param = megasas_bios_param, 1193 .bios_param = megasas_bios_param,
1194 .use_clustering = ENABLE_CLUSTERING, 1194 .use_clustering = ENABLE_CLUSTERING,
1195 .use_sg_chaining = ENABLE_SG_CHAINING,
1196}; 1195};
1197 1196
1198/** 1197/**
diff --git a/drivers/scsi/mesh.c b/drivers/scsi/mesh.c
index 7470ff39ab22..651d09b08f2a 100644
--- a/drivers/scsi/mesh.c
+++ b/drivers/scsi/mesh.c
@@ -1843,7 +1843,6 @@ static struct scsi_host_template mesh_template = {
1843 .sg_tablesize = SG_ALL, 1843 .sg_tablesize = SG_ALL,
1844 .cmd_per_lun = 2, 1844 .cmd_per_lun = 2,
1845 .use_clustering = DISABLE_CLUSTERING, 1845 .use_clustering = DISABLE_CLUSTERING,
1846 .use_sg_chaining = ENABLE_SG_CHAINING,
1847}; 1846};
1848 1847
1849static int mesh_probe(struct macio_dev *mdev, const struct of_device_id *match) 1848static int mesh_probe(struct macio_dev *mdev, const struct of_device_id *match)
diff --git a/drivers/scsi/ncr53c8xx.c b/drivers/scsi/ncr53c8xx.c
index c02771aa6c9b..c5ebf018b378 100644
--- a/drivers/scsi/ncr53c8xx.c
+++ b/drivers/scsi/ncr53c8xx.c
@@ -4967,7 +4967,7 @@ void ncr_complete (struct ncb *np, struct ccb *cp)
4967 sizeof(cp->sense_buf))); 4967 sizeof(cp->sense_buf)));
4968 4968
4969 if (DEBUG_FLAGS & (DEBUG_RESULT|DEBUG_TINY)) { 4969 if (DEBUG_FLAGS & (DEBUG_RESULT|DEBUG_TINY)) {
4970 u_char * p = (u_char*) & cmd->sense_buffer; 4970 u_char *p = cmd->sense_buffer;
4971 int i; 4971 int i;
4972 PRINT_ADDR(cmd, "sense data:"); 4972 PRINT_ADDR(cmd, "sense data:");
4973 for (i=0; i<14; i++) printk (" %x", *p++); 4973 for (i=0; i<14; i++) printk (" %x", *p++);
diff --git a/drivers/scsi/nsp32.c b/drivers/scsi/nsp32.c
index 28161dc95e0d..7fed35372150 100644
--- a/drivers/scsi/nsp32.c
+++ b/drivers/scsi/nsp32.c
@@ -281,7 +281,6 @@ static struct scsi_host_template nsp32_template = {
281 .cmd_per_lun = 1, 281 .cmd_per_lun = 1,
282 .this_id = NSP32_HOST_SCSIID, 282 .this_id = NSP32_HOST_SCSIID,
283 .use_clustering = DISABLE_CLUSTERING, 283 .use_clustering = DISABLE_CLUSTERING,
284 .use_sg_chaining = ENABLE_SG_CHAINING,
285 .eh_abort_handler = nsp32_eh_abort, 284 .eh_abort_handler = nsp32_eh_abort,
286 .eh_bus_reset_handler = nsp32_eh_bus_reset, 285 .eh_bus_reset_handler = nsp32_eh_bus_reset,
287 .eh_host_reset_handler = nsp32_eh_host_reset, 286 .eh_host_reset_handler = nsp32_eh_host_reset,
diff --git a/drivers/scsi/pcmcia/sym53c500_cs.c b/drivers/scsi/pcmcia/sym53c500_cs.c
index 969b9387a0c3..3454a5714749 100644
--- a/drivers/scsi/pcmcia/sym53c500_cs.c
+++ b/drivers/scsi/pcmcia/sym53c500_cs.c
@@ -692,7 +692,6 @@ static struct scsi_host_template sym53c500_driver_template = {
692 .sg_tablesize = 32, 692 .sg_tablesize = 32,
693 .cmd_per_lun = 1, 693 .cmd_per_lun = 1,
694 .use_clustering = ENABLE_CLUSTERING, 694 .use_clustering = ENABLE_CLUSTERING,
695 .use_sg_chaining = ENABLE_SG_CHAINING,
696 .shost_attrs = SYM53C500_shost_attrs 695 .shost_attrs = SYM53C500_shost_attrs
697}; 696};
698 697
diff --git a/drivers/scsi/qla1280.c b/drivers/scsi/qla1280.c
index c94906abfee3..68c0d09ffe78 100644
--- a/drivers/scsi/qla1280.c
+++ b/drivers/scsi/qla1280.c
@@ -4204,7 +4204,6 @@ static struct scsi_host_template qla1280_driver_template = {
4204 .sg_tablesize = SG_ALL, 4204 .sg_tablesize = SG_ALL,
4205 .cmd_per_lun = 1, 4205 .cmd_per_lun = 1,
4206 .use_clustering = ENABLE_CLUSTERING, 4206 .use_clustering = ENABLE_CLUSTERING,
4207 .use_sg_chaining = ENABLE_SG_CHAINING,
4208}; 4207};
4209 4208
4210 4209
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index aba1e6d48066..3954ed2d7b51 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -131,7 +131,6 @@ static struct scsi_host_template qla2x00_driver_template = {
131 .this_id = -1, 131 .this_id = -1,
132 .cmd_per_lun = 3, 132 .cmd_per_lun = 3,
133 .use_clustering = ENABLE_CLUSTERING, 133 .use_clustering = ENABLE_CLUSTERING,
134 .use_sg_chaining = ENABLE_SG_CHAINING,
135 .sg_tablesize = SG_ALL, 134 .sg_tablesize = SG_ALL,
136 135
137 /* 136 /*
@@ -163,7 +162,6 @@ struct scsi_host_template qla24xx_driver_template = {
163 .this_id = -1, 162 .this_id = -1,
164 .cmd_per_lun = 3, 163 .cmd_per_lun = 3,
165 .use_clustering = ENABLE_CLUSTERING, 164 .use_clustering = ENABLE_CLUSTERING,
166 .use_sg_chaining = ENABLE_SG_CHAINING,
167 .sg_tablesize = SG_ALL, 165 .sg_tablesize = SG_ALL,
168 166
169 .max_sectors = 0xFFFF, 167 .max_sectors = 0xFFFF,
diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c
index d3f86646cb08..2e2b9fedffcc 100644
--- a/drivers/scsi/qla4xxx/ql4_os.c
+++ b/drivers/scsi/qla4xxx/ql4_os.c
@@ -94,7 +94,6 @@ static struct scsi_host_template qla4xxx_driver_template = {
94 .this_id = -1, 94 .this_id = -1,
95 .cmd_per_lun = 3, 95 .cmd_per_lun = 3,
96 .use_clustering = ENABLE_CLUSTERING, 96 .use_clustering = ENABLE_CLUSTERING,
97 .use_sg_chaining = ENABLE_SG_CHAINING,
98 .sg_tablesize = SG_ALL, 97 .sg_tablesize = SG_ALL,
99 98
100 .max_sectors = 0xFFFF, 99 .max_sectors = 0xFFFF,
diff --git a/drivers/scsi/qlogicfas.c b/drivers/scsi/qlogicfas.c
index 1769f965eedf..1e874f1fb5c6 100644
--- a/drivers/scsi/qlogicfas.c
+++ b/drivers/scsi/qlogicfas.c
@@ -197,7 +197,6 @@ static struct scsi_host_template qlogicfas_driver_template = {
197 .sg_tablesize = SG_ALL, 197 .sg_tablesize = SG_ALL,
198 .cmd_per_lun = 1, 198 .cmd_per_lun = 1,
199 .use_clustering = DISABLE_CLUSTERING, 199 .use_clustering = DISABLE_CLUSTERING,
200 .use_sg_chaining = ENABLE_SG_CHAINING,
201}; 200};
202 201
203static __init int qlogicfas_init(void) 202static __init int qlogicfas_init(void)
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index 1a9fba6a9f92..b35d19472caa 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -757,7 +757,7 @@ void scsi_finish_command(struct scsi_cmnd *cmd)
757 "Notifying upper driver of completion " 757 "Notifying upper driver of completion "
758 "(result %x)\n", cmd->result)); 758 "(result %x)\n", cmd->result));
759 759
760 good_bytes = cmd->request_bufflen; 760 good_bytes = scsi_bufflen(cmd);
761 if (cmd->request->cmd_type != REQ_TYPE_BLOCK_PC) { 761 if (cmd->request->cmd_type != REQ_TYPE_BLOCK_PC) {
762 drv = scsi_cmd_to_driver(cmd); 762 drv = scsi_cmd_to_driver(cmd);
763 if (drv->done) 763 if (drv->done)
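
The change from cmd->request_bufflen to scsi_bufflen(cmd) is part of the accessor-based style this series moves to: drivers read the data-buffer geometry through helpers instead of touching request_buffer/request_bufflen/use_sg directly. A hedged sketch of a completion path written only against accessors that appear elsewhere in this diff (foo_complete is illustrative):

/* Sketch: driver completion bookkeeping via the scsi_cmnd accessors. */
static void foo_complete(struct scsi_cmnd *cmd, unsigned int bytes_done)
{
	unsigned int requested = scsi_bufflen(cmd);	/* total bytes asked for */
	struct scatterlist *sg;
	int i;

	/* report what was not transferred */
	scsi_set_resid(cmd, requested - bytes_done);

	/* walk the command's scatterlist if per-segment work is needed */
	scsi_for_each_sg(cmd, sg, scsi_sg_count(cmd), i) {
		/* ... */
	}
}
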
diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c
index 82c06f0a9d02..1541c174937a 100644
--- a/drivers/scsi/scsi_debug.c
+++ b/drivers/scsi/scsi_debug.c
@@ -280,6 +280,8 @@ static int resp_write(struct scsi_cmnd * SCpnt, unsigned long long lba,
280 unsigned int num, struct sdebug_dev_info * devip); 280 unsigned int num, struct sdebug_dev_info * devip);
281static int resp_report_luns(struct scsi_cmnd * SCpnt, 281static int resp_report_luns(struct scsi_cmnd * SCpnt,
282 struct sdebug_dev_info * devip); 282 struct sdebug_dev_info * devip);
283static int resp_xdwriteread(struct scsi_cmnd *scp, unsigned long long lba,
284 unsigned int num, struct sdebug_dev_info *devip);
283static int fill_from_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr, 285static int fill_from_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr,
284 int arr_len); 286 int arr_len);
285static int fetch_to_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr, 287static int fetch_to_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr,
@@ -311,12 +313,48 @@ static void sdebug_max_tgts_luns(void);
311static struct device pseudo_primary; 313static struct device pseudo_primary;
312static struct bus_type pseudo_lld_bus; 314static struct bus_type pseudo_lld_bus;
313 315
316static void get_data_transfer_info(unsigned char *cmd,
317 unsigned long long *lba, unsigned int *num)
318{
319 int i;
320
321 switch (*cmd) {
322 case WRITE_16:
323 case READ_16:
324 for (*lba = 0, i = 0; i < 8; ++i) {
325 if (i > 0)
326 *lba <<= 8;
327 *lba += cmd[2 + i];
328 }
329 *num = cmd[13] + (cmd[12] << 8) +
330 (cmd[11] << 16) + (cmd[10] << 24);
331 break;
332 case WRITE_12:
333 case READ_12:
334 *lba = cmd[5] + (cmd[4] << 8) + (cmd[3] << 16) + (cmd[2] << 24);
335 *num = cmd[9] + (cmd[8] << 8) + (cmd[7] << 16) + (cmd[6] << 24);
336 break;
337 case WRITE_10:
338 case READ_10:
339 case XDWRITEREAD_10:
340 *lba = cmd[5] + (cmd[4] << 8) + (cmd[3] << 16) + (cmd[2] << 24);
341 *num = cmd[8] + (cmd[7] << 8);
342 break;
343 case WRITE_6:
344 case READ_6:
345 *lba = cmd[3] + (cmd[2] << 8) + ((cmd[1] & 0x1f) << 16);
346 *num = (0 == cmd[4]) ? 256 : cmd[4];
347 break;
348 default:
349 break;
350 }
351}
314 352
315static 353static
316int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done) 354int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done)
317{ 355{
318 unsigned char *cmd = (unsigned char *) SCpnt->cmnd; 356 unsigned char *cmd = (unsigned char *) SCpnt->cmnd;
319 int len, k, j; 357 int len, k;
320 unsigned int num; 358 unsigned int num;
321 unsigned long long lba; 359 unsigned long long lba;
322 int errsts = 0; 360 int errsts = 0;
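
get_data_transfer_info() consolidates the LBA/length decoding that the READ and WRITE arms below previously duplicated per opcode; for the 10-byte CDBs the LBA is big-endian in bytes 2-5 and the block count in bytes 7-8. A stand-alone, user-space check of that arithmetic (the sample CDB values are illustrative):

#include <stdio.h>

int main(void)
{
	/* READ(10): opcode 0x28, LBA 0x00001000, transfer length 8 blocks */
	unsigned char cdb[10] = { 0x28, 0, 0x00, 0x00, 0x10, 0x00, 0, 0x00, 0x08, 0 };
	unsigned long long lba;
	unsigned int num;

	/* Same arithmetic as the READ_10/WRITE_10/XDWRITEREAD_10 case above. */
	lba = cdb[5] + (cdb[4] << 8) + (cdb[3] << 16) + (cdb[2] << 24);
	num = cdb[8] + (cdb[7] << 8);

	printf("lba=%llu num=%u\n", lba, num);	/* prints: lba=4096 num=8 */
	return 0;
}
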
@@ -452,28 +490,7 @@ int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done)
452 break; 490 break;
453 if (scsi_debug_fake_rw) 491 if (scsi_debug_fake_rw)
454 break; 492 break;
455 if ((*cmd) == READ_16) { 493 get_data_transfer_info(cmd, &lba, &num);
456 for (lba = 0, j = 0; j < 8; ++j) {
457 if (j > 0)
458 lba <<= 8;
459 lba += cmd[2 + j];
460 }
461 num = cmd[13] + (cmd[12] << 8) +
462 (cmd[11] << 16) + (cmd[10] << 24);
463 } else if ((*cmd) == READ_12) {
464 lba = cmd[5] + (cmd[4] << 8) +
465 (cmd[3] << 16) + (cmd[2] << 24);
466 num = cmd[9] + (cmd[8] << 8) +
467 (cmd[7] << 16) + (cmd[6] << 24);
468 } else if ((*cmd) == READ_10) {
469 lba = cmd[5] + (cmd[4] << 8) +
470 (cmd[3] << 16) + (cmd[2] << 24);
471 num = cmd[8] + (cmd[7] << 8);
472 } else { /* READ (6) */
473 lba = cmd[3] + (cmd[2] << 8) +
474 ((cmd[1] & 0x1f) << 16);
475 num = (0 == cmd[4]) ? 256 : cmd[4];
476 }
477 errsts = resp_read(SCpnt, lba, num, devip); 494 errsts = resp_read(SCpnt, lba, num, devip);
478 if (inj_recovered && (0 == errsts)) { 495 if (inj_recovered && (0 == errsts)) {
479 mk_sense_buffer(devip, RECOVERED_ERROR, 496 mk_sense_buffer(devip, RECOVERED_ERROR,
@@ -500,28 +517,7 @@ int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done)
500 break; 517 break;
501 if (scsi_debug_fake_rw) 518 if (scsi_debug_fake_rw)
502 break; 519 break;
503 if ((*cmd) == WRITE_16) { 520 get_data_transfer_info(cmd, &lba, &num);
504 for (lba = 0, j = 0; j < 8; ++j) {
505 if (j > 0)
506 lba <<= 8;
507 lba += cmd[2 + j];
508 }
509 num = cmd[13] + (cmd[12] << 8) +
510 (cmd[11] << 16) + (cmd[10] << 24);
511 } else if ((*cmd) == WRITE_12) {
512 lba = cmd[5] + (cmd[4] << 8) +
513 (cmd[3] << 16) + (cmd[2] << 24);
514 num = cmd[9] + (cmd[8] << 8) +
515 (cmd[7] << 16) + (cmd[6] << 24);
516 } else if ((*cmd) == WRITE_10) {
517 lba = cmd[5] + (cmd[4] << 8) +
518 (cmd[3] << 16) + (cmd[2] << 24);
519 num = cmd[8] + (cmd[7] << 8);
520 } else { /* WRITE (6) */
521 lba = cmd[3] + (cmd[2] << 8) +
522 ((cmd[1] & 0x1f) << 16);
523 num = (0 == cmd[4]) ? 256 : cmd[4];
524 }
525 errsts = resp_write(SCpnt, lba, num, devip); 521 errsts = resp_write(SCpnt, lba, num, devip);
526 if (inj_recovered && (0 == errsts)) { 522 if (inj_recovered && (0 == errsts)) {
527 mk_sense_buffer(devip, RECOVERED_ERROR, 523 mk_sense_buffer(devip, RECOVERED_ERROR,
@@ -549,6 +545,28 @@ int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done)
549 case WRITE_BUFFER: 545 case WRITE_BUFFER:
550 errsts = check_readiness(SCpnt, 1, devip); 546 errsts = check_readiness(SCpnt, 1, devip);
551 break; 547 break;
548 case XDWRITEREAD_10:
549 if (!scsi_bidi_cmnd(SCpnt)) {
550 mk_sense_buffer(devip, ILLEGAL_REQUEST,
551 INVALID_FIELD_IN_CDB, 0);
552 errsts = check_condition_result;
553 break;
554 }
555
556 errsts = check_readiness(SCpnt, 0, devip);
557 if (errsts)
558 break;
559 if (scsi_debug_fake_rw)
560 break;
561 get_data_transfer_info(cmd, &lba, &num);
562 errsts = resp_read(SCpnt, lba, num, devip);
563 if (errsts)
564 break;
565 errsts = resp_write(SCpnt, lba, num, devip);
566 if (errsts)
567 break;
568 errsts = resp_xdwriteread(SCpnt, lba, num, devip);
569 break;
552 default: 570 default:
553 if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts) 571 if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts)
554 printk(KERN_INFO "scsi_debug: Opcode: 0x%x not " 572 printk(KERN_INFO "scsi_debug: Opcode: 0x%x not "
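
The XDWRITEREAD_10 arm above emulates the bidirectional command in three steps: resp_read() stages the current blocks into the bidi in-buffer, resp_write() stores the new data, and resp_xdwriteread() (added further down) XORs the written payload into the in-buffer, so what comes back is the XOR of the previous contents and the written data. A tiny user-space illustration of that data relationship (sample bytes are arbitrary):

#include <stdio.h>

int main(void)
{
	unsigned char old_data[4] = { 0x11, 0x22, 0x33, 0x44 };	/* on "disk" */
	unsigned char new_data[4] = { 0xff, 0x22, 0x30, 0x00 };	/* written   */
	unsigned char returned[4];
	int i;

	/* The emulation's net effect on the returned (in) buffer. */
	for (i = 0; i < 4; i++)
		returned[i] = old_data[i] ^ new_data[i];

	for (i = 0; i < 4; i++)
		printf("%02x ", returned[i]);	/* prints: ee 00 03 44 */
	printf("\n");
	return 0;
}
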
@@ -601,18 +619,18 @@ static int fill_from_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr,
601 int k, req_len, act_len, len, active; 619 int k, req_len, act_len, len, active;
602 void * kaddr; 620 void * kaddr;
603 void * kaddr_off; 621 void * kaddr_off;
604 struct scatterlist * sg; 622 struct scatterlist *sg;
623 struct scsi_data_buffer *sdb = scsi_in(scp);
605 624
606 if (0 == scsi_bufflen(scp)) 625 if (!sdb->length)
607 return 0; 626 return 0;
608 if (NULL == scsi_sglist(scp)) 627 if (!sdb->table.sgl)
609 return (DID_ERROR << 16); 628 return (DID_ERROR << 16);
610 if (! ((scp->sc_data_direction == DMA_BIDIRECTIONAL) || 629 if (!(scsi_bidi_cmnd(scp) || scp->sc_data_direction == DMA_FROM_DEVICE))
611 (scp->sc_data_direction == DMA_FROM_DEVICE)))
612 return (DID_ERROR << 16); 630 return (DID_ERROR << 16);
613 active = 1; 631 active = 1;
614 req_len = act_len = 0; 632 req_len = act_len = 0;
615 scsi_for_each_sg(scp, sg, scsi_sg_count(scp), k) { 633 for_each_sg(sdb->table.sgl, sg, sdb->table.nents, k) {
616 if (active) { 634 if (active) {
617 kaddr = (unsigned char *) 635 kaddr = (unsigned char *)
618 kmap_atomic(sg_page(sg), KM_USER0); 636 kmap_atomic(sg_page(sg), KM_USER0);
@@ -630,10 +648,10 @@ static int fill_from_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr,
630 } 648 }
631 req_len += sg->length; 649 req_len += sg->length;
632 } 650 }
633 if (scsi_get_resid(scp)) 651 if (sdb->resid)
634 scsi_set_resid(scp, scsi_get_resid(scp) - act_len); 652 sdb->resid -= act_len;
635 else 653 else
636 scsi_set_resid(scp, req_len - act_len); 654 sdb->resid = req_len - act_len;
637 return 0; 655 return 0;
638} 656}
639 657
@@ -650,8 +668,7 @@ static int fetch_to_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr,
650 return 0; 668 return 0;
651 if (NULL == scsi_sglist(scp)) 669 if (NULL == scsi_sglist(scp))
652 return -1; 670 return -1;
653 if (! ((scp->sc_data_direction == DMA_BIDIRECTIONAL) || 671 if (!(scsi_bidi_cmnd(scp) || scp->sc_data_direction == DMA_TO_DEVICE))
654 (scp->sc_data_direction == DMA_TO_DEVICE)))
655 return -1; 672 return -1;
656 req_len = fin = 0; 673 req_len = fin = 0;
657 scsi_for_each_sg(scp, sg, scsi_sg_count(scp), k) { 674 scsi_for_each_sg(scp, sg, scsi_sg_count(scp), k) {
@@ -1956,6 +1973,50 @@ static int resp_report_luns(struct scsi_cmnd * scp,
1956 min((int)alloc_len, SDEBUG_RLUN_ARR_SZ)); 1973 min((int)alloc_len, SDEBUG_RLUN_ARR_SZ));
1957} 1974}
1958 1975
1976static int resp_xdwriteread(struct scsi_cmnd *scp, unsigned long long lba,
1977 unsigned int num, struct sdebug_dev_info *devip)
1978{
1979 int i, j, ret = -1;
1980 unsigned char *kaddr, *buf;
1981 unsigned int offset;
1982 struct scatterlist *sg;
1983 struct scsi_data_buffer *sdb = scsi_in(scp);
1984
1985 /* better not to use temporary buffer. */
1986 buf = kmalloc(scsi_bufflen(scp), GFP_ATOMIC);
1987 if (!buf)
1988 return ret;
1989
1990 offset = 0;
1991 scsi_for_each_sg(scp, sg, scsi_sg_count(scp), i) {
1992 kaddr = (unsigned char *)kmap_atomic(sg_page(sg), KM_USER0);
1993 if (!kaddr)
1994 goto out;
1995
1996 memcpy(buf + offset, kaddr + sg->offset, sg->length);
1997 offset += sg->length;
1998 kunmap_atomic(kaddr, KM_USER0);
1999 }
2000
2001 offset = 0;
2002 for_each_sg(sdb->table.sgl, sg, sdb->table.nents, i) {
2003 kaddr = (unsigned char *)kmap_atomic(sg_page(sg), KM_USER0);
2004 if (!kaddr)
2005 goto out;
2006
2007 for (j = 0; j < sg->length; j++)
2008 *(kaddr + sg->offset + j) ^= *(buf + offset + j);
2009
2010 offset += sg->length;
2011 kunmap_atomic(kaddr, KM_USER0);
2012 }
2013 ret = 0;
2014out:
2015 kfree(buf);
2016
2017 return ret;
2018}
2019
1959/* When timer goes off this function is called. */ 2020/* When timer goes off this function is called. */
1960static void timer_intr_handler(unsigned long indx) 2021static void timer_intr_handler(unsigned long indx)
1961{ 2022{
@@ -1989,6 +2050,7 @@ static int scsi_debug_slave_alloc(struct scsi_device * sdp)
1989 if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts) 2050 if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts)
1990 printk(KERN_INFO "scsi_debug: slave_alloc <%u %u %u %u>\n", 2051 printk(KERN_INFO "scsi_debug: slave_alloc <%u %u %u %u>\n",
1991 sdp->host->host_no, sdp->channel, sdp->id, sdp->lun); 2052 sdp->host->host_no, sdp->channel, sdp->id, sdp->lun);
2053 set_bit(QUEUE_FLAG_BIDI, &sdp->request_queue->queue_flags);
1992 return 0; 2054 return 0;
1993} 2055}
1994 2056
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 547e85aa414f..045a0868fc7b 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -617,29 +617,27 @@ void scsi_eh_prep_cmnd(struct scsi_cmnd *scmd, struct scsi_eh_save *ses,
617 ses->cmd_len = scmd->cmd_len; 617 ses->cmd_len = scmd->cmd_len;
618 memcpy(ses->cmnd, scmd->cmnd, sizeof(scmd->cmnd)); 618 memcpy(ses->cmnd, scmd->cmnd, sizeof(scmd->cmnd));
619 ses->data_direction = scmd->sc_data_direction; 619 ses->data_direction = scmd->sc_data_direction;
620 ses->bufflen = scmd->request_bufflen; 620 ses->sdb = scmd->sdb;
621 ses->buffer = scmd->request_buffer; 621 ses->next_rq = scmd->request->next_rq;
622 ses->use_sg = scmd->use_sg;
623 ses->resid = scmd->resid;
624 ses->result = scmd->result; 622 ses->result = scmd->result;
625 623
624 memset(&scmd->sdb, 0, sizeof(scmd->sdb));
625 scmd->request->next_rq = NULL;
626
626 if (sense_bytes) { 627 if (sense_bytes) {
627 scmd->request_bufflen = min_t(unsigned, 628 scmd->sdb.length = min_t(unsigned, SCSI_SENSE_BUFFERSIZE,
628 SCSI_SENSE_BUFFERSIZE, sense_bytes); 629 sense_bytes);
629 sg_init_one(&ses->sense_sgl, scmd->sense_buffer, 630 sg_init_one(&ses->sense_sgl, scmd->sense_buffer,
630 scmd->request_bufflen); 631 scmd->sdb.length);
631 scmd->request_buffer = &ses->sense_sgl; 632 scmd->sdb.table.sgl = &ses->sense_sgl;
632 scmd->sc_data_direction = DMA_FROM_DEVICE; 633 scmd->sc_data_direction = DMA_FROM_DEVICE;
633 scmd->use_sg = 1; 634 scmd->sdb.table.nents = 1;
634 memset(scmd->cmnd, 0, sizeof(scmd->cmnd)); 635 memset(scmd->cmnd, 0, sizeof(scmd->cmnd));
635 scmd->cmnd[0] = REQUEST_SENSE; 636 scmd->cmnd[0] = REQUEST_SENSE;
636 scmd->cmnd[4] = scmd->request_bufflen; 637 scmd->cmnd[4] = scmd->sdb.length;
637 scmd->cmd_len = COMMAND_SIZE(scmd->cmnd[0]); 638 scmd->cmd_len = COMMAND_SIZE(scmd->cmnd[0]);
638 } else { 639 } else {
639 scmd->request_buffer = NULL;
640 scmd->request_bufflen = 0;
641 scmd->sc_data_direction = DMA_NONE; 640 scmd->sc_data_direction = DMA_NONE;
642 scmd->use_sg = 0;
643 if (cmnd) { 641 if (cmnd) {
644 memset(scmd->cmnd, 0, sizeof(scmd->cmnd)); 642 memset(scmd->cmnd, 0, sizeof(scmd->cmnd));
645 memcpy(scmd->cmnd, cmnd, cmnd_size); 643 memcpy(scmd->cmnd, cmnd, cmnd_size);
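
scsi_eh_prep_cmnd() now snapshots the whole scsi_data_buffer (plus any bidi next_rq) rather than the old buffer/bufflen/use_sg/resid fields, so scsi_eh_restore_cmnd() in the next hunk can put everything back with two assignments. For the internally issued REQUEST SENSE, the command's single-entry table is aimed straight at the sense buffer; the core of that mapping, as a sketch using only calls visible above:

	/* Sketch: map an existing buffer as a one-entry scatterlist, the
	 * way the error handler points the command at scmd->sense_buffer. */
	sg_init_one(&ses->sense_sgl, scmd->sense_buffer, scmd->sdb.length);
	scmd->sdb.table.sgl	= &ses->sense_sgl;
	scmd->sdb.table.nents	= 1;
	scmd->sc_data_direction	= DMA_FROM_DEVICE;
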
@@ -676,10 +674,8 @@ void scsi_eh_restore_cmnd(struct scsi_cmnd* scmd, struct scsi_eh_save *ses)
676 scmd->cmd_len = ses->cmd_len; 674 scmd->cmd_len = ses->cmd_len;
677 memcpy(scmd->cmnd, ses->cmnd, sizeof(scmd->cmnd)); 675 memcpy(scmd->cmnd, ses->cmnd, sizeof(scmd->cmnd));
678 scmd->sc_data_direction = ses->data_direction; 676 scmd->sc_data_direction = ses->data_direction;
679 scmd->request_bufflen = ses->bufflen; 677 scmd->sdb = ses->sdb;
680 scmd->request_buffer = ses->buffer; 678 scmd->request->next_rq = ses->next_rq;
681 scmd->use_sg = ses->use_sg;
682 scmd->resid = ses->resid;
683 scmd->result = ses->result; 679 scmd->result = ses->result;
684} 680}
685EXPORT_SYMBOL(scsi_eh_restore_cmnd); 681EXPORT_SYMBOL(scsi_eh_restore_cmnd);
@@ -1700,8 +1696,7 @@ scsi_reset_provider(struct scsi_device *dev, int flag)
1700 memset(&scmd->cmnd, '\0', sizeof(scmd->cmnd)); 1696 memset(&scmd->cmnd, '\0', sizeof(scmd->cmnd));
1701 1697
1702 scmd->scsi_done = scsi_reset_provider_done_command; 1698 scmd->scsi_done = scsi_reset_provider_done_command;
1703 scmd->request_buffer = NULL; 1699 memset(&scmd->sdb, 0, sizeof(scmd->sdb));
1704 scmd->request_bufflen = 0;
1705 1700
1706 scmd->cmd_len = 0; 1701 scmd->cmd_len = 0;
1707 1702
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 7c4c889c5221..b12fb310e399 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -8,6 +8,7 @@
8 */ 8 */
9 9
10#include <linux/bio.h> 10#include <linux/bio.h>
11#include <linux/bitops.h>
11#include <linux/blkdev.h> 12#include <linux/blkdev.h>
12#include <linux/completion.h> 13#include <linux/completion.h>
13#include <linux/kernel.h> 14#include <linux/kernel.h>
@@ -34,13 +35,6 @@
34#define SG_MEMPOOL_NR ARRAY_SIZE(scsi_sg_pools) 35#define SG_MEMPOOL_NR ARRAY_SIZE(scsi_sg_pools)
35#define SG_MEMPOOL_SIZE 2 36#define SG_MEMPOOL_SIZE 2
36 37
37/*
38 * The maximum number of SG segments that we will put inside a scatterlist
39 * (unless chaining is used). Should ideally fit inside a single page, to
40 * avoid a higher order allocation.
41 */
42#define SCSI_MAX_SG_SEGMENTS 128
43
44struct scsi_host_sg_pool { 38struct scsi_host_sg_pool {
45 size_t size; 39 size_t size;
46 char *name; 40 char *name;
@@ -48,22 +42,31 @@ struct scsi_host_sg_pool {
48 mempool_t *pool; 42 mempool_t *pool;
49}; 43};
50 44
51#define SP(x) { x, "sgpool-" #x } 45#define SP(x) { x, "sgpool-" __stringify(x) }
46#if (SCSI_MAX_SG_SEGMENTS < 32)
47#error SCSI_MAX_SG_SEGMENTS is too small (must be 32 or greater)
48#endif
52static struct scsi_host_sg_pool scsi_sg_pools[] = { 49static struct scsi_host_sg_pool scsi_sg_pools[] = {
53 SP(8), 50 SP(8),
54 SP(16), 51 SP(16),
55#if (SCSI_MAX_SG_SEGMENTS > 16)
56 SP(32),
57#if (SCSI_MAX_SG_SEGMENTS > 32) 52#if (SCSI_MAX_SG_SEGMENTS > 32)
58 SP(64), 53 SP(32),
59#if (SCSI_MAX_SG_SEGMENTS > 64) 54#if (SCSI_MAX_SG_SEGMENTS > 64)
55 SP(64),
56#if (SCSI_MAX_SG_SEGMENTS > 128)
60 SP(128), 57 SP(128),
58#if (SCSI_MAX_SG_SEGMENTS > 256)
59#error SCSI_MAX_SG_SEGMENTS is too large (256 MAX)
60#endif
61#endif 61#endif
62#endif 62#endif
63#endif 63#endif
64 SP(SCSI_MAX_SG_SEGMENTS)
64}; 65};
65#undef SP 66#undef SP
66 67
68static struct kmem_cache *scsi_bidi_sdb_cache;
69
67static void scsi_run_queue(struct request_queue *q); 70static void scsi_run_queue(struct request_queue *q);
68 71
69/* 72/*
@@ -440,7 +443,7 @@ EXPORT_SYMBOL_GPL(scsi_execute_async);
440static void scsi_init_cmd_errh(struct scsi_cmnd *cmd) 443static void scsi_init_cmd_errh(struct scsi_cmnd *cmd)
441{ 444{
442 cmd->serial_number = 0; 445 cmd->serial_number = 0;
443 cmd->resid = 0; 446 scsi_set_resid(cmd, 0);
444 memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); 447 memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE);
445 if (cmd->cmd_len == 0) 448 if (cmd->cmd_len == 0)
446 cmd->cmd_len = COMMAND_SIZE(cmd->cmnd[0]); 449 cmd->cmd_len = COMMAND_SIZE(cmd->cmnd[0]);
@@ -690,42 +693,16 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int error,
690 return NULL; 693 return NULL;
691} 694}
692 695
693/*
694 * Like SCSI_MAX_SG_SEGMENTS, but for archs that have sg chaining. This limit
695 * is totally arbitrary, a setting of 2048 will get you at least 8mb ios.
696 */
697#define SCSI_MAX_SG_CHAIN_SEGMENTS 2048
698
699static inline unsigned int scsi_sgtable_index(unsigned short nents) 696static inline unsigned int scsi_sgtable_index(unsigned short nents)
700{ 697{
701 unsigned int index; 698 unsigned int index;
702 699
703 switch (nents) { 700 BUG_ON(nents > SCSI_MAX_SG_SEGMENTS);
704 case 1 ... 8: 701
702 if (nents <= 8)
705 index = 0; 703 index = 0;
706 break; 704 else
707 case 9 ... 16: 705 index = get_count_order(nents) - 3;
708 index = 1;
709 break;
710#if (SCSI_MAX_SG_SEGMENTS > 16)
711 case 17 ... 32:
712 index = 2;
713 break;
714#if (SCSI_MAX_SG_SEGMENTS > 32)
715 case 33 ... 64:
716 index = 3;
717 break;
718#if (SCSI_MAX_SG_SEGMENTS > 64)
719 case 65 ... 128:
720 index = 4;
721 break;
722#endif
723#endif
724#endif
725 default:
726 printk(KERN_ERR "scsi: bad segment count=%d\n", nents);
727 BUG();
728 }
729 706
730 return index; 707 return index;
731} 708}
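
The switch ladder is replaced by arithmetic: the pools are sized 8, 16, 32, ..., so for nents above 8 the pool index is get_count_order(nents) - 3, i.e. the exponent of the next power of two minus log2(8). A user-space check of that mapping (count_order() below is a portable stand-in for the kernel helper):

#include <stdio.h>

/* Stand-in for get_count_order(): smallest order with (1 << order) >= n. */
static int count_order(unsigned int n)
{
	int order = 0;

	while ((1U << order) < n)
		order++;
	return order;
}

int main(void)
{
	unsigned int nents[] = { 1, 8, 9, 16, 17, 33, 65, 128 };
	int i;

	for (i = 0; i < 8; i++) {
		unsigned int n = nents[i];
		int index = (n <= 8) ? 0 : count_order(n) - 3;

		/* e.g. 9..16 -> index 1 (sgpool-16), 33..64 -> index 3 (sgpool-64) */
		printf("nents=%3u -> index %d\n", n, index);
	}
	return 0;
}
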
@@ -746,31 +723,27 @@ static struct scatterlist *scsi_sg_alloc(unsigned int nents, gfp_t gfp_mask)
746 return mempool_alloc(sgp->pool, gfp_mask); 723 return mempool_alloc(sgp->pool, gfp_mask);
747} 724}
748 725
749int scsi_alloc_sgtable(struct scsi_cmnd *cmd, gfp_t gfp_mask) 726static int scsi_alloc_sgtable(struct scsi_data_buffer *sdb, int nents,
727 gfp_t gfp_mask)
750{ 728{
751 int ret; 729 int ret;
752 730
753 BUG_ON(!cmd->use_sg); 731 BUG_ON(!nents);
754 732
755 ret = __sg_alloc_table(&cmd->sg_table, cmd->use_sg, 733 ret = __sg_alloc_table(&sdb->table, nents, SCSI_MAX_SG_SEGMENTS,
756 SCSI_MAX_SG_SEGMENTS, gfp_mask, scsi_sg_alloc); 734 gfp_mask, scsi_sg_alloc);
757 if (unlikely(ret)) 735 if (unlikely(ret))
758 __sg_free_table(&cmd->sg_table, SCSI_MAX_SG_SEGMENTS, 736 __sg_free_table(&sdb->table, SCSI_MAX_SG_SEGMENTS,
759 scsi_sg_free); 737 scsi_sg_free);
760 738
761 cmd->request_buffer = cmd->sg_table.sgl;
762 return ret; 739 return ret;
763} 740}
764 741
765EXPORT_SYMBOL(scsi_alloc_sgtable); 742static void scsi_free_sgtable(struct scsi_data_buffer *sdb)
766
767void scsi_free_sgtable(struct scsi_cmnd *cmd)
768{ 743{
769 __sg_free_table(&cmd->sg_table, SCSI_MAX_SG_SEGMENTS, scsi_sg_free); 744 __sg_free_table(&sdb->table, SCSI_MAX_SG_SEGMENTS, scsi_sg_free);
770} 745}
771 746
772EXPORT_SYMBOL(scsi_free_sgtable);
773
774/* 747/*
775 * Function: scsi_release_buffers() 748 * Function: scsi_release_buffers()
776 * 749 *
@@ -788,17 +761,49 @@ EXPORT_SYMBOL(scsi_free_sgtable);
788 * the scatter-gather table, and potentially any bounce 761 * the scatter-gather table, and potentially any bounce
789 * buffers. 762 * buffers.
790 */ 763 */
791static void scsi_release_buffers(struct scsi_cmnd *cmd) 764void scsi_release_buffers(struct scsi_cmnd *cmd)
765{
766 if (cmd->sdb.table.nents)
767 scsi_free_sgtable(&cmd->sdb);
768
769 memset(&cmd->sdb, 0, sizeof(cmd->sdb));
770
771 if (scsi_bidi_cmnd(cmd)) {
772 struct scsi_data_buffer *bidi_sdb =
773 cmd->request->next_rq->special;
774 scsi_free_sgtable(bidi_sdb);
775 kmem_cache_free(scsi_bidi_sdb_cache, bidi_sdb);
776 cmd->request->next_rq->special = NULL;
777 }
778}
779EXPORT_SYMBOL(scsi_release_buffers);
780
781/*
782 * Bidi commands Must be complete as a whole, both sides at once.
783 * If part of the bytes were written and lld returned
784 * scsi_in()->resid and/or scsi_out()->resid this information will be left
785 * in req->data_len and req->next_rq->data_len. The upper-layer driver can
786 * decide what to do with this information.
787 */
788void scsi_end_bidi_request(struct scsi_cmnd *cmd)
792{ 789{
793 if (cmd->use_sg) 790 struct request *req = cmd->request;
794 scsi_free_sgtable(cmd); 791 unsigned int dlen = req->data_len;
792 unsigned int next_dlen = req->next_rq->data_len;
793
794 req->data_len = scsi_out(cmd)->resid;
795 req->next_rq->data_len = scsi_in(cmd)->resid;
796
797 /* The req and req->next_rq have not been completed */
798 BUG_ON(blk_end_bidi_request(req, 0, dlen, next_dlen));
799
800 scsi_release_buffers(cmd);
795 801
796 /* 802 /*
797 * Zero these out. They now point to freed memory, and it is 803 * This will goose the queue request function at the end, so we don't
798 * dangerous to hang onto the pointers. 804 * need to worry about launching another command.
799 */ 805 */
800 cmd->request_buffer = NULL; 806 scsi_next_command(cmd);
801 cmd->request_bufflen = 0;
802} 807}
803 808
804/* 809/*
@@ -832,7 +837,7 @@ static void scsi_release_buffers(struct scsi_cmnd *cmd)
832void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) 837void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
833{ 838{
834 int result = cmd->result; 839 int result = cmd->result;
835 int this_count = cmd->request_bufflen; 840 int this_count = scsi_bufflen(cmd);
836 struct request_queue *q = cmd->device->request_queue; 841 struct request_queue *q = cmd->device->request_queue;
837 struct request *req = cmd->request; 842 struct request *req = cmd->request;
838 int clear_errors = 1; 843 int clear_errors = 1;
@@ -840,8 +845,6 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
840 int sense_valid = 0; 845 int sense_valid = 0;
841 int sense_deferred = 0; 846 int sense_deferred = 0;
842 847
843 scsi_release_buffers(cmd);
844
845 if (result) { 848 if (result) {
846 sense_valid = scsi_command_normalize_sense(cmd, &sshdr); 849 sense_valid = scsi_command_normalize_sense(cmd, &sshdr);
847 if (sense_valid) 850 if (sense_valid)
@@ -864,9 +867,17 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
864 req->sense_len = len; 867 req->sense_len = len;
865 } 868 }
866 } 869 }
867 req->data_len = cmd->resid; 870 if (scsi_bidi_cmnd(cmd)) {
871 /* will also release_buffers */
872 scsi_end_bidi_request(cmd);
873 return;
874 }
875 req->data_len = scsi_get_resid(cmd);
868 } 876 }
869 877
878 BUG_ON(blk_bidi_rq(req)); /* bidi not support for !blk_pc_request yet */
879 scsi_release_buffers(cmd);
880
870 /* 881 /*
871 * Next deal with any sectors which we were able to correctly 882 * Next deal with any sectors which we were able to correctly
872 * handle. 883 * handle.
@@ -874,7 +885,6 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
874 SCSI_LOG_HLCOMPLETE(1, printk("%ld sectors total, " 885 SCSI_LOG_HLCOMPLETE(1, printk("%ld sectors total, "
875 "%d bytes done.\n", 886 "%d bytes done.\n",
876 req->nr_sectors, good_bytes)); 887 req->nr_sectors, good_bytes));
877 SCSI_LOG_HLCOMPLETE(1, printk("use_sg is %d\n", cmd->use_sg));
878 888
879 if (clear_errors) 889 if (clear_errors)
880 req->errors = 0; 890 req->errors = 0;
@@ -991,52 +1001,80 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
991 scsi_end_request(cmd, -EIO, this_count, !result); 1001 scsi_end_request(cmd, -EIO, this_count, !result);
992} 1002}
993 1003
994/* 1004static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb,
995 * Function: scsi_init_io() 1005 gfp_t gfp_mask)
996 *
997 * Purpose: SCSI I/O initialize function.
998 *
999 * Arguments: cmd - Command descriptor we wish to initialize
1000 *
1001 * Returns: 0 on success
1002 * BLKPREP_DEFER if the failure is retryable
1003 */
1004static int scsi_init_io(struct scsi_cmnd *cmd)
1005{ 1006{
1006 struct request *req = cmd->request; 1007 int count;
1007 int count;
1008
1009 /*
1010 * We used to not use scatter-gather for single segment request,
1011 * but now we do (it makes highmem I/O easier to support without
1012 * kmapping pages)
1013 */
1014 cmd->use_sg = req->nr_phys_segments;
1015 1008
1016 /* 1009 /*
1017 * If sg table allocation fails, requeue request later. 1010 * If sg table allocation fails, requeue request later.
1018 */ 1011 */
1019 if (unlikely(scsi_alloc_sgtable(cmd, GFP_ATOMIC))) { 1012 if (unlikely(scsi_alloc_sgtable(sdb, req->nr_phys_segments,
1020 scsi_unprep_request(req); 1013 gfp_mask))) {
1021 return BLKPREP_DEFER; 1014 return BLKPREP_DEFER;
1022 } 1015 }
1023 1016
1024 req->buffer = NULL; 1017 req->buffer = NULL;
1025 if (blk_pc_request(req)) 1018 if (blk_pc_request(req))
1026 cmd->request_bufflen = req->data_len; 1019 sdb->length = req->data_len;
1027 else 1020 else
1028 cmd->request_bufflen = req->nr_sectors << 9; 1021 sdb->length = req->nr_sectors << 9;
1029 1022
1030 /* 1023 /*
1031 * Next, walk the list, and fill in the addresses and sizes of 1024 * Next, walk the list, and fill in the addresses and sizes of
1032 * each segment. 1025 * each segment.
1033 */ 1026 */
1034 count = blk_rq_map_sg(req->q, req, cmd->request_buffer); 1027 count = blk_rq_map_sg(req->q, req, sdb->table.sgl);
1035 BUG_ON(count > cmd->use_sg); 1028 BUG_ON(count > sdb->table.nents);
1036 cmd->use_sg = count; 1029 sdb->table.nents = count;
1037 return BLKPREP_OK; 1030 return BLKPREP_OK;
1038} 1031}
1039 1032
1033/*
1034 * Function: scsi_init_io()
1035 *
1036 * Purpose: SCSI I/O initialize function.
1037 *
1038 * Arguments: cmd - Command descriptor we wish to initialize
1039 *
1040 * Returns: 0 on success
1041 * BLKPREP_DEFER if the failure is retryable
1042 * BLKPREP_KILL if the failure is fatal
1043 */
1044int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask)
1045{
1046 int error = scsi_init_sgtable(cmd->request, &cmd->sdb, gfp_mask);
1047 if (error)
1048 goto err_exit;
1049
1050 if (blk_bidi_rq(cmd->request)) {
1051 struct scsi_data_buffer *bidi_sdb = kmem_cache_zalloc(
1052 scsi_bidi_sdb_cache, GFP_ATOMIC);
1053 if (!bidi_sdb) {
1054 error = BLKPREP_DEFER;
1055 goto err_exit;
1056 }
1057
1058 cmd->request->next_rq->special = bidi_sdb;
1059 error = scsi_init_sgtable(cmd->request->next_rq, bidi_sdb,
1060 GFP_ATOMIC);
1061 if (error)
1062 goto err_exit;
1063 }
1064
1065 return BLKPREP_OK ;
1066
1067err_exit:
1068 scsi_release_buffers(cmd);
1069 if (error == BLKPREP_KILL)
1070 scsi_put_command(cmd);
1071 else /* BLKPREP_DEFER */
1072 scsi_unprep_request(cmd->request);
1073
1074 return error;
1075}
1076EXPORT_SYMBOL(scsi_init_io);
1077
1040static struct scsi_cmnd *scsi_get_cmd_from_req(struct scsi_device *sdev, 1078static struct scsi_cmnd *scsi_get_cmd_from_req(struct scsi_device *sdev,
1041 struct request *req) 1079 struct request *req)
1042{ 1080{
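
scsi_init_io() now takes an explicit gfp_mask and, when the request is bidirectional, allocates and maps a second scsi_data_buffer for req->next_rq; on failure it releases its own buffers and returns BLKPREP_KILL or BLKPREP_DEFER. Callers shrink to the pattern below, essentially scsi_setup_fs_cmnd() as converted later in this diff (shown here as a sketch):

	/* Sketch: prepare a filesystem request; both the uni- and
	 * bidirectional cases are handled inside scsi_init_io(). */
	cmd = scsi_get_cmd_from_req(sdev, req);
	if (unlikely(!cmd))
		return BLKPREP_DEFER;

	/* GFP_ATOMIC: request preparation cannot sleep here. */
	return scsi_init_io(cmd, GFP_ATOMIC);
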
@@ -1081,16 +1119,14 @@ int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req)
1081 1119
1082 BUG_ON(!req->nr_phys_segments); 1120 BUG_ON(!req->nr_phys_segments);
1083 1121
1084 ret = scsi_init_io(cmd); 1122 ret = scsi_init_io(cmd, GFP_ATOMIC);
1085 if (unlikely(ret)) 1123 if (unlikely(ret))
1086 return ret; 1124 return ret;
1087 } else { 1125 } else {
1088 BUG_ON(req->data_len); 1126 BUG_ON(req->data_len);
1089 BUG_ON(req->data); 1127 BUG_ON(req->data);
1090 1128
1091 cmd->request_bufflen = 0; 1129 memset(&cmd->sdb, 0, sizeof(cmd->sdb));
1092 cmd->request_buffer = NULL;
1093 cmd->use_sg = 0;
1094 req->buffer = NULL; 1130 req->buffer = NULL;
1095 } 1131 }
1096 1132
@@ -1132,7 +1168,7 @@ int scsi_setup_fs_cmnd(struct scsi_device *sdev, struct request *req)
1132 if (unlikely(!cmd)) 1168 if (unlikely(!cmd))
1133 return BLKPREP_DEFER; 1169 return BLKPREP_DEFER;
1134 1170
1135 return scsi_init_io(cmd); 1171 return scsi_init_io(cmd, GFP_ATOMIC);
1136} 1172}
1137EXPORT_SYMBOL(scsi_setup_fs_cmnd); 1173EXPORT_SYMBOL(scsi_setup_fs_cmnd);
1138 1174
@@ -1542,20 +1578,7 @@ struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost,
1542 * this limit is imposed by hardware restrictions 1578 * this limit is imposed by hardware restrictions
1543 */ 1579 */
1544 blk_queue_max_hw_segments(q, shost->sg_tablesize); 1580 blk_queue_max_hw_segments(q, shost->sg_tablesize);
1545 1581 blk_queue_max_phys_segments(q, SCSI_MAX_SG_CHAIN_SEGMENTS);
1546 /*
1547 * In the future, sg chaining support will be mandatory and this
1548 * ifdef can then go away. Right now we don't have all archs
1549 * converted, so better keep it safe.
1550 */
1551#ifdef ARCH_HAS_SG_CHAIN
1552 if (shost->use_sg_chaining)
1553 blk_queue_max_phys_segments(q, SCSI_MAX_SG_CHAIN_SEGMENTS);
1554 else
1555 blk_queue_max_phys_segments(q, SCSI_MAX_SG_SEGMENTS);
1556#else
1557 blk_queue_max_phys_segments(q, SCSI_MAX_SG_SEGMENTS);
1558#endif
1559 1582
1560 blk_queue_max_sectors(q, shost->max_sectors); 1583 blk_queue_max_sectors(q, shost->max_sectors);
1561 blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost)); 1584 blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost));
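
With use_sg_chaining gone from the host templates and the ARCH_HAS_SG_CHAIN conditional removed here, scatterlist chaining is treated as always available: every queue gets SCSI_MAX_SG_CHAIN_SEGMENTS physical segments, while the per-command hardware limit still comes from the host's sg_tablesize. Reduced to its two calls (sketch, same functions as above):

	/* HBA limit per command (hardware segments) vs. what the block
	 * layer may build and chain underneath (physical segments). */
	blk_queue_max_hw_segments(q, shost->sg_tablesize);
	blk_queue_max_phys_segments(q, SCSI_MAX_SG_CHAIN_SEGMENTS);
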
@@ -1654,6 +1677,14 @@ int __init scsi_init_queue(void)
1654 return -ENOMEM; 1677 return -ENOMEM;
1655 } 1678 }
1656 1679
1680 scsi_bidi_sdb_cache = kmem_cache_create("scsi_bidi_sdb",
1681 sizeof(struct scsi_data_buffer),
1682 0, 0, NULL);
1683 if (!scsi_bidi_sdb_cache) {
1684 printk(KERN_ERR "SCSI: can't init scsi bidi sdb cache\n");
1685 goto cleanup_io_context;
1686 }
1687
1657 for (i = 0; i < SG_MEMPOOL_NR; i++) { 1688 for (i = 0; i < SG_MEMPOOL_NR; i++) {
1658 struct scsi_host_sg_pool *sgp = scsi_sg_pools + i; 1689 struct scsi_host_sg_pool *sgp = scsi_sg_pools + i;
1659 int size = sgp->size * sizeof(struct scatterlist); 1690 int size = sgp->size * sizeof(struct scatterlist);
@@ -1663,6 +1694,7 @@ int __init scsi_init_queue(void)
1663 if (!sgp->slab) { 1694 if (!sgp->slab) {
1664 printk(KERN_ERR "SCSI: can't init sg slab %s\n", 1695 printk(KERN_ERR "SCSI: can't init sg slab %s\n",
1665 sgp->name); 1696 sgp->name);
1697 goto cleanup_bidi_sdb;
1666 } 1698 }
1667 1699
1668 sgp->pool = mempool_create_slab_pool(SG_MEMPOOL_SIZE, 1700 sgp->pool = mempool_create_slab_pool(SG_MEMPOOL_SIZE,
@@ -1670,10 +1702,25 @@ int __init scsi_init_queue(void)
1670 if (!sgp->pool) { 1702 if (!sgp->pool) {
1671 printk(KERN_ERR "SCSI: can't init sg mempool %s\n", 1703 printk(KERN_ERR "SCSI: can't init sg mempool %s\n",
1672 sgp->name); 1704 sgp->name);
1705 goto cleanup_bidi_sdb;
1673 } 1706 }
1674 } 1707 }
1675 1708
1676 return 0; 1709 return 0;
1710
1711cleanup_bidi_sdb:
1712 for (i = 0; i < SG_MEMPOOL_NR; i++) {
1713 struct scsi_host_sg_pool *sgp = scsi_sg_pools + i;
1714 if (sgp->pool)
1715 mempool_destroy(sgp->pool);
1716 if (sgp->slab)
1717 kmem_cache_destroy(sgp->slab);
1718 }
1719 kmem_cache_destroy(scsi_bidi_sdb_cache);
1720cleanup_io_context:
1721 kmem_cache_destroy(scsi_io_context_cache);
1722
1723 return -ENOMEM;
1677} 1724}
1678 1725
1679void scsi_exit_queue(void) 1726void scsi_exit_queue(void)
@@ -1681,6 +1728,7 @@ void scsi_exit_queue(void)
1681 int i; 1728 int i;
1682 1729
1683 kmem_cache_destroy(scsi_io_context_cache); 1730 kmem_cache_destroy(scsi_io_context_cache);
1731 kmem_cache_destroy(scsi_bidi_sdb_cache);
1684 1732
1685 for (i = 0; i < SG_MEMPOOL_NR; i++) { 1733 for (i = 0; i < SG_MEMPOOL_NR; i++) {
1686 struct scsi_host_sg_pool *sgp = scsi_sg_pools + i; 1734 struct scsi_host_sg_pool *sgp = scsi_sg_pools + i;
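With scsi_lib.c now keeping a command's data in struct scsi_data_buffer (cmd->sdb), the rest of this series converts drivers from the removed request_buffer/request_bufflen/use_sg fields to the scsi_sglist()/scsi_sg_count()/scsi_bufflen() accessors. A minimal sketch of a completion-side walk over the scatterlist using those accessors; the my_check_segments() helper is hypothetical and only illustrates the accessor API, it is not part of this patch.

	#include <linux/kernel.h>
	#include <linux/scatterlist.h>
	#include <scsi/scsi_cmnd.h>

	static void my_check_segments(struct scsi_cmnd *cmd)	/* hypothetical helper */
	{
		struct scatterlist *sg;
		unsigned int total = 0;
		int i;

		/* Iterate the mapped segments instead of touching cmd->sdb directly. */
		scsi_for_each_sg(cmd, sg, scsi_sg_count(cmd), i)
			total += sg->length;

		/* The summed segment lengths should equal the transfer length. */
		WARN_ON(total != scsi_bufflen(cmd));
	}

This is essentially the consistency check sr_prep_fn() performs further below, expressed against the accessor API rather than the old fields.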
diff --git a/drivers/scsi/scsi_tgt_lib.c b/drivers/scsi/scsi_tgt_lib.c
index 01e03f3f6ffa..91630baea532 100644
--- a/drivers/scsi/scsi_tgt_lib.c
+++ b/drivers/scsi/scsi_tgt_lib.c
@@ -331,8 +331,7 @@ static void scsi_tgt_cmd_done(struct scsi_cmnd *cmd)
331 331
332 scsi_tgt_uspace_send_status(cmd, tcmd->itn_id, tcmd->tag); 332 scsi_tgt_uspace_send_status(cmd, tcmd->itn_id, tcmd->tag);
333 333
334 if (scsi_sglist(cmd)) 334 scsi_release_buffers(cmd);
335 scsi_free_sgtable(cmd);
336 335
337 queue_work(scsi_tgtd, &tcmd->work); 336 queue_work(scsi_tgtd, &tcmd->work);
338} 337}
@@ -353,25 +352,6 @@ static int scsi_tgt_transfer_response(struct scsi_cmnd *cmd)
353 return 0; 352 return 0;
354} 353}
355 354
356static int scsi_tgt_init_cmd(struct scsi_cmnd *cmd, gfp_t gfp_mask)
357{
358 struct request *rq = cmd->request;
359 int count;
360
361 cmd->use_sg = rq->nr_phys_segments;
362 if (scsi_alloc_sgtable(cmd, gfp_mask))
363 return -ENOMEM;
364
365 cmd->request_bufflen = rq->data_len;
366
367 dprintk("cmd %p cnt %d %lu\n", cmd, scsi_sg_count(cmd),
368 rq_data_dir(rq));
369 count = blk_rq_map_sg(rq->q, rq, scsi_sglist(cmd));
370 BUG_ON(count > cmd->use_sg);
371 cmd->use_sg = count;
372 return 0;
373}
374
375/* TODO: test this crap and replace bio_map_user with new interface maybe */ 355/* TODO: test this crap and replace bio_map_user with new interface maybe */
376static int scsi_map_user_pages(struct scsi_tgt_cmd *tcmd, struct scsi_cmnd *cmd, 356static int scsi_map_user_pages(struct scsi_tgt_cmd *tcmd, struct scsi_cmnd *cmd,
377 unsigned long uaddr, unsigned int len, int rw) 357 unsigned long uaddr, unsigned int len, int rw)
@@ -397,9 +377,11 @@ static int scsi_map_user_pages(struct scsi_tgt_cmd *tcmd, struct scsi_cmnd *cmd,
397 } 377 }
398 378
399 tcmd->bio = rq->bio; 379 tcmd->bio = rq->bio;
400 err = scsi_tgt_init_cmd(cmd, GFP_KERNEL); 380 err = scsi_init_io(cmd, GFP_KERNEL);
401 if (err) 381 if (err) {
382 scsi_release_buffers(cmd);
402 goto unmap_rq; 383 goto unmap_rq;
384 }
403 385
404 return 0; 386 return 0;
405 387
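scsi_init_io() hands back the block-layer prep codes directly, so both the midlayer prep path and the target-mode path here simply propagate its return value. A short sketch of that contract as an upper-level driver's prep function might see it, assuming the queue's queuedata points at the scsi_device (as it does for SCSI upper-level drivers); my_prep_fn() is a hypothetical name, not code from this patch.

	#include <linux/blkdev.h>
	#include <scsi/scsi_cmnd.h>
	#include <scsi/scsi_device.h>
	#include <scsi/scsi_driver.h>

	static int my_prep_fn(struct request_queue *q, struct request *rq)	/* hypothetical */
	{
		struct scsi_device *sdev = q->queuedata;
		int ret = scsi_setup_fs_cmnd(sdev, rq);	/* ends up in scsi_init_io() */

		/*
		 * BLKPREP_OK:    command is ready to be queued
		 * BLKPREP_DEFER: out of resources, the block layer retries later
		 * BLKPREP_KILL:  unrecoverable, the request is completed with an error
		 */
		return ret;
	}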
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 24eba3118b5a..51a5557f42dd 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -519,7 +519,7 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq)
519 SCpnt->cmnd[4] = (unsigned char) this_count; 519 SCpnt->cmnd[4] = (unsigned char) this_count;
520 SCpnt->cmnd[5] = 0; 520 SCpnt->cmnd[5] = 0;
521 } 521 }
522 SCpnt->request_bufflen = this_count * sdp->sector_size; 522 SCpnt->sdb.length = this_count * sdp->sector_size;
523 523
524 /* 524 /*
525 * We shouldn't disconnect in the middle of a sector, so with a dumb 525 * We shouldn't disconnect in the middle of a sector, so with a dumb
@@ -926,7 +926,7 @@ static struct block_device_operations sd_fops = {
926static int sd_done(struct scsi_cmnd *SCpnt) 926static int sd_done(struct scsi_cmnd *SCpnt)
927{ 927{
928 int result = SCpnt->result; 928 int result = SCpnt->result;
929 unsigned int xfer_size = SCpnt->request_bufflen; 929 unsigned int xfer_size = scsi_bufflen(SCpnt);
930 unsigned int good_bytes = result ? 0 : xfer_size; 930 unsigned int good_bytes = result ? 0 : xfer_size;
931 u64 start_lba = SCpnt->request->sector; 931 u64 start_lba = SCpnt->request->sector;
932 u64 bad_lba; 932 u64 bad_lba;
diff --git a/drivers/scsi/sgiwd93.c b/drivers/scsi/sgiwd93.c
index d4ebe8c67ba9..26cfc56c7091 100644
--- a/drivers/scsi/sgiwd93.c
+++ b/drivers/scsi/sgiwd93.c
@@ -33,10 +33,9 @@
33 33
34struct ip22_hostdata { 34struct ip22_hostdata {
35 struct WD33C93_hostdata wh; 35 struct WD33C93_hostdata wh;
36 struct hpc_data { 36 dma_addr_t dma;
37 dma_addr_t dma; 37 void *cpu;
38 void *cpu; 38 struct device *dev;
39 } hd;
40}; 39};
41 40
42#define host_to_hostdata(host) ((struct ip22_hostdata *)((host)->hostdata)) 41#define host_to_hostdata(host) ((struct ip22_hostdata *)((host)->hostdata))
@@ -46,6 +45,11 @@ struct hpc_chunk {
46 u32 _padding; /* align to quadword boundary */ 45 u32 _padding; /* align to quadword boundary */
47}; 46};
48 47
48/* space for hpc dma descriptors */
49#define HPC_DMA_SIZE PAGE_SIZE
50
 51#define DMA_DIR(d)   (((d) == DATA_OUT_DIR) ? DMA_TO_DEVICE : DMA_FROM_DEVICE)
52
49static irqreturn_t sgiwd93_intr(int irq, void *dev_id) 53static irqreturn_t sgiwd93_intr(int irq, void *dev_id)
50{ 54{
51 struct Scsi_Host * host = dev_id; 55 struct Scsi_Host * host = dev_id;
@@ -59,15 +63,17 @@ static irqreturn_t sgiwd93_intr(int irq, void *dev_id)
59} 63}
60 64
61static inline 65static inline
62void fill_hpc_entries(struct hpc_chunk *hcp, struct scsi_cmnd *cmd, int datainp) 66void fill_hpc_entries(struct ip22_hostdata *hd, struct scsi_cmnd *cmd, int din)
63{ 67{
64 unsigned long len = cmd->SCp.this_residual; 68 unsigned long len = cmd->SCp.this_residual;
65 void *addr = cmd->SCp.ptr; 69 void *addr = cmd->SCp.ptr;
66 dma_addr_t physaddr; 70 dma_addr_t physaddr;
67 unsigned long count; 71 unsigned long count;
72 struct hpc_chunk *hcp;
68 73
69 physaddr = dma_map_single(NULL, addr, len, cmd->sc_data_direction); 74 physaddr = dma_map_single(hd->dev, addr, len, DMA_DIR(din));
70 cmd->SCp.dma_handle = physaddr; 75 cmd->SCp.dma_handle = physaddr;
76 hcp = hd->cpu;
71 77
72 while (len) { 78 while (len) {
73 /* 79 /*
@@ -89,6 +95,9 @@ void fill_hpc_entries(struct hpc_chunk *hcp, struct scsi_cmnd *cmd, int datainp)
89 */ 95 */
90 hcp->desc.pbuf = 0; 96 hcp->desc.pbuf = 0;
91 hcp->desc.cntinfo = HPCDMA_EOX; 97 hcp->desc.cntinfo = HPCDMA_EOX;
98 dma_cache_sync(hd->dev, hd->cpu,
99 (unsigned long)(hcp + 1) - (unsigned long)hd->cpu,
100 DMA_TO_DEVICE);
92} 101}
93 102
94static int dma_setup(struct scsi_cmnd *cmd, int datainp) 103static int dma_setup(struct scsi_cmnd *cmd, int datainp)
@@ -96,9 +105,8 @@ static int dma_setup(struct scsi_cmnd *cmd, int datainp)
96 struct ip22_hostdata *hdata = host_to_hostdata(cmd->device->host); 105 struct ip22_hostdata *hdata = host_to_hostdata(cmd->device->host);
97 struct hpc3_scsiregs *hregs = 106 struct hpc3_scsiregs *hregs =
98 (struct hpc3_scsiregs *) cmd->device->host->base; 107 (struct hpc3_scsiregs *) cmd->device->host->base;
99 struct hpc_chunk *hcp = (struct hpc_chunk *) hdata->hd.cpu;
100 108
101 pr_debug("dma_setup: datainp<%d> hcp<%p> ", datainp, hcp); 109 pr_debug("dma_setup: datainp<%d> hcp<%p> ", datainp, hdata->cpu);
102 110
103 hdata->wh.dma_dir = datainp; 111 hdata->wh.dma_dir = datainp;
104 112
@@ -111,12 +119,12 @@ static int dma_setup(struct scsi_cmnd *cmd, int datainp)
111 if (cmd->SCp.ptr == NULL || cmd->SCp.this_residual == 0) 119 if (cmd->SCp.ptr == NULL || cmd->SCp.this_residual == 0)
112 return 1; 120 return 1;
113 121
114 fill_hpc_entries(hcp, cmd, datainp); 122 fill_hpc_entries(hdata, cmd, datainp);
115 123
116 pr_debug(" HPCGO\n"); 124 pr_debug(" HPCGO\n");
117 125
118 /* Start up the HPC. */ 126 /* Start up the HPC. */
119 hregs->ndptr = hdata->hd.dma; 127 hregs->ndptr = hdata->dma;
120 if (datainp) 128 if (datainp)
121 hregs->ctrl = HPC3_SCTRL_ACTIVE; 129 hregs->ctrl = HPC3_SCTRL_ACTIVE;
122 else 130 else
@@ -134,6 +142,9 @@ static void dma_stop(struct Scsi_Host *instance, struct scsi_cmnd *SCpnt,
134 if (!SCpnt) 142 if (!SCpnt)
135 return; 143 return;
136 144
145 if (SCpnt->SCp.ptr == NULL || SCpnt->SCp.this_residual == 0)
146 return;
147
137 hregs = (struct hpc3_scsiregs *) SCpnt->device->host->base; 148 hregs = (struct hpc3_scsiregs *) SCpnt->device->host->base;
138 149
139 pr_debug("dma_stop: status<%d> ", status); 150 pr_debug("dma_stop: status<%d> ", status);
@@ -145,8 +156,9 @@ static void dma_stop(struct Scsi_Host *instance, struct scsi_cmnd *SCpnt,
145 barrier(); 156 barrier();
146 } 157 }
147 hregs->ctrl = 0; 158 hregs->ctrl = 0;
148 dma_unmap_single(NULL, SCpnt->SCp.dma_handle, SCpnt->SCp.this_residual, 159 dma_unmap_single(hdata->dev, SCpnt->SCp.dma_handle,
149 SCpnt->sc_data_direction); 160 SCpnt->SCp.this_residual,
161 DMA_DIR(hdata->wh.dma_dir));
150 162
151 pr_debug("\n"); 163 pr_debug("\n");
152} 164}
@@ -161,22 +173,23 @@ void sgiwd93_reset(unsigned long base)
161} 173}
162EXPORT_SYMBOL_GPL(sgiwd93_reset); 174EXPORT_SYMBOL_GPL(sgiwd93_reset);
163 175
164static inline void init_hpc_chain(struct hpc_data *hd) 176static inline void init_hpc_chain(struct ip22_hostdata *hdata)
165{ 177{
166 struct hpc_chunk *hcp = (struct hpc_chunk *) hd->cpu; 178 struct hpc_chunk *hcp = (struct hpc_chunk *)hdata->cpu;
167 struct hpc_chunk *dma = (struct hpc_chunk *) hd->dma; 179 dma_addr_t dma = hdata->dma;
168 unsigned long start, end; 180 unsigned long start, end;
169 181
170 start = (unsigned long) hcp; 182 start = (unsigned long) hcp;
171 end = start + PAGE_SIZE; 183 end = start + HPC_DMA_SIZE;
172 while (start < end) { 184 while (start < end) {
173 hcp->desc.pnext = (u32) (dma + 1); 185 hcp->desc.pnext = (u32) (dma + sizeof(struct hpc_chunk));
174 hcp->desc.cntinfo = HPCDMA_EOX; 186 hcp->desc.cntinfo = HPCDMA_EOX;
175 hcp++; dma++; 187 hcp++;
188 dma += sizeof(struct hpc_chunk);
176 start += sizeof(struct hpc_chunk); 189 start += sizeof(struct hpc_chunk);
177 }; 190 };
178 hcp--; 191 hcp--;
179 hcp->desc.pnext = hd->dma; 192 hcp->desc.pnext = hdata->dma;
180} 193}
181 194
182static int sgiwd93_bus_reset(struct scsi_cmnd *cmd) 195static int sgiwd93_bus_reset(struct scsi_cmnd *cmd)
@@ -235,16 +248,17 @@ static int __init sgiwd93_probe(struct platform_device *pdev)
235 host->irq = irq; 248 host->irq = irq;
236 249
237 hdata = host_to_hostdata(host); 250 hdata = host_to_hostdata(host);
238 hdata->hd.cpu = dma_alloc_coherent(&pdev->dev, PAGE_SIZE, 251 hdata->dev = &pdev->dev;
239 &hdata->hd.dma, GFP_KERNEL); 252 hdata->cpu = dma_alloc_noncoherent(&pdev->dev, HPC_DMA_SIZE,
240 if (!hdata->hd.cpu) { 253 &hdata->dma, GFP_KERNEL);
254 if (!hdata->cpu) {
241 printk(KERN_WARNING "sgiwd93: Could not allocate memory for " 255 printk(KERN_WARNING "sgiwd93: Could not allocate memory for "
242 "host %d buffer.\n", unit); 256 "host %d buffer.\n", unit);
243 err = -ENOMEM; 257 err = -ENOMEM;
244 goto out_put; 258 goto out_put;
245 } 259 }
246 260
247 init_hpc_chain(&hdata->hd); 261 init_hpc_chain(hdata);
248 262
249 regs.SASR = wdregs + 3; 263 regs.SASR = wdregs + 3;
250 regs.SCMD = wdregs + 7; 264 regs.SCMD = wdregs + 7;
@@ -274,7 +288,7 @@ static int __init sgiwd93_probe(struct platform_device *pdev)
274out_irq: 288out_irq:
275 free_irq(irq, host); 289 free_irq(irq, host);
276out_free: 290out_free:
277 dma_free_coherent(NULL, PAGE_SIZE, hdata->hd.cpu, hdata->hd.dma); 291 dma_free_noncoherent(&pdev->dev, HPC_DMA_SIZE, hdata->cpu, hdata->dma);
278out_put: 292out_put:
279 scsi_host_put(host); 293 scsi_host_put(host);
280out: 294out:
@@ -290,7 +304,7 @@ static void __exit sgiwd93_remove(struct platform_device *pdev)
290 304
291 scsi_remove_host(host); 305 scsi_remove_host(host);
292 free_irq(pd->irq, host); 306 free_irq(pd->irq, host);
293 dma_free_coherent(&pdev->dev, PAGE_SIZE, hdata->hd.cpu, hdata->hd.dma); 307 dma_free_noncoherent(&pdev->dev, HPC_DMA_SIZE, hdata->cpu, hdata->dma);
294 scsi_host_put(host); 308 scsi_host_put(host);
295} 309}
296 310
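Because the HPC descriptor chain is now allocated with dma_alloc_noncoherent(), the driver must make its CPU-side writes visible to the DMA engine with dma_cache_sync() before starting a transfer, as fill_hpc_entries() now does. A minimal sketch of that allocate/write/sync pattern, assuming a generic struct device and a simplified two-word descriptor; the my_* names and struct my_desc are illustrative only.

	#include <linux/dma-mapping.h>
	#include <linux/errno.h>
	#include <linux/mm.h>
	#include <linux/types.h>

	struct my_desc {		/* simplified stand-in for struct hpc_chunk */
		u32 pbuf;
		u32 cntinfo;
	};

	static void *ring_cpu;
	static dma_addr_t ring_dma;

	static int my_alloc_ring(struct device *dev)
	{
		ring_cpu = dma_alloc_noncoherent(dev, PAGE_SIZE, &ring_dma, GFP_KERNEL);
		return ring_cpu ? 0 : -ENOMEM;
	}

	static void my_post_ring(struct device *dev, size_t bytes_written)
	{
		/* Flush the CPU-written descriptors before the engine reads them. */
		dma_cache_sync(dev, ring_cpu, bytes_written, DMA_TO_DEVICE);
	}

The matching teardown would use dma_free_noncoherent(dev, PAGE_SIZE, ring_cpu, ring_dma), mirroring the probe/remove changes above.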
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 1fcee16fa36d..50ba49250203 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -231,7 +231,7 @@ out:
231static int sr_done(struct scsi_cmnd *SCpnt) 231static int sr_done(struct scsi_cmnd *SCpnt)
232{ 232{
233 int result = SCpnt->result; 233 int result = SCpnt->result;
234 int this_count = SCpnt->request_bufflen; 234 int this_count = scsi_bufflen(SCpnt);
235 int good_bytes = (result == 0 ? this_count : 0); 235 int good_bytes = (result == 0 ? this_count : 0);
236 int block_sectors = 0; 236 int block_sectors = 0;
237 long error_sector; 237 long error_sector;
@@ -379,17 +379,18 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq)
379 } 379 }
380 380
381 { 381 {
382 struct scatterlist *sg = SCpnt->request_buffer; 382 struct scatterlist *sg;
383 int i, size = 0; 383 int i, size = 0, sg_count = scsi_sg_count(SCpnt);
384 for (i = 0; i < SCpnt->use_sg; i++)
385 size += sg[i].length;
386 384
387 if (size != SCpnt->request_bufflen && SCpnt->use_sg) { 385 scsi_for_each_sg(SCpnt, sg, sg_count, i)
386 size += sg->length;
387
388 if (size != scsi_bufflen(SCpnt)) {
388 scmd_printk(KERN_ERR, SCpnt, 389 scmd_printk(KERN_ERR, SCpnt,
389 "mismatch count %d, bytes %d\n", 390 "mismatch count %d, bytes %d\n",
390 size, SCpnt->request_bufflen); 391 size, scsi_bufflen(SCpnt));
391 if (SCpnt->request_bufflen > size) 392 if (scsi_bufflen(SCpnt) > size)
392 SCpnt->request_bufflen = size; 393 SCpnt->sdb.length = size;
393 } 394 }
394 } 395 }
395 396
@@ -397,12 +398,12 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq)
397 * request doesn't start on hw block boundary, add scatter pads 398 * request doesn't start on hw block boundary, add scatter pads
398 */ 399 */
399 if (((unsigned int)rq->sector % (s_size >> 9)) || 400 if (((unsigned int)rq->sector % (s_size >> 9)) ||
400 (SCpnt->request_bufflen % s_size)) { 401 (scsi_bufflen(SCpnt) % s_size)) {
401 scmd_printk(KERN_NOTICE, SCpnt, "unaligned transfer\n"); 402 scmd_printk(KERN_NOTICE, SCpnt, "unaligned transfer\n");
402 goto out; 403 goto out;
403 } 404 }
404 405
405 this_count = (SCpnt->request_bufflen >> 9) / (s_size >> 9); 406 this_count = (scsi_bufflen(SCpnt) >> 9) / (s_size >> 9);
406 407
407 408
408 SCSI_LOG_HLQUEUE(2, printk("%s : %s %d/%ld 512 byte blocks.\n", 409 SCSI_LOG_HLQUEUE(2, printk("%s : %s %d/%ld 512 byte blocks.\n",
@@ -416,7 +417,7 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq)
416 417
417 if (this_count > 0xffff) { 418 if (this_count > 0xffff) {
418 this_count = 0xffff; 419 this_count = 0xffff;
419 SCpnt->request_bufflen = this_count * s_size; 420 SCpnt->sdb.length = this_count * s_size;
420 } 421 }
421 422
422 SCpnt->cmnd[2] = (unsigned char) (block >> 24) & 0xff; 423 SCpnt->cmnd[2] = (unsigned char) (block >> 24) & 0xff;
diff --git a/drivers/scsi/stex.c b/drivers/scsi/stex.c
index e3fab3a6aed7..72f6d8015358 100644
--- a/drivers/scsi/stex.c
+++ b/drivers/scsi/stex.c
@@ -1123,7 +1123,6 @@ static struct scsi_host_template driver_template = {
1123 .this_id = -1, 1123 .this_id = -1,
1124 .sg_tablesize = ST_MAX_SG, 1124 .sg_tablesize = ST_MAX_SG,
1125 .cmd_per_lun = ST_CMD_PER_LUN, 1125 .cmd_per_lun = ST_CMD_PER_LUN,
1126 .use_sg_chaining = ENABLE_SG_CHAINING,
1127}; 1126};
1128 1127
1129static int stex_set_dma_mask(struct pci_dev * pdev) 1128static int stex_set_dma_mask(struct pci_dev * pdev)
diff --git a/drivers/scsi/sym53c416.c b/drivers/scsi/sym53c416.c
index 1f6fd1680335..6325901e5093 100644
--- a/drivers/scsi/sym53c416.c
+++ b/drivers/scsi/sym53c416.c
@@ -840,6 +840,5 @@ static struct scsi_host_template driver_template = {
840 .cmd_per_lun = 1, 840 .cmd_per_lun = 1,
841 .unchecked_isa_dma = 1, 841 .unchecked_isa_dma = 1,
842 .use_clustering = ENABLE_CLUSTERING, 842 .use_clustering = ENABLE_CLUSTERING,
843 .use_sg_chaining = ENABLE_SG_CHAINING,
844}; 843};
845#include "scsi_module.c" 844#include "scsi_module.c"
diff --git a/drivers/scsi/sym53c8xx_2/sym_glue.c b/drivers/scsi/sym53c8xx_2/sym_glue.c
index 21e926dcdab0..d39107b7669b 100644
--- a/drivers/scsi/sym53c8xx_2/sym_glue.c
+++ b/drivers/scsi/sym53c8xx_2/sym_glue.c
@@ -207,7 +207,7 @@ void sym_set_cam_result_error(struct sym_hcb *np, struct sym_ccb *cp, int resid)
207 /* 207 /*
208 * Bounce back the sense data to user. 208 * Bounce back the sense data to user.
209 */ 209 */
210 memset(&cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); 210 memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE);
211 memcpy(cmd->sense_buffer, cp->sns_bbuf, 211 memcpy(cmd->sense_buffer, cp->sns_bbuf,
212 min(SCSI_SENSE_BUFFERSIZE, SYM_SNS_BBUF_LEN)); 212 min(SCSI_SENSE_BUFFERSIZE, SYM_SNS_BBUF_LEN));
213#if 0 213#if 0
@@ -1681,7 +1681,6 @@ static struct scsi_host_template sym2_template = {
1681 .eh_host_reset_handler = sym53c8xx_eh_host_reset_handler, 1681 .eh_host_reset_handler = sym53c8xx_eh_host_reset_handler,
1682 .this_id = 7, 1682 .this_id = 7,
1683 .use_clustering = ENABLE_CLUSTERING, 1683 .use_clustering = ENABLE_CLUSTERING,
1684 .use_sg_chaining = ENABLE_SG_CHAINING,
1685 .max_sectors = 0xFFFF, 1684 .max_sectors = 0xFFFF,
1686#ifdef SYM_LINUX_PROC_INFO_SUPPORT 1685#ifdef SYM_LINUX_PROC_INFO_SUPPORT
1687 .proc_info = sym53c8xx_proc_info, 1686 .proc_info = sym53c8xx_proc_info,
diff --git a/drivers/scsi/u14-34f.c b/drivers/scsi/u14-34f.c
index 4bc5407f9695..662c00451be4 100644
--- a/drivers/scsi/u14-34f.c
+++ b/drivers/scsi/u14-34f.c
@@ -451,7 +451,6 @@ static struct scsi_host_template driver_template = {
451 .this_id = 7, 451 .this_id = 7,
452 .unchecked_isa_dma = 1, 452 .unchecked_isa_dma = 1,
453 .use_clustering = ENABLE_CLUSTERING, 453 .use_clustering = ENABLE_CLUSTERING,
454 .use_sg_chaining = ENABLE_SG_CHAINING,
455 }; 454 };
456 455
457#if !defined(__BIG_ENDIAN_BITFIELD) && !defined(__LITTLE_ENDIAN_BITFIELD) 456#if !defined(__BIG_ENDIAN_BITFIELD) && !defined(__LITTLE_ENDIAN_BITFIELD)
diff --git a/drivers/scsi/ultrastor.c b/drivers/scsi/ultrastor.c
index 75eca6b22db5..f385dce8dfbe 100644
--- a/drivers/scsi/ultrastor.c
+++ b/drivers/scsi/ultrastor.c
@@ -1204,6 +1204,5 @@ static struct scsi_host_template driver_template = {
1204 .cmd_per_lun = ULTRASTOR_MAX_CMDS_PER_LUN, 1204 .cmd_per_lun = ULTRASTOR_MAX_CMDS_PER_LUN,
1205 .unchecked_isa_dma = 1, 1205 .unchecked_isa_dma = 1,
1206 .use_clustering = ENABLE_CLUSTERING, 1206 .use_clustering = ENABLE_CLUSTERING,
1207 .use_sg_chaining = ENABLE_SG_CHAINING,
1208}; 1207};
1209#include "scsi_module.c" 1208#include "scsi_module.c"
diff --git a/drivers/scsi/wd7000.c b/drivers/scsi/wd7000.c
index b4304ae78527..c975c01b3a02 100644
--- a/drivers/scsi/wd7000.c
+++ b/drivers/scsi/wd7000.c
@@ -1671,7 +1671,6 @@ static struct scsi_host_template driver_template = {
1671 .cmd_per_lun = 1, 1671 .cmd_per_lun = 1,
1672 .unchecked_isa_dma = 1, 1672 .unchecked_isa_dma = 1,
1673 .use_clustering = ENABLE_CLUSTERING, 1673 .use_clustering = ENABLE_CLUSTERING,
1674 .use_sg_chaining = ENABLE_SG_CHAINING,
1675}; 1674};
1676 1675
1677#include "scsi_module.c" 1676#include "scsi_module.c"
diff --git a/drivers/usb/storage/isd200.c b/drivers/usb/storage/isd200.c
index 178e8c2a8a2f..0db488624ab1 100644
--- a/drivers/usb/storage/isd200.c
+++ b/drivers/usb/storage/isd200.c
@@ -415,14 +415,14 @@ static void isd200_set_srb(struct isd200_info *info,
415 sg_init_one(&info->sg, buff, bufflen); 415 sg_init_one(&info->sg, buff, bufflen);
416 416
417 srb->sc_data_direction = dir; 417 srb->sc_data_direction = dir;
418 srb->request_buffer = buff ? &info->sg : NULL; 418 srb->sdb.table.sgl = buff ? &info->sg : NULL;
419 srb->request_bufflen = bufflen; 419 srb->sdb.length = bufflen;
420 srb->use_sg = buff ? 1 : 0; 420 srb->sdb.table.nents = buff ? 1 : 0;
421} 421}
422 422
423static void isd200_srb_set_bufflen(struct scsi_cmnd *srb, unsigned bufflen) 423static void isd200_srb_set_bufflen(struct scsi_cmnd *srb, unsigned bufflen)
424{ 424{
425 srb->request_bufflen = bufflen; 425 srb->sdb.length = bufflen;
426} 426}
427 427
428 428
diff --git a/drivers/video/vermilion/vermilion.c b/drivers/video/vermilion/vermilion.c
index c31f549ebea0..1c656667b937 100644
--- a/drivers/video/vermilion/vermilion.c
+++ b/drivers/video/vermilion/vermilion.c
@@ -88,9 +88,7 @@ static int vmlfb_alloc_vram_area(struct vram_area *va, unsigned max_order,
88{ 88{
89 gfp_t flags; 89 gfp_t flags;
90 unsigned long i; 90 unsigned long i;
91 pgprot_t wc_pageprot;
92 91
93 wc_pageprot = PAGE_KERNEL_NOCACHE;
94 max_order++; 92 max_order++;
95 do { 93 do {
96 /* 94 /*
@@ -126,14 +124,8 @@ static int vmlfb_alloc_vram_area(struct vram_area *va, unsigned max_order,
126 /* 124 /*
127 * Change caching policy of the linear kernel map to avoid 125 * Change caching policy of the linear kernel map to avoid
128 * mapping type conflicts with user-space mappings. 126 * mapping type conflicts with user-space mappings.
129 * The first global_flush_tlb() is really only there to do a global
130 * wbinvd().
131 */ 127 */
132 128 set_pages_uc(virt_to_page(va->logical), va->size >> PAGE_SHIFT);
133 global_flush_tlb();
134 change_page_attr(virt_to_page(va->logical), va->size >> PAGE_SHIFT,
135 wc_pageprot);
136 global_flush_tlb();
137 129
138 printk(KERN_DEBUG MODULE_NAME 130 printk(KERN_DEBUG MODULE_NAME
139 ": Allocated %ld bytes vram area at 0x%08lx\n", 131 ": Allocated %ld bytes vram area at 0x%08lx\n",
@@ -157,9 +149,8 @@ static void vmlfb_free_vram_area(struct vram_area *va)
157 * Reset the linear kernel map caching policy. 149 * Reset the linear kernel map caching policy.
158 */ 150 */
159 151
160 change_page_attr(virt_to_page(va->logical), 152 set_pages_wb(virt_to_page(va->logical),
161 va->size >> PAGE_SHIFT, PAGE_KERNEL); 153 va->size >> PAGE_SHIFT);
162 global_flush_tlb();
163 154
164 /* 155 /*
165 * Decrease the usage count on the pages we've used 156 * Decrease the usage count on the pages we've used
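The vermilion conversion drops change_page_attr()/global_flush_tlb() in favour of the new x86 set_pages_uc()/set_pages_wb() helpers, which perform the required flushing internally. A short sketch of marking a linear-map buffer uncached and restoring it, assuming an x86 build and a buffer obtained from __get_free_pages(); the my_* names are illustrative.

	#include <linux/gfp.h>
	#include <linux/mm.h>
	#include <asm/cacheflush.h>

	static unsigned long my_buf;

	static int my_make_uncached(unsigned int order)
	{
		my_buf = __get_free_pages(GFP_KERNEL, order);
		if (!my_buf)
			return -ENOMEM;
		/* Switch the kernel linear mapping of these pages to uncached. */
		return set_pages_uc(virt_to_page(my_buf), 1 << order);
	}

	static void my_make_cached_again(unsigned int order)
	{
		/* Restore write-back caching before freeing the pages. */
		set_pages_wb(virt_to_page(my_buf), 1 << order);
		free_pages(my_buf, order);
	}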
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index d4fc6095466d..7c3d5f923da1 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -23,6 +23,10 @@ config BINFMT_ELF
23 ld.so (check the file <file:Documentation/Changes> for location and 23 ld.so (check the file <file:Documentation/Changes> for location and
24 latest version). 24 latest version).
25 25
26config COMPAT_BINFMT_ELF
27 bool
28 depends on COMPAT && MMU
29
26config BINFMT_ELF_FDPIC 30config BINFMT_ELF_FDPIC
27 bool "Kernel support for FDPIC ELF binaries" 31 bool "Kernel support for FDPIC ELF binaries"
28 default y 32 default y
diff --git a/fs/Makefile b/fs/Makefile
index 500cf15cdb4b..1e7a11bd4da1 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -39,6 +39,7 @@ obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o
39obj-y += binfmt_script.o 39obj-y += binfmt_script.o
40 40
41obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o 41obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o
42obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o
42obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o 43obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o
43obj-$(CONFIG_BINFMT_SOM) += binfmt_som.o 44obj-$(CONFIG_BINFMT_SOM) += binfmt_som.o
44obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o 45obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o
diff --git a/fs/aio.c b/fs/aio.c
index 9dec7d2d546e..8a37dbbf3437 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -397,7 +397,7 @@ void fastcall __put_ioctx(struct kioctx *ctx)
397 * This prevents races between the aio code path referencing the 397 * This prevents races between the aio code path referencing the
398 * req (after submitting it) and aio_complete() freeing the req. 398 * req (after submitting it) and aio_complete() freeing the req.
399 */ 399 */
400static struct kiocb *FASTCALL(__aio_get_req(struct kioctx *ctx)); 400static struct kiocb *__aio_get_req(struct kioctx *ctx);
401static struct kiocb fastcall *__aio_get_req(struct kioctx *ctx) 401static struct kiocb fastcall *__aio_get_req(struct kioctx *ctx)
402{ 402{
403 struct kiocb *req = NULL; 403 struct kiocb *req = NULL;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index f0b3171842f2..18ed6dd906c1 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -45,7 +45,8 @@
45 45
46static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs); 46static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs);
47static int load_elf_library(struct file *); 47static int load_elf_library(struct file *);
48static unsigned long elf_map (struct file *, unsigned long, struct elf_phdr *, int, int); 48static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
49 int, int, unsigned long);
49 50
50/* 51/*
51 * If we don't support core dumping, then supply a NULL so we 52 * If we don't support core dumping, then supply a NULL so we
@@ -298,33 +299,70 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
298#ifndef elf_map 299#ifndef elf_map
299 300
300static unsigned long elf_map(struct file *filep, unsigned long addr, 301static unsigned long elf_map(struct file *filep, unsigned long addr,
301 struct elf_phdr *eppnt, int prot, int type) 302 struct elf_phdr *eppnt, int prot, int type,
303 unsigned long total_size)
302{ 304{
303 unsigned long map_addr; 305 unsigned long map_addr;
304 unsigned long pageoffset = ELF_PAGEOFFSET(eppnt->p_vaddr); 306 unsigned long size = eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr);
307 unsigned long off = eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr);
308 addr = ELF_PAGESTART(addr);
309 size = ELF_PAGEALIGN(size);
305 310
306 down_write(&current->mm->mmap_sem);
307 /* mmap() will return -EINVAL if given a zero size, but a 311 /* mmap() will return -EINVAL if given a zero size, but a
308 * segment with zero filesize is perfectly valid */ 312 * segment with zero filesize is perfectly valid */
309 if (eppnt->p_filesz + pageoffset) 313 if (!size)
310 map_addr = do_mmap(filep, ELF_PAGESTART(addr), 314 return addr;
311 eppnt->p_filesz + pageoffset, prot, type, 315
312 eppnt->p_offset - pageoffset); 316 down_write(&current->mm->mmap_sem);
313 else 317 /*
314 map_addr = ELF_PAGESTART(addr); 318 * total_size is the size of the ELF (interpreter) image.
319 * The _first_ mmap needs to know the full size, otherwise
320 * randomization might put this image into an overlapping
321 * position with the ELF binary image. (since size < total_size)
322 * So we first map the 'big' image - and unmap the remainder at
323 * the end. (which unmap is needed for ELF images with holes.)
324 */
325 if (total_size) {
326 total_size = ELF_PAGEALIGN(total_size);
327 map_addr = do_mmap(filep, addr, total_size, prot, type, off);
328 if (!BAD_ADDR(map_addr))
329 do_munmap(current->mm, map_addr+size, total_size-size);
330 } else
331 map_addr = do_mmap(filep, addr, size, prot, type, off);
332
315 up_write(&current->mm->mmap_sem); 333 up_write(&current->mm->mmap_sem);
316 return(map_addr); 334 return(map_addr);
317} 335}
318 336
319#endif /* !elf_map */ 337#endif /* !elf_map */
320 338
339static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
340{
341 int i, first_idx = -1, last_idx = -1;
342
343 for (i = 0; i < nr; i++) {
344 if (cmds[i].p_type == PT_LOAD) {
345 last_idx = i;
346 if (first_idx == -1)
347 first_idx = i;
348 }
349 }
350 if (first_idx == -1)
351 return 0;
352
353 return cmds[last_idx].p_vaddr + cmds[last_idx].p_memsz -
354 ELF_PAGESTART(cmds[first_idx].p_vaddr);
355}
356
357
321/* This is much more generalized than the library routine read function, 358/* This is much more generalized than the library routine read function,
322 so we keep this separate. Technically the library read function 359 so we keep this separate. Technically the library read function
323 is only provided so that we can read a.out libraries that have 360 is only provided so that we can read a.out libraries that have
324 an ELF header */ 361 an ELF header */
325 362
326static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, 363static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
327 struct file *interpreter, unsigned long *interp_load_addr) 364 struct file *interpreter, unsigned long *interp_map_addr,
365 unsigned long no_base)
328{ 366{
329 struct elf_phdr *elf_phdata; 367 struct elf_phdr *elf_phdata;
330 struct elf_phdr *eppnt; 368 struct elf_phdr *eppnt;
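total_mapping_size() above returns the span the first interpreter mmap has to reserve: from the page-aligned start of the first PT_LOAD segment to p_vaddr + p_memsz of the last one. A small user-space illustration of the same arithmetic with two hypothetical program headers and 4 KiB pages; it mirrors only the calculation, not the kernel code.

	#include <elf.h>
	#include <stdio.h>

	#define MY_PAGESTART(v)	((v) & ~0xfffUL)	/* assume 4 KiB pages */

	int main(void)
	{
		/* Two made-up PT_LOAD segments of an interpreter image. */
		Elf64_Phdr ph[2] = {
			{ .p_type = PT_LOAD, .p_vaddr = 0x0000,   .p_memsz = 0x3000 },
			{ .p_type = PT_LOAD, .p_vaddr = 0x200000, .p_memsz = 0x1200 },
		};
		unsigned long span = ph[1].p_vaddr + ph[1].p_memsz -
				     MY_PAGESTART(ph[0].p_vaddr);

		printf("first mmap must cover 0x%lx bytes\n", span);	/* 0x201200 */
		return 0;
	}

elf_map() then maps that whole span in one call and munmaps the tail beyond the current segment, so later segments of the interpreter cannot collide with a randomized executable mapping.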
@@ -332,6 +370,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
332 int load_addr_set = 0; 370 int load_addr_set = 0;
333 unsigned long last_bss = 0, elf_bss = 0; 371 unsigned long last_bss = 0, elf_bss = 0;
334 unsigned long error = ~0UL; 372 unsigned long error = ~0UL;
373 unsigned long total_size;
335 int retval, i, size; 374 int retval, i, size;
336 375
337 /* First of all, some simple consistency checks */ 376 /* First of all, some simple consistency checks */
@@ -370,6 +409,12 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
370 goto out_close; 409 goto out_close;
371 } 410 }
372 411
412 total_size = total_mapping_size(elf_phdata, interp_elf_ex->e_phnum);
413 if (!total_size) {
414 error = -EINVAL;
415 goto out_close;
416 }
417
373 eppnt = elf_phdata; 418 eppnt = elf_phdata;
374 for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) { 419 for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
375 if (eppnt->p_type == PT_LOAD) { 420 if (eppnt->p_type == PT_LOAD) {
@@ -387,9 +432,14 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
387 vaddr = eppnt->p_vaddr; 432 vaddr = eppnt->p_vaddr;
388 if (interp_elf_ex->e_type == ET_EXEC || load_addr_set) 433 if (interp_elf_ex->e_type == ET_EXEC || load_addr_set)
389 elf_type |= MAP_FIXED; 434 elf_type |= MAP_FIXED;
435 else if (no_base && interp_elf_ex->e_type == ET_DYN)
436 load_addr = -vaddr;
390 437
391 map_addr = elf_map(interpreter, load_addr + vaddr, 438 map_addr = elf_map(interpreter, load_addr + vaddr,
392 eppnt, elf_prot, elf_type); 439 eppnt, elf_prot, elf_type, total_size);
440 total_size = 0;
441 if (!*interp_map_addr)
442 *interp_map_addr = map_addr;
393 error = map_addr; 443 error = map_addr;
394 if (BAD_ADDR(map_addr)) 444 if (BAD_ADDR(map_addr))
395 goto out_close; 445 goto out_close;
@@ -455,8 +505,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
455 goto out_close; 505 goto out_close;
456 } 506 }
457 507
458 *interp_load_addr = load_addr; 508 error = load_addr;
459 error = ((unsigned long)interp_elf_ex->e_entry) + load_addr;
460 509
461out_close: 510out_close:
462 kfree(elf_phdata); 511 kfree(elf_phdata);
@@ -546,14 +595,14 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
546 int load_addr_set = 0; 595 int load_addr_set = 0;
547 char * elf_interpreter = NULL; 596 char * elf_interpreter = NULL;
548 unsigned int interpreter_type = INTERPRETER_NONE; 597 unsigned int interpreter_type = INTERPRETER_NONE;
549 unsigned char ibcs2_interpreter = 0;
550 unsigned long error; 598 unsigned long error;
551 struct elf_phdr *elf_ppnt, *elf_phdata; 599 struct elf_phdr *elf_ppnt, *elf_phdata;
552 unsigned long elf_bss, elf_brk; 600 unsigned long elf_bss, elf_brk;
553 int elf_exec_fileno; 601 int elf_exec_fileno;
554 int retval, i; 602 int retval, i;
555 unsigned int size; 603 unsigned int size;
556 unsigned long elf_entry, interp_load_addr = 0; 604 unsigned long elf_entry;
605 unsigned long interp_load_addr = 0;
557 unsigned long start_code, end_code, start_data, end_data; 606 unsigned long start_code, end_code, start_data, end_data;
558 unsigned long reloc_func_desc = 0; 607 unsigned long reloc_func_desc = 0;
559 char passed_fileno[6]; 608 char passed_fileno[6];
@@ -663,14 +712,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
663 if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0') 712 if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0')
664 goto out_free_interp; 713 goto out_free_interp;
665 714
666 /* If the program interpreter is one of these two,
667 * then assume an iBCS2 image. Otherwise assume
668 * a native linux image.
669 */
670 if (strcmp(elf_interpreter,"/usr/lib/libc.so.1") == 0 ||
671 strcmp(elf_interpreter,"/usr/lib/ld.so.1") == 0)
672 ibcs2_interpreter = 1;
673
674 /* 715 /*
675 * The early SET_PERSONALITY here is so that the lookup 716 * The early SET_PERSONALITY here is so that the lookup
676 * for the interpreter happens in the namespace of the 717 * for the interpreter happens in the namespace of the
@@ -690,7 +731,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
690 * switch really is going to happen - do this in 731 * switch really is going to happen - do this in
691 * flush_thread(). - akpm 732 * flush_thread(). - akpm
692 */ 733 */
693 SET_PERSONALITY(loc->elf_ex, ibcs2_interpreter); 734 SET_PERSONALITY(loc->elf_ex, 0);
694 735
695 interpreter = open_exec(elf_interpreter); 736 interpreter = open_exec(elf_interpreter);
696 retval = PTR_ERR(interpreter); 737 retval = PTR_ERR(interpreter);
@@ -769,7 +810,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
769 goto out_free_dentry; 810 goto out_free_dentry;
770 } else { 811 } else {
771 /* Executables without an interpreter also need a personality */ 812 /* Executables without an interpreter also need a personality */
772 SET_PERSONALITY(loc->elf_ex, ibcs2_interpreter); 813 SET_PERSONALITY(loc->elf_ex, 0);
773 } 814 }
774 815
775 /* OK, we are done with that, now set up the arg stuff, 816 /* OK, we are done with that, now set up the arg stuff,
@@ -803,7 +844,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
803 844
804 /* Do this immediately, since STACK_TOP as used in setup_arg_pages 845 /* Do this immediately, since STACK_TOP as used in setup_arg_pages
805 may depend on the personality. */ 846 may depend on the personality. */
806 SET_PERSONALITY(loc->elf_ex, ibcs2_interpreter); 847 SET_PERSONALITY(loc->elf_ex, 0);
807 if (elf_read_implies_exec(loc->elf_ex, executable_stack)) 848 if (elf_read_implies_exec(loc->elf_ex, executable_stack))
808 current->personality |= READ_IMPLIES_EXEC; 849 current->personality |= READ_IMPLIES_EXEC;
809 850
@@ -825,9 +866,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
825 current->mm->start_stack = bprm->p; 866 current->mm->start_stack = bprm->p;
826 867
827 /* Now we do a little grungy work by mmaping the ELF image into 868 /* Now we do a little grungy work by mmaping the ELF image into
828 the correct location in memory. At this point, we assume that 869 the correct location in memory. */
829 the image should be loaded at fixed address, not at a variable
830 address. */
831 for(i = 0, elf_ppnt = elf_phdata; 870 for(i = 0, elf_ppnt = elf_phdata;
832 i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { 871 i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
833 int elf_prot = 0, elf_flags; 872 int elf_prot = 0, elf_flags;
@@ -881,11 +920,15 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
881 * default mmap base, as well as whatever program they 920 * default mmap base, as well as whatever program they
882 * might try to exec. This is because the brk will 921 * might try to exec. This is because the brk will
883 * follow the loader, and is not movable. */ 922 * follow the loader, and is not movable. */
923#ifdef CONFIG_X86
924 load_bias = 0;
925#else
884 load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); 926 load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
927#endif
885 } 928 }
886 929
887 error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, 930 error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
888 elf_prot, elf_flags); 931 elf_prot, elf_flags, 0);
889 if (BAD_ADDR(error)) { 932 if (BAD_ADDR(error)) {
890 send_sig(SIGKILL, current, 0); 933 send_sig(SIGKILL, current, 0);
891 retval = IS_ERR((void *)error) ? 934 retval = IS_ERR((void *)error) ?
@@ -961,13 +1004,25 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
961 } 1004 }
962 1005
963 if (elf_interpreter) { 1006 if (elf_interpreter) {
964 if (interpreter_type == INTERPRETER_AOUT) 1007 if (interpreter_type == INTERPRETER_AOUT) {
965 elf_entry = load_aout_interp(&loc->interp_ex, 1008 elf_entry = load_aout_interp(&loc->interp_ex,
966 interpreter); 1009 interpreter);
967 else 1010 } else {
1011 unsigned long uninitialized_var(interp_map_addr);
1012
968 elf_entry = load_elf_interp(&loc->interp_elf_ex, 1013 elf_entry = load_elf_interp(&loc->interp_elf_ex,
969 interpreter, 1014 interpreter,
970 &interp_load_addr); 1015 &interp_map_addr,
1016 load_bias);
1017 if (!IS_ERR((void *)elf_entry)) {
1018 /*
1019 * load_elf_interp() returns relocation
1020 * adjustment
1021 */
1022 interp_load_addr = elf_entry;
1023 elf_entry += loc->interp_elf_ex.e_entry;
1024 }
1025 }
971 if (BAD_ADDR(elf_entry)) { 1026 if (BAD_ADDR(elf_entry)) {
972 force_sig(SIGSEGV, current); 1027 force_sig(SIGSEGV, current);
973 retval = IS_ERR((void *)elf_entry) ? 1028 retval = IS_ERR((void *)elf_entry) ?
@@ -1021,6 +1076,12 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
1021 current->mm->end_data = end_data; 1076 current->mm->end_data = end_data;
1022 current->mm->start_stack = bprm->p; 1077 current->mm->start_stack = bprm->p;
1023 1078
1079#ifdef arch_randomize_brk
1080 if (current->flags & PF_RANDOMIZE)
1081 current->mm->brk = current->mm->start_brk =
1082 arch_randomize_brk(current->mm);
1083#endif
1084
1024 if (current->personality & MMAP_PAGE_ZERO) { 1085 if (current->personality & MMAP_PAGE_ZERO) {
1025 /* Why this, you ask??? Well SVr4 maps page 0 as read-only, 1086 /* Why this, you ask??? Well SVr4 maps page 0 as read-only,
1026 and some applications "depend" upon this behavior. 1087 and some applications "depend" upon this behavior.
@@ -1325,7 +1386,8 @@ static int writenote(struct memelfnote *men, struct file *file,
1325 if (!dump_seek(file, (off))) \ 1386 if (!dump_seek(file, (off))) \
1326 goto end_coredump; 1387 goto end_coredump;
1327 1388
1328static void fill_elf_header(struct elfhdr *elf, int segs) 1389static void fill_elf_header(struct elfhdr *elf, int segs,
1390 u16 machine, u32 flags, u8 osabi)
1329{ 1391{
1330 memcpy(elf->e_ident, ELFMAG, SELFMAG); 1392 memcpy(elf->e_ident, ELFMAG, SELFMAG);
1331 elf->e_ident[EI_CLASS] = ELF_CLASS; 1393 elf->e_ident[EI_CLASS] = ELF_CLASS;
@@ -1335,12 +1397,12 @@ static void fill_elf_header(struct elfhdr *elf, int segs)
1335 memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD); 1397 memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD);
1336 1398
1337 elf->e_type = ET_CORE; 1399 elf->e_type = ET_CORE;
1338 elf->e_machine = ELF_ARCH; 1400 elf->e_machine = machine;
1339 elf->e_version = EV_CURRENT; 1401 elf->e_version = EV_CURRENT;
1340 elf->e_entry = 0; 1402 elf->e_entry = 0;
1341 elf->e_phoff = sizeof(struct elfhdr); 1403 elf->e_phoff = sizeof(struct elfhdr);
1342 elf->e_shoff = 0; 1404 elf->e_shoff = 0;
1343 elf->e_flags = ELF_CORE_EFLAGS; 1405 elf->e_flags = flags;
1344 elf->e_ehsize = sizeof(struct elfhdr); 1406 elf->e_ehsize = sizeof(struct elfhdr);
1345 elf->e_phentsize = sizeof(struct elf_phdr); 1407 elf->e_phentsize = sizeof(struct elf_phdr);
1346 elf->e_phnum = segs; 1408 elf->e_phnum = segs;
@@ -1447,6 +1509,238 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
1447 return 0; 1509 return 0;
1448} 1510}
1449 1511
1512static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm)
1513{
1514 elf_addr_t *auxv = (elf_addr_t *) mm->saved_auxv;
1515 int i = 0;
1516 do
1517 i += 2;
1518 while (auxv[i - 2] != AT_NULL);
1519 fill_note(note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv);
1520}
1521
1522#ifdef CORE_DUMP_USE_REGSET
1523#include <linux/regset.h>
1524
1525struct elf_thread_core_info {
1526 struct elf_thread_core_info *next;
1527 struct task_struct *task;
1528 struct elf_prstatus prstatus;
1529 struct memelfnote notes[0];
1530};
1531
1532struct elf_note_info {
1533 struct elf_thread_core_info *thread;
1534 struct memelfnote psinfo;
1535 struct memelfnote auxv;
1536 size_t size;
1537 int thread_notes;
1538};
1539
1540static int fill_thread_core_info(struct elf_thread_core_info *t,
1541 const struct user_regset_view *view,
1542 long signr, size_t *total)
1543{
1544 unsigned int i;
1545
1546 /*
1547 * NT_PRSTATUS is the one special case, because the regset data
1548 * goes into the pr_reg field inside the note contents, rather
1549	 * than being the whole note contents. We fill the rest in here.
1550 * We assume that regset 0 is NT_PRSTATUS.
1551 */
1552 fill_prstatus(&t->prstatus, t->task, signr);
1553 (void) view->regsets[0].get(t->task, &view->regsets[0],
1554 0, sizeof(t->prstatus.pr_reg),
1555 &t->prstatus.pr_reg, NULL);
1556
1557 fill_note(&t->notes[0], "CORE", NT_PRSTATUS,
1558 sizeof(t->prstatus), &t->prstatus);
1559 *total += notesize(&t->notes[0]);
1560
1561 /*
1562 * Each other regset might generate a note too. For each regset
1563 * that has no core_note_type or is inactive, we leave t->notes[i]
1564 * all zero and we'll know to skip writing it later.
1565 */
1566 for (i = 1; i < view->n; ++i) {
1567 const struct user_regset *regset = &view->regsets[i];
1568 if (regset->core_note_type &&
1569 (!regset->active || regset->active(t->task, regset))) {
1570 int ret;
1571 size_t size = regset->n * regset->size;
1572 void *data = kmalloc(size, GFP_KERNEL);
1573 if (unlikely(!data))
1574 return 0;
1575 ret = regset->get(t->task, regset,
1576 0, size, data, NULL);
1577 if (unlikely(ret))
1578 kfree(data);
1579 else {
1580 if (regset->core_note_type != NT_PRFPREG)
1581 fill_note(&t->notes[i], "LINUX",
1582 regset->core_note_type,
1583 size, data);
1584 else {
1585 t->prstatus.pr_fpvalid = 1;
1586 fill_note(&t->notes[i], "CORE",
1587 NT_PRFPREG, size, data);
1588 }
1589 *total += notesize(&t->notes[i]);
1590 }
1591 }
1592 }
1593
1594 return 1;
1595}
1596
1597static int fill_note_info(struct elfhdr *elf, int phdrs,
1598 struct elf_note_info *info,
1599 long signr, struct pt_regs *regs)
1600{
1601 struct task_struct *dump_task = current;
1602 const struct user_regset_view *view = task_user_regset_view(dump_task);
1603 struct elf_thread_core_info *t;
1604 struct elf_prpsinfo *psinfo;
1605 struct task_struct *g, *p;
1606 unsigned int i;
1607
1608 info->size = 0;
1609 info->thread = NULL;
1610
1611 psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
1612 fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
1613
1614 if (psinfo == NULL)
1615 return 0;
1616
1617 /*
1618 * Figure out how many notes we're going to need for each thread.
1619 */
1620 info->thread_notes = 0;
1621 for (i = 0; i < view->n; ++i)
1622 if (view->regsets[i].core_note_type != 0)
1623 ++info->thread_notes;
1624
1625 /*
1626	 * Sanity check. We rely on regset 0 being NT_PRSTATUS,
1627 * since it is our one special case.
1628 */
1629 if (unlikely(info->thread_notes == 0) ||
1630 unlikely(view->regsets[0].core_note_type != NT_PRSTATUS)) {
1631 WARN_ON(1);
1632 return 0;
1633 }
1634
1635 /*
1636 * Initialize the ELF file header.
1637 */
1638 fill_elf_header(elf, phdrs,
1639 view->e_machine, view->e_flags, view->ei_osabi);
1640
1641 /*
1642 * Allocate a structure for each thread.
1643 */
1644 rcu_read_lock();
1645 do_each_thread(g, p)
1646 if (p->mm == dump_task->mm) {
1647 t = kzalloc(offsetof(struct elf_thread_core_info,
1648 notes[info->thread_notes]),
1649 GFP_ATOMIC);
1650 if (unlikely(!t)) {
1651 rcu_read_unlock();
1652 return 0;
1653 }
1654 t->task = p;
1655 if (p == dump_task || !info->thread) {
1656 t->next = info->thread;
1657 info->thread = t;
1658 } else {
1659 /*
1660 * Make sure to keep the original task at
1661 * the head of the list.
1662 */
1663 t->next = info->thread->next;
1664 info->thread->next = t;
1665 }
1666 }
1667 while_each_thread(g, p);
1668 rcu_read_unlock();
1669
1670 /*
1671 * Now fill in each thread's information.
1672 */
1673 for (t = info->thread; t != NULL; t = t->next)
1674 if (!fill_thread_core_info(t, view, signr, &info->size))
1675 return 0;
1676
1677 /*
1678 * Fill in the two process-wide notes.
1679 */
1680 fill_psinfo(psinfo, dump_task->group_leader, dump_task->mm);
1681 info->size += notesize(&info->psinfo);
1682
1683 fill_auxv_note(&info->auxv, current->mm);
1684 info->size += notesize(&info->auxv);
1685
1686 return 1;
1687}
1688
1689static size_t get_note_info_size(struct elf_note_info *info)
1690{
1691 return info->size;
1692}
1693
1694/*
1695 * Write all the notes for each thread. When writing the first thread, the
1696 * process-wide notes are interleaved after the first thread-specific note.
1697 */
1698static int write_note_info(struct elf_note_info *info,
1699 struct file *file, loff_t *foffset)
1700{
1701	bool first = true;
1702 struct elf_thread_core_info *t = info->thread;
1703
1704 do {
1705 int i;
1706
1707 if (!writenote(&t->notes[0], file, foffset))
1708 return 0;
1709
1710 if (first && !writenote(&info->psinfo, file, foffset))
1711 return 0;
1712 if (first && !writenote(&info->auxv, file, foffset))
1713 return 0;
1714
1715 for (i = 1; i < info->thread_notes; ++i)
1716 if (t->notes[i].data &&
1717 !writenote(&t->notes[i], file, foffset))
1718 return 0;
1719
1720 first = 0;
1721 t = t->next;
1722 } while (t);
1723
1724 return 1;
1725}
1726
1727static void free_note_info(struct elf_note_info *info)
1728{
1729 struct elf_thread_core_info *threads = info->thread;
1730 while (threads) {
1731 unsigned int i;
1732 struct elf_thread_core_info *t = threads;
1733 threads = t->next;
1734 WARN_ON(t->notes[0].data && t->notes[0].data != &t->prstatus);
1735 for (i = 1; i < info->thread_notes; ++i)
1736 kfree(t->notes[i].data);
1737 kfree(t);
1738 }
1739 kfree(info->psinfo.data);
1740}
1741
1742#else
1743
1450/* Here is the structure in which status of each thread is captured. */ 1744/* Here is the structure in which status of each thread is captured. */
1451struct elf_thread_status 1745struct elf_thread_status
1452{ 1746{
@@ -1499,6 +1793,176 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
1499 return sz; 1793 return sz;
1500} 1794}
1501 1795
1796struct elf_note_info {
1797 struct memelfnote *notes;
1798 struct elf_prstatus *prstatus; /* NT_PRSTATUS */
1799 struct elf_prpsinfo *psinfo; /* NT_PRPSINFO */
1800 struct list_head thread_list;
1801 elf_fpregset_t *fpu;
1802#ifdef ELF_CORE_COPY_XFPREGS
1803 elf_fpxregset_t *xfpu;
1804#endif
1805 int thread_status_size;
1806 int numnote;
1807};
1808
1809static int fill_note_info(struct elfhdr *elf, int phdrs,
1810 struct elf_note_info *info,
1811 long signr, struct pt_regs *regs)
1812{
1813#define NUM_NOTES 6
1814 struct list_head *t;
1815 struct task_struct *g, *p;
1816
1817 info->notes = NULL;
1818 info->prstatus = NULL;
1819 info->psinfo = NULL;
1820 info->fpu = NULL;
1821#ifdef ELF_CORE_COPY_XFPREGS
1822 info->xfpu = NULL;
1823#endif
1824 INIT_LIST_HEAD(&info->thread_list);
1825
1826 info->notes = kmalloc(NUM_NOTES * sizeof(struct memelfnote),
1827 GFP_KERNEL);
1828 if (!info->notes)
1829 return 0;
1830 info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL);
1831 if (!info->psinfo)
1832 return 0;
1833 info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL);
1834 if (!info->prstatus)
1835 return 0;
1836 info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL);
1837 if (!info->fpu)
1838 return 0;
1839#ifdef ELF_CORE_COPY_XFPREGS
1840 info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL);
1841 if (!info->xfpu)
1842 return 0;
1843#endif
1844
1845 info->thread_status_size = 0;
1846 if (signr) {
1847 struct elf_thread_status *tmp;
1848 rcu_read_lock();
1849 do_each_thread(g, p)
1850 if (current->mm == p->mm && current != p) {
1851 tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC);
1852 if (!tmp) {
1853 rcu_read_unlock();
1854 return 0;
1855 }
1856 tmp->thread = p;
1857 list_add(&tmp->list, &info->thread_list);
1858 }
1859 while_each_thread(g, p);
1860 rcu_read_unlock();
1861 list_for_each(t, &info->thread_list) {
1862 struct elf_thread_status *tmp;
1863 int sz;
1864
1865 tmp = list_entry(t, struct elf_thread_status, list);
1866 sz = elf_dump_thread_status(signr, tmp);
1867 info->thread_status_size += sz;
1868 }
1869 }
1870 /* now collect the dump for the current */
1871 memset(info->prstatus, 0, sizeof(*info->prstatus));
1872 fill_prstatus(info->prstatus, current, signr);
1873 elf_core_copy_regs(&info->prstatus->pr_reg, regs);
1874
1875 /* Set up header */
1876 fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS, ELF_OSABI);
1877
1878 /*
1879 * Set up the notes in similar form to SVR4 core dumps made
1880 * with info from their /proc.
1881 */
1882
1883 fill_note(info->notes + 0, "CORE", NT_PRSTATUS,
1884 sizeof(*info->prstatus), info->prstatus);
1885 fill_psinfo(info->psinfo, current->group_leader, current->mm);
1886 fill_note(info->notes + 1, "CORE", NT_PRPSINFO,
1887 sizeof(*info->psinfo), info->psinfo);
1888
1889 info->numnote = 2;
1890
1891 fill_auxv_note(&info->notes[info->numnote++], current->mm);
1892
1893 /* Try to dump the FPU. */
1894 info->prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs,
1895 info->fpu);
1896 if (info->prstatus->pr_fpvalid)
1897 fill_note(info->notes + info->numnote++,
1898 "CORE", NT_PRFPREG, sizeof(*info->fpu), info->fpu);
1899#ifdef ELF_CORE_COPY_XFPREGS
1900 if (elf_core_copy_task_xfpregs(current, info->xfpu))
1901 fill_note(info->notes + info->numnote++,
1902 "LINUX", ELF_CORE_XFPREG_TYPE,
1903 sizeof(*info->xfpu), info->xfpu);
1904#endif
1905
1906 return 1;
1907
1908#undef NUM_NOTES
1909}
1910
1911static size_t get_note_info_size(struct elf_note_info *info)
1912{
1913 int sz = 0;
1914 int i;
1915
1916 for (i = 0; i < info->numnote; i++)
1917 sz += notesize(info->notes + i);
1918
1919 sz += info->thread_status_size;
1920
1921 return sz;
1922}
1923
1924static int write_note_info(struct elf_note_info *info,
1925 struct file *file, loff_t *foffset)
1926{
1927 int i;
1928 struct list_head *t;
1929
1930 for (i = 0; i < info->numnote; i++)
1931 if (!writenote(info->notes + i, file, foffset))
1932 return 0;
1933
1934 /* write out the thread status notes section */
1935 list_for_each(t, &info->thread_list) {
1936 struct elf_thread_status *tmp =
1937 list_entry(t, struct elf_thread_status, list);
1938
1939 for (i = 0; i < tmp->num_notes; i++)
1940 if (!writenote(&tmp->notes[i], file, foffset))
1941 return 0;
1942 }
1943
1944 return 1;
1945}
1946
1947static void free_note_info(struct elf_note_info *info)
1948{
1949 while (!list_empty(&info->thread_list)) {
1950 struct list_head *tmp = info->thread_list.next;
1951 list_del(tmp);
1952 kfree(list_entry(tmp, struct elf_thread_status, list));
1953 }
1954
1955 kfree(info->prstatus);
1956 kfree(info->psinfo);
1957 kfree(info->notes);
1958 kfree(info->fpu);
1959#ifdef ELF_CORE_COPY_XFPREGS
1960 kfree(info->xfpu);
1961#endif
1962}
1963
1964#endif
1965
1502static struct vm_area_struct *first_vma(struct task_struct *tsk, 1966static struct vm_area_struct *first_vma(struct task_struct *tsk,
1503 struct vm_area_struct *gate_vma) 1967 struct vm_area_struct *gate_vma)
1504{ 1968{
@@ -1534,29 +1998,15 @@ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
1534 */ 1998 */
1535static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit) 1999static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit)
1536{ 2000{
1537#define NUM_NOTES 6
1538 int has_dumped = 0; 2001 int has_dumped = 0;
1539 mm_segment_t fs; 2002 mm_segment_t fs;
1540 int segs; 2003 int segs;
1541 size_t size = 0; 2004 size_t size = 0;
1542 int i;
1543 struct vm_area_struct *vma, *gate_vma; 2005 struct vm_area_struct *vma, *gate_vma;
1544 struct elfhdr *elf = NULL; 2006 struct elfhdr *elf = NULL;
1545 loff_t offset = 0, dataoff, foffset; 2007 loff_t offset = 0, dataoff, foffset;
1546 int numnote;
1547 struct memelfnote *notes = NULL;
1548 struct elf_prstatus *prstatus = NULL; /* NT_PRSTATUS */
1549 struct elf_prpsinfo *psinfo = NULL; /* NT_PRPSINFO */
1550 struct task_struct *g, *p;
1551 LIST_HEAD(thread_list);
1552 struct list_head *t;
1553 elf_fpregset_t *fpu = NULL;
1554#ifdef ELF_CORE_COPY_XFPREGS
1555 elf_fpxregset_t *xfpu = NULL;
1556#endif
1557 int thread_status_size = 0;
1558 elf_addr_t *auxv;
1559 unsigned long mm_flags; 2008 unsigned long mm_flags;
2009 struct elf_note_info info;
1560 2010
1561 /* 2011 /*
1562 * We no longer stop all VM operations. 2012 * We no longer stop all VM operations.
@@ -1574,52 +2024,6 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
1574 elf = kmalloc(sizeof(*elf), GFP_KERNEL); 2024 elf = kmalloc(sizeof(*elf), GFP_KERNEL);
1575 if (!elf) 2025 if (!elf)
1576 goto cleanup; 2026 goto cleanup;
1577 prstatus = kmalloc(sizeof(*prstatus), GFP_KERNEL);
1578 if (!prstatus)
1579 goto cleanup;
1580 psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
1581 if (!psinfo)
1582 goto cleanup;
1583 notes = kmalloc(NUM_NOTES * sizeof(struct memelfnote), GFP_KERNEL);
1584 if (!notes)
1585 goto cleanup;
1586 fpu = kmalloc(sizeof(*fpu), GFP_KERNEL);
1587 if (!fpu)
1588 goto cleanup;
1589#ifdef ELF_CORE_COPY_XFPREGS
1590 xfpu = kmalloc(sizeof(*xfpu), GFP_KERNEL);
1591 if (!xfpu)
1592 goto cleanup;
1593#endif
1594
1595 if (signr) {
1596 struct elf_thread_status *tmp;
1597 rcu_read_lock();
1598 do_each_thread(g,p)
1599 if (current->mm == p->mm && current != p) {
1600 tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC);
1601 if (!tmp) {
1602 rcu_read_unlock();
1603 goto cleanup;
1604 }
1605 tmp->thread = p;
1606 list_add(&tmp->list, &thread_list);
1607 }
1608 while_each_thread(g,p);
1609 rcu_read_unlock();
1610 list_for_each(t, &thread_list) {
1611 struct elf_thread_status *tmp;
1612 int sz;
1613
1614 tmp = list_entry(t, struct elf_thread_status, list);
1615 sz = elf_dump_thread_status(signr, tmp);
1616 thread_status_size += sz;
1617 }
1618 }
1619 /* now collect the dump for the current */
1620 memset(prstatus, 0, sizeof(*prstatus));
1621 fill_prstatus(prstatus, current, signr);
1622 elf_core_copy_regs(&prstatus->pr_reg, regs);
1623 2027
1624 segs = current->mm->map_count; 2028 segs = current->mm->map_count;
1625#ifdef ELF_CORE_EXTRA_PHDRS 2029#ifdef ELF_CORE_EXTRA_PHDRS
@@ -1630,42 +2034,16 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
1630 if (gate_vma != NULL) 2034 if (gate_vma != NULL)
1631 segs++; 2035 segs++;
1632 2036
1633 /* Set up header */
1634 fill_elf_header(elf, segs + 1); /* including notes section */
1635
1636 has_dumped = 1;
1637 current->flags |= PF_DUMPCORE;
1638
1639 /* 2037 /*
1640 * Set up the notes in similar form to SVR4 core dumps made 2038 * Collect all the non-memory information about the process for the
1641 * with info from their /proc. 2039 * notes. This also sets up the file header.
1642 */ 2040 */
2041 if (!fill_note_info(elf, segs + 1, /* including notes section */
2042 &info, signr, regs))
2043 goto cleanup;
1643 2044
1644 fill_note(notes + 0, "CORE", NT_PRSTATUS, sizeof(*prstatus), prstatus); 2045 has_dumped = 1;
1645 fill_psinfo(psinfo, current->group_leader, current->mm); 2046 current->flags |= PF_DUMPCORE;
1646 fill_note(notes + 1, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
1647
1648 numnote = 2;
1649
1650 auxv = (elf_addr_t *)current->mm->saved_auxv;
1651
1652 i = 0;
1653 do
1654 i += 2;
1655 while (auxv[i - 2] != AT_NULL);
1656 fill_note(&notes[numnote++], "CORE", NT_AUXV,
1657 i * sizeof(elf_addr_t), auxv);
1658
1659 /* Try to dump the FPU. */
1660 if ((prstatus->pr_fpvalid =
1661 elf_core_copy_task_fpregs(current, regs, fpu)))
1662 fill_note(notes + numnote++,
1663 "CORE", NT_PRFPREG, sizeof(*fpu), fpu);
1664#ifdef ELF_CORE_COPY_XFPREGS
1665 if (elf_core_copy_task_xfpregs(current, xfpu))
1666 fill_note(notes + numnote++,
1667 "LINUX", ELF_CORE_XFPREG_TYPE, sizeof(*xfpu), xfpu);
1668#endif
1669 2047
1670 fs = get_fs(); 2048 fs = get_fs();
1671 set_fs(KERNEL_DS); 2049 set_fs(KERNEL_DS);
@@ -1678,12 +2056,7 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
1678 /* Write notes phdr entry */ 2056 /* Write notes phdr entry */
1679 { 2057 {
1680 struct elf_phdr phdr; 2058 struct elf_phdr phdr;
1681 int sz = 0; 2059 size_t sz = get_note_info_size(&info);
1682
1683 for (i = 0; i < numnote; i++)
1684 sz += notesize(notes + i);
1685
1686 sz += thread_status_size;
1687 2060
1688 sz += elf_coredump_extra_notes_size(); 2061 sz += elf_coredump_extra_notes_size();
1689 2062
@@ -1728,23 +2101,12 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
1728#endif 2101#endif
1729 2102
1730 /* write out the notes section */ 2103 /* write out the notes section */
1731 for (i = 0; i < numnote; i++) 2104 if (!write_note_info(&info, file, &foffset))
1732 if (!writenote(notes + i, file, &foffset)) 2105 goto end_coredump;
1733 goto end_coredump;
1734 2106
1735 if (elf_coredump_extra_notes_write(file, &foffset)) 2107 if (elf_coredump_extra_notes_write(file, &foffset))
1736 goto end_coredump; 2108 goto end_coredump;
1737 2109
1738 /* write out the thread status notes section */
1739 list_for_each(t, &thread_list) {
1740 struct elf_thread_status *tmp =
1741 list_entry(t, struct elf_thread_status, list);
1742
1743 for (i = 0; i < tmp->num_notes; i++)
1744 if (!writenote(&tmp->notes[i], file, &foffset))
1745 goto end_coredump;
1746 }
1747
1748 /* Align to page */ 2110 /* Align to page */
1749 DUMP_SEEK(dataoff - foffset); 2111 DUMP_SEEK(dataoff - foffset);
1750 2112
@@ -1795,22 +2157,9 @@ end_coredump:
1795 set_fs(fs); 2157 set_fs(fs);
1796 2158
1797cleanup: 2159cleanup:
1798 while (!list_empty(&thread_list)) {
1799 struct list_head *tmp = thread_list.next;
1800 list_del(tmp);
1801 kfree(list_entry(tmp, struct elf_thread_status, list));
1802 }
1803
1804 kfree(elf); 2160 kfree(elf);
1805 kfree(prstatus); 2161 free_note_info(&info);
1806 kfree(psinfo);
1807 kfree(notes);
1808 kfree(fpu);
1809#ifdef ELF_CORE_COPY_XFPREGS
1810 kfree(xfpu);
1811#endif
1812 return has_dumped; 2162 return has_dumped;
1813#undef NUM_NOTES
1814} 2163}
1815 2164
1816#endif /* USE_ELF_CORE_DUMP */ 2165#endif /* USE_ELF_CORE_DUMP */
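The hunks above replace the open-coded prstatus/psinfo/auxv/FPU note bookkeeping in elf_core_dump() with one elf_note_info object driven by fill_note_info(), get_note_info_size(), write_note_info() and free_note_info(). As a rough illustration only (the names and types below are invented, not the kernel API), the same collect/size/write/free lifecycle looks like this in a standalone C sketch:

/* Hedged userspace analogue of the elf_note_info pattern above. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct note {			/* one named blob of dump data */
	const char *name;
	void *data;
	size_t len;
};

struct note_info {		/* everything gathered before any output */
	struct note notes[4];
	int count;
};

static int fill_note_info(struct note_info *info)
{
	/* collect all non-memory state up front, before writing */
	static const char status[] = "status: ok";

	info->notes[0].name = "STATUS";
	info->notes[0].len = sizeof(status) - 1;
	info->notes[0].data = malloc(info->notes[0].len);
	if (!info->notes[0].data)
		return 0;
	memcpy(info->notes[0].data, status, info->notes[0].len);
	info->count = 1;
	return 1;
}

static size_t get_note_info_size(const struct note_info *info)
{
	size_t sz = 0;
	int i;

	for (i = 0; i < info->count; i++)
		sz += info->notes[i].len;
	return sz;		/* lets the caller size the notes segment */
}

static int write_note_info(const struct note_info *info, FILE *out)
{
	int i;

	for (i = 0; i < info->count; i++)
		if (fwrite(info->notes[i].data, 1, info->notes[i].len, out)
		    != info->notes[i].len)
			return 0;
	return 1;
}

static void free_note_info(struct note_info *info)
{
	int i;

	for (i = 0; i < info->count; i++)
		free(info->notes[i].data);
}

int main(void)
{
	struct note_info info;

	if (!fill_note_info(&info))
		return 1;
	printf("notes segment needs %zu bytes\n", get_note_info_size(&info));
	write_note_info(&info, stdout);
	putchar('\n');
	free_note_info(&info);
	return 0;
}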
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
new file mode 100644
index 000000000000..0adced2f296f
--- /dev/null
+++ b/fs/compat_binfmt_elf.c
@@ -0,0 +1,131 @@
1/*
2 * 32-bit compatibility support for ELF format executables and core dumps.
3 *
4 * Copyright (C) 2007 Red Hat, Inc. All rights reserved.
5 *
6 * This copyrighted material is made available to anyone wishing to use,
7 * modify, copy, or redistribute it subject to the terms and conditions
8 * of the GNU General Public License v.2.
9 *
10 * Red Hat Author: Roland McGrath.
11 *
12 * This file is used in a 64-bit kernel that wants to support 32-bit ELF.
13 * asm/elf.h is responsible for defining the compat_* and COMPAT_* macros
14 * used below, with definitions appropriate for 32-bit ABI compatibility.
15 *
16 * We use macros to rename the ABI types and machine-dependent
17 * functions used in binfmt_elf.c to compat versions.
18 */
19
20#include <linux/elfcore-compat.h>
21#include <linux/time.h>
22
23/*
24 * Rename the basic ELF layout types to refer to the 32-bit class of files.
25 */
26#undef ELF_CLASS
27#define ELF_CLASS ELFCLASS32
28
29#undef elfhdr
30#undef elf_phdr
31#undef elf_note
32#undef elf_addr_t
33#define elfhdr elf32_hdr
34#define elf_phdr elf32_phdr
35#define elf_note elf32_note
36#define elf_addr_t Elf32_Addr
37
38/*
39 * The machine-dependent core note format types are defined in elfcore-compat.h,
40 * which requires asm/elf.h to define compat_elf_gregset_t et al.
41 */
42#define elf_prstatus compat_elf_prstatus
43#define elf_prpsinfo compat_elf_prpsinfo
44
45/*
46 * Compat version of cputime_to_compat_timeval, perhaps this
47 * should be an inline in <linux/compat.h>.
48 */
49static void cputime_to_compat_timeval(const cputime_t cputime,
50 struct compat_timeval *value)
51{
52 struct timeval tv;
53 cputime_to_timeval(cputime, &tv);
54 value->tv_sec = tv.tv_sec;
55 value->tv_usec = tv.tv_usec;
56}
57
58#undef cputime_to_timeval
59#define cputime_to_timeval cputime_to_compat_timeval
60
61
62/*
63 * To use this file, asm/elf.h must define compat_elf_check_arch.
64 * The other following macros can be defined if the compat versions
65 * differ from the native ones, or omitted when they match.
66 */
67
68#undef ELF_ARCH
69#undef elf_check_arch
70#define elf_check_arch compat_elf_check_arch
71
72#ifdef COMPAT_ELF_PLATFORM
73#undef ELF_PLATFORM
74#define ELF_PLATFORM COMPAT_ELF_PLATFORM
75#endif
76
77#ifdef COMPAT_ELF_HWCAP
78#undef ELF_HWCAP
79#define ELF_HWCAP COMPAT_ELF_HWCAP
80#endif
81
82#ifdef COMPAT_ARCH_DLINFO
83#undef ARCH_DLINFO
84#define ARCH_DLINFO COMPAT_ARCH_DLINFO
85#endif
86
87#ifdef COMPAT_ELF_ET_DYN_BASE
88#undef ELF_ET_DYN_BASE
89#define ELF_ET_DYN_BASE COMPAT_ELF_ET_DYN_BASE
90#endif
91
92#ifdef COMPAT_ELF_EXEC_PAGESIZE
93#undef ELF_EXEC_PAGESIZE
94#define ELF_EXEC_PAGESIZE COMPAT_ELF_EXEC_PAGESIZE
95#endif
96
97#ifdef COMPAT_ELF_PLAT_INIT
98#undef ELF_PLAT_INIT
99#define ELF_PLAT_INIT COMPAT_ELF_PLAT_INIT
100#endif
101
102#ifdef COMPAT_SET_PERSONALITY
103#undef SET_PERSONALITY
104#define SET_PERSONALITY COMPAT_SET_PERSONALITY
105#endif
106
107#ifdef compat_start_thread
108#undef start_thread
109#define start_thread compat_start_thread
110#endif
111
112#ifdef compat_arch_setup_additional_pages
113#undef ARCH_HAS_SETUP_ADDITIONAL_PAGES
114#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
115#undef arch_setup_additional_pages
116#define arch_setup_additional_pages compat_arch_setup_additional_pages
117#endif
118
119/*
120 * Rename a few of the symbols that binfmt_elf.c will define.
121 * These are all local so the names don't really matter, but it
122 * might make some debugging less confusing not to duplicate them.
123 */
124#define elf_format compat_elf_format
125#define init_elf_binfmt init_compat_elf_binfmt
126#define exit_elf_binfmt exit_compat_elf_binfmt
127
128/*
129 * We share all the actual code with the native (64-bit) version.
130 */
131#include "binfmt_elf.c"
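The file above compiles binfmt_elf.c a second time with its ELF types and entry points renamed by the preprocessor, so one body of code produces both the native and the 32-bit compat loader. A hedged userspace sketch of that trick follows; all names are hypothetical, and where the kernel spreads this over two translation units the sketch folds both flavours into one program for brevity.

/* shared_impl.c -- meant to be #included, not compiled on its own.
 * It only uses IMPL_ADDR_T and IMPL_NAME, which the includer defines,
 * much as binfmt_elf.c relies on the macros set up before inclusion. */
int IMPL_NAME(void)
{
	printf("address type is %zu bytes in this flavour\n",
	       sizeof(IMPL_ADDR_T));
	return 0;
}

/* main.c -- builds the shared body twice, once per ABI flavour. */
#include <stdio.h>

#define IMPL_ADDR_T unsigned long	/* "native" flavour */
#define IMPL_NAME   report_size_native
#include "shared_impl.c"
#undef IMPL_ADDR_T
#undef IMPL_NAME

#define IMPL_ADDR_T unsigned int	/* "compat" 32-bit flavour */
#define IMPL_NAME   report_size_compat
#include "shared_impl.c"

int main(void)
{
	report_size_native();
	report_size_compat();
	return 0;
}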
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 46754553fdcc..ff97ba924333 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -49,7 +49,7 @@ static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
49 spin_unlock(&ls->ls_recover_list_lock); 49 spin_unlock(&ls->ls_recover_list_lock);
50 50
51 if (!found) 51 if (!found)
52 de = allocate_direntry(ls, len); 52 de = kzalloc(sizeof(struct dlm_direntry) + len, GFP_KERNEL);
53 return de; 53 return de;
54} 54}
55 55
@@ -62,7 +62,7 @@ void dlm_clear_free_entries(struct dlm_ls *ls)
62 de = list_entry(ls->ls_recover_list.next, struct dlm_direntry, 62 de = list_entry(ls->ls_recover_list.next, struct dlm_direntry,
63 list); 63 list);
64 list_del(&de->list); 64 list_del(&de->list);
65 free_direntry(de); 65 kfree(de);
66 } 66 }
67 spin_unlock(&ls->ls_recover_list_lock); 67 spin_unlock(&ls->ls_recover_list_lock);
68} 68}
@@ -171,7 +171,7 @@ void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen
171 } 171 }
172 172
173 list_del(&de->list); 173 list_del(&de->list);
174 free_direntry(de); 174 kfree(de);
175 out: 175 out:
176 write_unlock(&ls->ls_dirtbl[bucket].lock); 176 write_unlock(&ls->ls_dirtbl[bucket].lock);
177} 177}
@@ -302,7 +302,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
302 302
303 write_unlock(&ls->ls_dirtbl[bucket].lock); 303 write_unlock(&ls->ls_dirtbl[bucket].lock);
304 304
305 de = allocate_direntry(ls, namelen); 305 de = kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_KERNEL);
306 if (!de) 306 if (!de)
307 return -ENOMEM; 307 return -ENOMEM;
308 308
@@ -313,7 +313,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
313 write_lock(&ls->ls_dirtbl[bucket].lock); 313 write_lock(&ls->ls_dirtbl[bucket].lock);
314 tmp = search_bucket(ls, name, namelen, bucket); 314 tmp = search_bucket(ls, name, namelen, bucket);
315 if (tmp) { 315 if (tmp) {
316 free_direntry(de); 316 kfree(de);
317 de = tmp; 317 de = tmp;
318 } else { 318 } else {
319 list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list); 319 list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
@@ -329,49 +329,47 @@ int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
329 return get_entry(ls, nodeid, name, namelen, r_nodeid); 329 return get_entry(ls, nodeid, name, namelen, r_nodeid);
330} 330}
331 331
332/* Copy the names of master rsb's into the buffer provided. 332static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, char *name, int len)
333 Only select names whose dir node is the given nodeid. */ 333{
334 struct dlm_rsb *r;
335
336 down_read(&ls->ls_root_sem);
337 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
338 if (len == r->res_length && !memcmp(name, r->res_name, len)) {
339 up_read(&ls->ls_root_sem);
340 return r;
341 }
342 }
343 up_read(&ls->ls_root_sem);
344 return NULL;
345}
346
347/* Find the rsb where we left off (or start again), then send rsb names
348 for rsb's we're master of and whose directory node matches the requesting
349 node. inbuf is the rsb name last sent, inlen is the name's length */
334 350
335void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, 351void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
336 char *outbuf, int outlen, int nodeid) 352 char *outbuf, int outlen, int nodeid)
337{ 353{
338 struct list_head *list; 354 struct list_head *list;
339 struct dlm_rsb *start_r = NULL, *r = NULL; 355 struct dlm_rsb *r;
340 int offset = 0, start_namelen, error, dir_nodeid; 356 int offset = 0, dir_nodeid;
341 char *start_name;
342 uint16_t be_namelen; 357 uint16_t be_namelen;
343 358
344 /*
345 * Find the rsb where we left off (or start again)
346 */
347
348 start_namelen = inlen;
349 start_name = inbuf;
350
351 if (start_namelen > 1) {
352 /*
353 * We could also use a find_rsb_root() function here that
354 * searched the ls_root_list.
355 */
356 error = dlm_find_rsb(ls, start_name, start_namelen, R_MASTER,
357 &start_r);
358 DLM_ASSERT(!error && start_r,
359 printk("error %d\n", error););
360 DLM_ASSERT(!list_empty(&start_r->res_root_list),
361 dlm_print_rsb(start_r););
362 dlm_put_rsb(start_r);
363 }
364
365 /*
366 * Send rsb names for rsb's we're master of and whose directory node
367 * matches the requesting node.
368 */
369
370 down_read(&ls->ls_root_sem); 359 down_read(&ls->ls_root_sem);
371 if (start_r) 360
372 list = start_r->res_root_list.next; 361 if (inlen > 1) {
373 else 362 r = find_rsb_root(ls, inbuf, inlen);
363 if (!r) {
364 inbuf[inlen - 1] = '\0';
365 log_error(ls, "copy_master_names from %d start %d %s",
366 nodeid, inlen, inbuf);
367 goto out;
368 }
369 list = r->res_root_list.next;
370 } else {
374 list = ls->ls_root_list.next; 371 list = ls->ls_root_list.next;
372 }
375 373
376 for (offset = 0; list != &ls->ls_root_list; list = list->next) { 374 for (offset = 0; list != &ls->ls_root_list; list = list->next) {
377 r = list_entry(list, struct dlm_rsb, res_root_list); 375 r = list_entry(list, struct dlm_rsb, res_root_list);
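In the dir.c hunks above, the allocate_direntry()/free_direntry() wrappers give way to a plain kzalloc(sizeof(struct dlm_direntry) + len, GFP_KERNEL) and kfree(), i.e. the name is stored in the same allocation as its header. A small userspace analogue of that layout (names here are illustrative only):

/* One zeroed allocation holds the header and the trailing name. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct direntry {
	size_t length;
	char name[];		/* name bytes follow the header directly */
};

static struct direntry *direntry_new(const char *name, size_t len)
{
	struct direntry *de = calloc(1, sizeof(*de) + len);

	if (!de)
		return NULL;
	de->length = len;
	memcpy(de->name, name, len);
	return de;
}

int main(void)
{
	struct direntry *de = direntry_new("resource-17", strlen("resource-17"));

	if (!de)
		return 1;
	printf("%.*s (%zu name bytes)\n", (int)de->length, de->name, de->length);
	free(de);		/* a single free() releases header and name */
	return 0;
}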
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index d2fc2384c3be..ec61bbaf25df 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -570,5 +570,21 @@ static inline int dlm_no_directory(struct dlm_ls *ls)
570 return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0; 570 return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0;
571} 571}
572 572
573int dlm_netlink_init(void);
574void dlm_netlink_exit(void);
575void dlm_timeout_warn(struct dlm_lkb *lkb);
576
577#ifdef CONFIG_DLM_DEBUG
578int dlm_register_debugfs(void);
579void dlm_unregister_debugfs(void);
580int dlm_create_debug_file(struct dlm_ls *ls);
581void dlm_delete_debug_file(struct dlm_ls *ls);
582#else
583static inline int dlm_register_debugfs(void) { return 0; }
584static inline void dlm_unregister_debugfs(void) { }
585static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; }
586static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
587#endif
588
573#endif /* __DLM_INTERNAL_DOT_H__ */ 589#endif /* __DLM_INTERNAL_DOT_H__ */
574 590
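The dlm_internal.h hunk above centralizes the CONFIG_DLM_DEBUG prototypes and their no-op fallbacks that were previously open-coded in lockspace.c and main.c (those copies are removed further down). The pattern, as a hedged single-file sketch with made-up names: real prototypes when the feature is built in, static inline stubs that compile away otherwise.

/* Builds and runs as-is; with ENABLE_DEBUGFS defined, the real
 * definitions would come from a separately compiled debug file. */
#include <stdio.h>

#ifdef ENABLE_DEBUGFS
int debug_register(void);
void debug_unregister(void);
#else
static inline int debug_register(void) { return 0; }
static inline void debug_unregister(void) { }
#endif

int main(void)
{
	if (debug_register())
		fprintf(stderr, "debug support unavailable\n");
	debug_unregister();
	return 0;
}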
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 3915b8e14146..ff4a198fa677 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -88,7 +88,6 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
88static int receive_extralen(struct dlm_message *ms); 88static int receive_extralen(struct dlm_message *ms);
89static void do_purge(struct dlm_ls *ls, int nodeid, int pid); 89static void do_purge(struct dlm_ls *ls, int nodeid, int pid);
90static void del_timeout(struct dlm_lkb *lkb); 90static void del_timeout(struct dlm_lkb *lkb);
91void dlm_timeout_warn(struct dlm_lkb *lkb);
92 91
93/* 92/*
 94 * Lock compatibility matrix - thanks Steve 93 * Lock compatibility matrix - thanks Steve
@@ -335,7 +334,7 @@ static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len)
335{ 334{
336 struct dlm_rsb *r; 335 struct dlm_rsb *r;
337 336
338 r = allocate_rsb(ls, len); 337 r = dlm_allocate_rsb(ls, len);
339 if (!r) 338 if (!r)
340 return NULL; 339 return NULL;
341 340
@@ -478,7 +477,7 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
478 error = _search_rsb(ls, name, namelen, bucket, 0, &tmp); 477 error = _search_rsb(ls, name, namelen, bucket, 0, &tmp);
479 if (!error) { 478 if (!error) {
480 write_unlock(&ls->ls_rsbtbl[bucket].lock); 479 write_unlock(&ls->ls_rsbtbl[bucket].lock);
481 free_rsb(r); 480 dlm_free_rsb(r);
482 r = tmp; 481 r = tmp;
483 goto out; 482 goto out;
484 } 483 }
@@ -490,12 +489,6 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
490 return error; 489 return error;
491} 490}
492 491
493int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
494 unsigned int flags, struct dlm_rsb **r_ret)
495{
496 return find_rsb(ls, name, namelen, flags, r_ret);
497}
498
499/* This is only called to add a reference when the code already holds 492/* This is only called to add a reference when the code already holds
500 a valid reference to the rsb, so there's no need for locking. */ 493 a valid reference to the rsb, so there's no need for locking. */
501 494
@@ -519,7 +512,7 @@ static void toss_rsb(struct kref *kref)
519 list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss); 512 list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss);
520 r->res_toss_time = jiffies; 513 r->res_toss_time = jiffies;
521 if (r->res_lvbptr) { 514 if (r->res_lvbptr) {
522 free_lvb(r->res_lvbptr); 515 dlm_free_lvb(r->res_lvbptr);
523 r->res_lvbptr = NULL; 516 r->res_lvbptr = NULL;
524 } 517 }
525} 518}
@@ -589,7 +582,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
589 uint32_t lkid = 0; 582 uint32_t lkid = 0;
590 uint16_t bucket; 583 uint16_t bucket;
591 584
592 lkb = allocate_lkb(ls); 585 lkb = dlm_allocate_lkb(ls);
593 if (!lkb) 586 if (!lkb)
594 return -ENOMEM; 587 return -ENOMEM;
595 588
@@ -683,8 +676,8 @@ static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
683 676
684 /* for local/process lkbs, lvbptr points to caller's lksb */ 677 /* for local/process lkbs, lvbptr points to caller's lksb */
685 if (lkb->lkb_lvbptr && is_master_copy(lkb)) 678 if (lkb->lkb_lvbptr && is_master_copy(lkb))
686 free_lvb(lkb->lkb_lvbptr); 679 dlm_free_lvb(lkb->lkb_lvbptr);
687 free_lkb(lkb); 680 dlm_free_lkb(lkb);
688 return 1; 681 return 1;
689 } else { 682 } else {
690 write_unlock(&ls->ls_lkbtbl[bucket].lock); 683 write_unlock(&ls->ls_lkbtbl[bucket].lock);
@@ -988,7 +981,7 @@ static int shrink_bucket(struct dlm_ls *ls, int b)
988 981
989 if (is_master(r)) 982 if (is_master(r))
990 dir_remove(r); 983 dir_remove(r);
991 free_rsb(r); 984 dlm_free_rsb(r);
992 count++; 985 count++;
993 } else { 986 } else {
994 write_unlock(&ls->ls_rsbtbl[b].lock); 987 write_unlock(&ls->ls_rsbtbl[b].lock);
@@ -1171,7 +1164,7 @@ static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1171 return; 1164 return;
1172 1165
1173 if (!r->res_lvbptr) 1166 if (!r->res_lvbptr)
1174 r->res_lvbptr = allocate_lvb(r->res_ls); 1167 r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
1175 1168
1176 if (!r->res_lvbptr) 1169 if (!r->res_lvbptr)
1177 return; 1170 return;
@@ -1203,7 +1196,7 @@ static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1203 return; 1196 return;
1204 1197
1205 if (!r->res_lvbptr) 1198 if (!r->res_lvbptr)
1206 r->res_lvbptr = allocate_lvb(r->res_ls); 1199 r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
1207 1200
1208 if (!r->res_lvbptr) 1201 if (!r->res_lvbptr)
1209 return; 1202 return;
@@ -1852,7 +1845,7 @@ static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb)
1852static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb) 1845static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
1853{ 1846{
1854 struct dlm_ls *ls = r->res_ls; 1847 struct dlm_ls *ls = r->res_ls;
1855 int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid(); 1848 int i, error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
1856 1849
1857 if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) { 1850 if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) {
1858 rsb_clear_flag(r, RSB_MASTER_UNCERTAIN); 1851 rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
@@ -1886,7 +1879,7 @@ static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
1886 return 1; 1879 return 1;
1887 } 1880 }
1888 1881
1889 for (;;) { 1882 for (i = 0; i < 2; i++) {
1890 /* It's possible for dlm_scand to remove an old rsb for 1883 /* It's possible for dlm_scand to remove an old rsb for
1891 this same resource from the toss list, us to create 1884 this same resource from the toss list, us to create
1892 a new one, look up the master locally, and find it 1885 a new one, look up the master locally, and find it
@@ -1900,6 +1893,8 @@ static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
1900 log_debug(ls, "dir_lookup error %d %s", error, r->res_name); 1893 log_debug(ls, "dir_lookup error %d %s", error, r->res_name);
1901 schedule(); 1894 schedule();
1902 } 1895 }
1896 if (error && error != -EEXIST)
1897 return error;
1903 1898
1904 if (ret_nodeid == our_nodeid) { 1899 if (ret_nodeid == our_nodeid) {
1905 r->res_first_lkid = 0; 1900 r->res_first_lkid = 0;
@@ -1941,8 +1936,11 @@ static void confirm_master(struct dlm_rsb *r, int error)
1941 break; 1936 break;
1942 1937
1943 case -EAGAIN: 1938 case -EAGAIN:
1944 /* the remote master didn't queue our NOQUEUE request; 1939 case -EBADR:
1945 make a waiting lkb the first_lkid */ 1940 case -ENOTBLK:
1941 /* the remote request failed and won't be retried (it was
1942 a NOQUEUE, or has been canceled/unlocked); make a waiting
1943 lkb the first_lkid */
1946 1944
1947 r->res_first_lkid = 0; 1945 r->res_first_lkid = 0;
1948 1946
@@ -2108,17 +2106,18 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
2108 /* an lkb may be waiting for an rsb lookup to complete where the 2106 /* an lkb may be waiting for an rsb lookup to complete where the
2109 lookup was initiated by another lock */ 2107 lookup was initiated by another lock */
2110 2108
2111 if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) { 2109 if (!list_empty(&lkb->lkb_rsb_lookup)) {
2112 if (!list_empty(&lkb->lkb_rsb_lookup)) { 2110 if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) {
2113 log_debug(ls, "unlock on rsb_lookup %x", lkb->lkb_id); 2111 log_debug(ls, "unlock on rsb_lookup %x", lkb->lkb_id);
2114 list_del_init(&lkb->lkb_rsb_lookup); 2112 list_del_init(&lkb->lkb_rsb_lookup);
2115 queue_cast(lkb->lkb_resource, lkb, 2113 queue_cast(lkb->lkb_resource, lkb,
2116 args->flags & DLM_LKF_CANCEL ? 2114 args->flags & DLM_LKF_CANCEL ?
2117 -DLM_ECANCEL : -DLM_EUNLOCK); 2115 -DLM_ECANCEL : -DLM_EUNLOCK);
2118 unhold_lkb(lkb); /* undoes create_lkb() */ 2116 unhold_lkb(lkb); /* undoes create_lkb() */
2119 rv = -EBUSY;
2120 goto out;
2121 } 2117 }
2118 /* caller changes -EBUSY to 0 for CANCEL and FORCEUNLOCK */
2119 rv = -EBUSY;
2120 goto out;
2122 } 2121 }
2123 2122
2124 /* cancel not allowed with another cancel/unlock in progress */ 2123 /* cancel not allowed with another cancel/unlock in progress */
@@ -2986,7 +2985,7 @@ static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
2986 2985
2987 if (lkb->lkb_exflags & DLM_LKF_VALBLK) { 2986 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
2988 if (!lkb->lkb_lvbptr) 2987 if (!lkb->lkb_lvbptr)
2989 lkb->lkb_lvbptr = allocate_lvb(ls); 2988 lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
2990 if (!lkb->lkb_lvbptr) 2989 if (!lkb->lkb_lvbptr)
2991 return -ENOMEM; 2990 return -ENOMEM;
2992 len = receive_extralen(ms); 2991 len = receive_extralen(ms);
@@ -3006,11 +3005,9 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3006 lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST); 3005 lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST);
3007 lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP); 3006 lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP);
3008 3007
3009 DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb););
3010
3011 if (lkb->lkb_exflags & DLM_LKF_VALBLK) { 3008 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
3012 /* lkb was just created so there won't be an lvb yet */ 3009 /* lkb was just created so there won't be an lvb yet */
3013 lkb->lkb_lvbptr = allocate_lvb(ls); 3010 lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
3014 if (!lkb->lkb_lvbptr) 3011 if (!lkb->lkb_lvbptr)
3015 return -ENOMEM; 3012 return -ENOMEM;
3016 } 3013 }
@@ -3021,16 +3018,6 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3021static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb, 3018static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3022 struct dlm_message *ms) 3019 struct dlm_message *ms)
3023{ 3020{
3024 if (lkb->lkb_nodeid != ms->m_header.h_nodeid) {
3025 log_error(ls, "convert_args nodeid %d %d lkid %x %x",
3026 lkb->lkb_nodeid, ms->m_header.h_nodeid,
3027 lkb->lkb_id, lkb->lkb_remid);
3028 return -EINVAL;
3029 }
3030
3031 if (!is_master_copy(lkb))
3032 return -EINVAL;
3033
3034 if (lkb->lkb_status != DLM_LKSTS_GRANTED) 3021 if (lkb->lkb_status != DLM_LKSTS_GRANTED)
3035 return -EBUSY; 3022 return -EBUSY;
3036 3023
@@ -3046,8 +3033,6 @@ static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3046static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, 3033static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3047 struct dlm_message *ms) 3034 struct dlm_message *ms)
3048{ 3035{
3049 if (!is_master_copy(lkb))
3050 return -EINVAL;
3051 if (receive_lvb(ls, lkb, ms)) 3036 if (receive_lvb(ls, lkb, ms))
3052 return -ENOMEM; 3037 return -ENOMEM;
3053 return 0; 3038 return 0;
@@ -3063,6 +3048,50 @@ static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
3063 lkb->lkb_remid = ms->m_lkid; 3048 lkb->lkb_remid = ms->m_lkid;
3064} 3049}
3065 3050
3051/* This is called after the rsb is locked so that we can safely inspect
3052 fields in the lkb. */
3053
3054static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms)
3055{
3056 int from = ms->m_header.h_nodeid;
3057 int error = 0;
3058
3059 switch (ms->m_type) {
3060 case DLM_MSG_CONVERT:
3061 case DLM_MSG_UNLOCK:
3062 case DLM_MSG_CANCEL:
3063 if (!is_master_copy(lkb) || lkb->lkb_nodeid != from)
3064 error = -EINVAL;
3065 break;
3066
3067 case DLM_MSG_CONVERT_REPLY:
3068 case DLM_MSG_UNLOCK_REPLY:
3069 case DLM_MSG_CANCEL_REPLY:
3070 case DLM_MSG_GRANT:
3071 case DLM_MSG_BAST:
3072 if (!is_process_copy(lkb) || lkb->lkb_nodeid != from)
3073 error = -EINVAL;
3074 break;
3075
3076 case DLM_MSG_REQUEST_REPLY:
3077 if (!is_process_copy(lkb))
3078 error = -EINVAL;
3079 else if (lkb->lkb_nodeid != -1 && lkb->lkb_nodeid != from)
3080 error = -EINVAL;
3081 break;
3082
3083 default:
3084 error = -EINVAL;
3085 }
3086
3087 if (error)
3088 log_error(lkb->lkb_resource->res_ls,
3089 "ignore invalid message %d from %d %x %x %x %d",
3090 ms->m_type, from, lkb->lkb_id, lkb->lkb_remid,
3091 lkb->lkb_flags, lkb->lkb_nodeid);
3092 return error;
3093}
3094
3066static void receive_request(struct dlm_ls *ls, struct dlm_message *ms) 3095static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
3067{ 3096{
3068 struct dlm_lkb *lkb; 3097 struct dlm_lkb *lkb;
@@ -3124,17 +3153,21 @@ static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
3124 hold_rsb(r); 3153 hold_rsb(r);
3125 lock_rsb(r); 3154 lock_rsb(r);
3126 3155
3156 error = validate_message(lkb, ms);
3157 if (error)
3158 goto out;
3159
3127 receive_flags(lkb, ms); 3160 receive_flags(lkb, ms);
3128 error = receive_convert_args(ls, lkb, ms); 3161 error = receive_convert_args(ls, lkb, ms);
3129 if (error) 3162 if (error)
3130 goto out; 3163 goto out_reply;
3131 reply = !down_conversion(lkb); 3164 reply = !down_conversion(lkb);
3132 3165
3133 error = do_convert(r, lkb); 3166 error = do_convert(r, lkb);
3134 out: 3167 out_reply:
3135 if (reply) 3168 if (reply)
3136 send_convert_reply(r, lkb, error); 3169 send_convert_reply(r, lkb, error);
3137 3170 out:
3138 unlock_rsb(r); 3171 unlock_rsb(r);
3139 put_rsb(r); 3172 put_rsb(r);
3140 dlm_put_lkb(lkb); 3173 dlm_put_lkb(lkb);
@@ -3160,15 +3193,19 @@ static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
3160 hold_rsb(r); 3193 hold_rsb(r);
3161 lock_rsb(r); 3194 lock_rsb(r);
3162 3195
3196 error = validate_message(lkb, ms);
3197 if (error)
3198 goto out;
3199
3163 receive_flags(lkb, ms); 3200 receive_flags(lkb, ms);
3164 error = receive_unlock_args(ls, lkb, ms); 3201 error = receive_unlock_args(ls, lkb, ms);
3165 if (error) 3202 if (error)
3166 goto out; 3203 goto out_reply;
3167 3204
3168 error = do_unlock(r, lkb); 3205 error = do_unlock(r, lkb);
3169 out: 3206 out_reply:
3170 send_unlock_reply(r, lkb, error); 3207 send_unlock_reply(r, lkb, error);
3171 3208 out:
3172 unlock_rsb(r); 3209 unlock_rsb(r);
3173 put_rsb(r); 3210 put_rsb(r);
3174 dlm_put_lkb(lkb); 3211 dlm_put_lkb(lkb);
@@ -3196,9 +3233,13 @@ static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
3196 hold_rsb(r); 3233 hold_rsb(r);
3197 lock_rsb(r); 3234 lock_rsb(r);
3198 3235
3236 error = validate_message(lkb, ms);
3237 if (error)
3238 goto out;
3239
3199 error = do_cancel(r, lkb); 3240 error = do_cancel(r, lkb);
3200 send_cancel_reply(r, lkb, error); 3241 send_cancel_reply(r, lkb, error);
3201 3242 out:
3202 unlock_rsb(r); 3243 unlock_rsb(r);
3203 put_rsb(r); 3244 put_rsb(r);
3204 dlm_put_lkb(lkb); 3245 dlm_put_lkb(lkb);
@@ -3217,22 +3258,26 @@ static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
3217 3258
3218 error = find_lkb(ls, ms->m_remid, &lkb); 3259 error = find_lkb(ls, ms->m_remid, &lkb);
3219 if (error) { 3260 if (error) {
3220 log_error(ls, "receive_grant no lkb"); 3261 log_debug(ls, "receive_grant from %d no lkb %x",
3262 ms->m_header.h_nodeid, ms->m_remid);
3221 return; 3263 return;
3222 } 3264 }
3223 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
3224 3265
3225 r = lkb->lkb_resource; 3266 r = lkb->lkb_resource;
3226 3267
3227 hold_rsb(r); 3268 hold_rsb(r);
3228 lock_rsb(r); 3269 lock_rsb(r);
3229 3270
3271 error = validate_message(lkb, ms);
3272 if (error)
3273 goto out;
3274
3230 receive_flags_reply(lkb, ms); 3275 receive_flags_reply(lkb, ms);
3231 if (is_altmode(lkb)) 3276 if (is_altmode(lkb))
3232 munge_altmode(lkb, ms); 3277 munge_altmode(lkb, ms);
3233 grant_lock_pc(r, lkb, ms); 3278 grant_lock_pc(r, lkb, ms);
3234 queue_cast(r, lkb, 0); 3279 queue_cast(r, lkb, 0);
3235 3280 out:
3236 unlock_rsb(r); 3281 unlock_rsb(r);
3237 put_rsb(r); 3282 put_rsb(r);
3238 dlm_put_lkb(lkb); 3283 dlm_put_lkb(lkb);
@@ -3246,18 +3291,22 @@ static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
3246 3291
3247 error = find_lkb(ls, ms->m_remid, &lkb); 3292 error = find_lkb(ls, ms->m_remid, &lkb);
3248 if (error) { 3293 if (error) {
3249 log_error(ls, "receive_bast no lkb"); 3294 log_debug(ls, "receive_bast from %d no lkb %x",
3295 ms->m_header.h_nodeid, ms->m_remid);
3250 return; 3296 return;
3251 } 3297 }
3252 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
3253 3298
3254 r = lkb->lkb_resource; 3299 r = lkb->lkb_resource;
3255 3300
3256 hold_rsb(r); 3301 hold_rsb(r);
3257 lock_rsb(r); 3302 lock_rsb(r);
3258 3303
3259 queue_bast(r, lkb, ms->m_bastmode); 3304 error = validate_message(lkb, ms);
3305 if (error)
3306 goto out;
3260 3307
3308 queue_bast(r, lkb, ms->m_bastmode);
3309 out:
3261 unlock_rsb(r); 3310 unlock_rsb(r);
3262 put_rsb(r); 3311 put_rsb(r);
3263 dlm_put_lkb(lkb); 3312 dlm_put_lkb(lkb);
@@ -3323,15 +3372,19 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
3323 3372
3324 error = find_lkb(ls, ms->m_remid, &lkb); 3373 error = find_lkb(ls, ms->m_remid, &lkb);
3325 if (error) { 3374 if (error) {
3326 log_error(ls, "receive_request_reply no lkb"); 3375 log_debug(ls, "receive_request_reply from %d no lkb %x",
3376 ms->m_header.h_nodeid, ms->m_remid);
3327 return; 3377 return;
3328 } 3378 }
3329 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
3330 3379
3331 r = lkb->lkb_resource; 3380 r = lkb->lkb_resource;
3332 hold_rsb(r); 3381 hold_rsb(r);
3333 lock_rsb(r); 3382 lock_rsb(r);
3334 3383
3384 error = validate_message(lkb, ms);
3385 if (error)
3386 goto out;
3387
3335 mstype = lkb->lkb_wait_type; 3388 mstype = lkb->lkb_wait_type;
3336 error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY); 3389 error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY);
3337 if (error) 3390 if (error)
@@ -3383,6 +3436,7 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
3383 if (is_overlap(lkb)) { 3436 if (is_overlap(lkb)) {
3384 /* we'll ignore error in cancel/unlock reply */ 3437 /* we'll ignore error in cancel/unlock reply */
3385 queue_cast_overlap(r, lkb); 3438 queue_cast_overlap(r, lkb);
3439 confirm_master(r, result);
3386 unhold_lkb(lkb); /* undoes create_lkb() */ 3440 unhold_lkb(lkb); /* undoes create_lkb() */
3387 } else 3441 } else
3388 _request_lock(r, lkb); 3442 _request_lock(r, lkb);
@@ -3463,6 +3517,10 @@ static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
3463 hold_rsb(r); 3517 hold_rsb(r);
3464 lock_rsb(r); 3518 lock_rsb(r);
3465 3519
3520 error = validate_message(lkb, ms);
3521 if (error)
3522 goto out;
3523
3466 /* stub reply can happen with waiters_mutex held */ 3524 /* stub reply can happen with waiters_mutex held */
3467 error = remove_from_waiters_ms(lkb, ms); 3525 error = remove_from_waiters_ms(lkb, ms);
3468 if (error) 3526 if (error)
@@ -3481,10 +3539,10 @@ static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
3481 3539
3482 error = find_lkb(ls, ms->m_remid, &lkb); 3540 error = find_lkb(ls, ms->m_remid, &lkb);
3483 if (error) { 3541 if (error) {
3484 log_error(ls, "receive_convert_reply no lkb"); 3542 log_debug(ls, "receive_convert_reply from %d no lkb %x",
3543 ms->m_header.h_nodeid, ms->m_remid);
3485 return; 3544 return;
3486 } 3545 }
3487 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
3488 3546
3489 _receive_convert_reply(lkb, ms); 3547 _receive_convert_reply(lkb, ms);
3490 dlm_put_lkb(lkb); 3548 dlm_put_lkb(lkb);
@@ -3498,6 +3556,10 @@ static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
3498 hold_rsb(r); 3556 hold_rsb(r);
3499 lock_rsb(r); 3557 lock_rsb(r);
3500 3558
3559 error = validate_message(lkb, ms);
3560 if (error)
3561 goto out;
3562
3501 /* stub reply can happen with waiters_mutex held */ 3563 /* stub reply can happen with waiters_mutex held */
3502 error = remove_from_waiters_ms(lkb, ms); 3564 error = remove_from_waiters_ms(lkb, ms);
3503 if (error) 3565 if (error)
@@ -3529,10 +3591,10 @@ static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
3529 3591
3530 error = find_lkb(ls, ms->m_remid, &lkb); 3592 error = find_lkb(ls, ms->m_remid, &lkb);
3531 if (error) { 3593 if (error) {
3532 log_error(ls, "receive_unlock_reply no lkb"); 3594 log_debug(ls, "receive_unlock_reply from %d no lkb %x",
3595 ms->m_header.h_nodeid, ms->m_remid);
3533 return; 3596 return;
3534 } 3597 }
3535 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
3536 3598
3537 _receive_unlock_reply(lkb, ms); 3599 _receive_unlock_reply(lkb, ms);
3538 dlm_put_lkb(lkb); 3600 dlm_put_lkb(lkb);
@@ -3546,6 +3608,10 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
3546 hold_rsb(r); 3608 hold_rsb(r);
3547 lock_rsb(r); 3609 lock_rsb(r);
3548 3610
3611 error = validate_message(lkb, ms);
3612 if (error)
3613 goto out;
3614
3549 /* stub reply can happen with waiters_mutex held */ 3615 /* stub reply can happen with waiters_mutex held */
3550 error = remove_from_waiters_ms(lkb, ms); 3616 error = remove_from_waiters_ms(lkb, ms);
3551 if (error) 3617 if (error)
@@ -3577,10 +3643,10 @@ static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
3577 3643
3578 error = find_lkb(ls, ms->m_remid, &lkb); 3644 error = find_lkb(ls, ms->m_remid, &lkb);
3579 if (error) { 3645 if (error) {
3580 log_error(ls, "receive_cancel_reply no lkb"); 3646 log_debug(ls, "receive_cancel_reply from %d no lkb %x",
3647 ms->m_header.h_nodeid, ms->m_remid);
3581 return; 3648 return;
3582 } 3649 }
3583 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
3584 3650
3585 _receive_cancel_reply(lkb, ms); 3651 _receive_cancel_reply(lkb, ms);
3586 dlm_put_lkb(lkb); 3652 dlm_put_lkb(lkb);
@@ -3640,6 +3706,13 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
3640 3706
3641static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms) 3707static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms)
3642{ 3708{
3709 if (!dlm_is_member(ls, ms->m_header.h_nodeid)) {
3710 log_debug(ls, "ignore non-member message %d from %d %x %x %d",
3711 ms->m_type, ms->m_header.h_nodeid, ms->m_lkid,
3712 ms->m_remid, ms->m_result);
3713 return;
3714 }
3715
3643 switch (ms->m_type) { 3716 switch (ms->m_type) {
3644 3717
3645 /* messages sent to a master node */ 3718 /* messages sent to a master node */
@@ -3778,8 +3851,9 @@ void dlm_receive_buffer(struct dlm_header *hd, int nodeid)
3778 3851
3779 ls = dlm_find_lockspace_global(hd->h_lockspace); 3852 ls = dlm_find_lockspace_global(hd->h_lockspace);
3780 if (!ls) { 3853 if (!ls) {
3781 log_print("invalid h_lockspace %x from %d cmd %d type %d", 3854 if (dlm_config.ci_log_debug)
3782 hd->h_lockspace, nodeid, hd->h_cmd, type); 3855 log_print("invalid lockspace %x from %d cmd %d type %d",
3856 hd->h_lockspace, nodeid, hd->h_cmd, type);
3783 3857
3784 if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS) 3858 if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS)
3785 dlm_send_ls_not_ready(nodeid, rc); 3859 dlm_send_ls_not_ready(nodeid, rc);
@@ -3806,6 +3880,7 @@ static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
3806 ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY; 3880 ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY;
3807 ls->ls_stub_ms.m_result = -EINPROGRESS; 3881 ls->ls_stub_ms.m_result = -EINPROGRESS;
3808 ls->ls_stub_ms.m_flags = lkb->lkb_flags; 3882 ls->ls_stub_ms.m_flags = lkb->lkb_flags;
3883 ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
3809 _receive_convert_reply(lkb, &ls->ls_stub_ms); 3884 _receive_convert_reply(lkb, &ls->ls_stub_ms);
3810 3885
3811 /* Same special case as in receive_rcom_lock_args() */ 3886 /* Same special case as in receive_rcom_lock_args() */
@@ -3847,6 +3922,7 @@ static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
3847void dlm_recover_waiters_pre(struct dlm_ls *ls) 3922void dlm_recover_waiters_pre(struct dlm_ls *ls)
3848{ 3923{
3849 struct dlm_lkb *lkb, *safe; 3924 struct dlm_lkb *lkb, *safe;
3925 int wait_type, stub_unlock_result, stub_cancel_result;
3850 3926
3851 mutex_lock(&ls->ls_waiters_mutex); 3927 mutex_lock(&ls->ls_waiters_mutex);
3852 3928
@@ -3865,7 +3941,33 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
3865 if (!waiter_needs_recovery(ls, lkb)) 3941 if (!waiter_needs_recovery(ls, lkb))
3866 continue; 3942 continue;
3867 3943
3868 switch (lkb->lkb_wait_type) { 3944 wait_type = lkb->lkb_wait_type;
3945 stub_unlock_result = -DLM_EUNLOCK;
3946 stub_cancel_result = -DLM_ECANCEL;
3947
3948 /* Main reply may have been received leaving a zero wait_type,
3949 but a reply for the overlapping op may not have been
3950 received. In that case we need to fake the appropriate
3951 reply for the overlap op. */
3952
3953 if (!wait_type) {
3954 if (is_overlap_cancel(lkb)) {
3955 wait_type = DLM_MSG_CANCEL;
3956 if (lkb->lkb_grmode == DLM_LOCK_IV)
3957 stub_cancel_result = 0;
3958 }
3959 if (is_overlap_unlock(lkb)) {
3960 wait_type = DLM_MSG_UNLOCK;
3961 if (lkb->lkb_grmode == DLM_LOCK_IV)
3962 stub_unlock_result = -ENOENT;
3963 }
3964
3965 log_debug(ls, "rwpre overlap %x %x %d %d %d",
3966 lkb->lkb_id, lkb->lkb_flags, wait_type,
3967 stub_cancel_result, stub_unlock_result);
3968 }
3969
3970 switch (wait_type) {
3869 3971
3870 case DLM_MSG_REQUEST: 3972 case DLM_MSG_REQUEST:
3871 lkb->lkb_flags |= DLM_IFL_RESEND; 3973 lkb->lkb_flags |= DLM_IFL_RESEND;
@@ -3878,8 +3980,9 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
3878 case DLM_MSG_UNLOCK: 3980 case DLM_MSG_UNLOCK:
3879 hold_lkb(lkb); 3981 hold_lkb(lkb);
3880 ls->ls_stub_ms.m_type = DLM_MSG_UNLOCK_REPLY; 3982 ls->ls_stub_ms.m_type = DLM_MSG_UNLOCK_REPLY;
3881 ls->ls_stub_ms.m_result = -DLM_EUNLOCK; 3983 ls->ls_stub_ms.m_result = stub_unlock_result;
3882 ls->ls_stub_ms.m_flags = lkb->lkb_flags; 3984 ls->ls_stub_ms.m_flags = lkb->lkb_flags;
3985 ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
3883 _receive_unlock_reply(lkb, &ls->ls_stub_ms); 3986 _receive_unlock_reply(lkb, &ls->ls_stub_ms);
3884 dlm_put_lkb(lkb); 3987 dlm_put_lkb(lkb);
3885 break; 3988 break;
@@ -3887,15 +3990,16 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
3887 case DLM_MSG_CANCEL: 3990 case DLM_MSG_CANCEL:
3888 hold_lkb(lkb); 3991 hold_lkb(lkb);
3889 ls->ls_stub_ms.m_type = DLM_MSG_CANCEL_REPLY; 3992 ls->ls_stub_ms.m_type = DLM_MSG_CANCEL_REPLY;
3890 ls->ls_stub_ms.m_result = -DLM_ECANCEL; 3993 ls->ls_stub_ms.m_result = stub_cancel_result;
3891 ls->ls_stub_ms.m_flags = lkb->lkb_flags; 3994 ls->ls_stub_ms.m_flags = lkb->lkb_flags;
3995 ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
3892 _receive_cancel_reply(lkb, &ls->ls_stub_ms); 3996 _receive_cancel_reply(lkb, &ls->ls_stub_ms);
3893 dlm_put_lkb(lkb); 3997 dlm_put_lkb(lkb);
3894 break; 3998 break;
3895 3999
3896 default: 4000 default:
3897 log_error(ls, "invalid lkb wait_type %d", 4001 log_error(ls, "invalid lkb wait_type %d %d",
3898 lkb->lkb_wait_type); 4002 lkb->lkb_wait_type, wait_type);
3899 } 4003 }
3900 schedule(); 4004 schedule();
3901 } 4005 }
@@ -4184,7 +4288,7 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
4184 lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP); 4288 lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP);
4185 4289
4186 if (lkb->lkb_exflags & DLM_LKF_VALBLK) { 4290 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
4187 lkb->lkb_lvbptr = allocate_lvb(ls); 4291 lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
4188 if (!lkb->lkb_lvbptr) 4292 if (!lkb->lkb_lvbptr)
4189 return -ENOMEM; 4293 return -ENOMEM;
4190 lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) - 4294 lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) -
@@ -4259,7 +4363,7 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
4259 put_rsb(r); 4363 put_rsb(r);
4260 out: 4364 out:
4261 if (error) 4365 if (error)
4262 log_print("recover_master_copy %d %x", error, rl->rl_lkid); 4366 log_debug(ls, "recover_master_copy %d %x", error, rl->rl_lkid);
4263 rl->rl_result = error; 4367 rl->rl_result = error;
4264 return error; 4368 return error;
4265} 4369}
@@ -4342,7 +4446,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
4342 } 4446 }
4343 } 4447 }
4344 4448
4345 /* After ua is attached to lkb it will be freed by free_lkb(). 4449 /* After ua is attached to lkb it will be freed by dlm_free_lkb().
4346 When DLM_IFL_USER is set, the dlm knows that this is a userspace 4450 When DLM_IFL_USER is set, the dlm knows that this is a userspace
4347 lock and that lkb_astparam is the dlm_user_args structure. */ 4451 lock and that lkb_astparam is the dlm_user_args structure. */
4348 4452
@@ -4679,6 +4783,7 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
4679 } 4783 }
4680 4784
4681 list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) { 4785 list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) {
4786 lkb->lkb_ast_type = 0;
4682 list_del(&lkb->lkb_astqueue); 4787 list_del(&lkb->lkb_astqueue);
4683 dlm_put_lkb(lkb); 4788 dlm_put_lkb(lkb);
4684 } 4789 }
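A recurring change in the lock.c hunks above is the new validate_message() call made right after lock_rsb(): each incoming message is checked against the lkb's role (master copy vs. process copy) and its expected sender node before any state is touched, and the stub replies built during recovery now fill in m_header.h_nodeid so they pass the same check. A much-reduced sketch of that validate-before-acting idea (types and rules below are invented for illustration, not the dlm's):

/* Reject a message unless the sender and the lock's role match. */
#include <stdio.h>

enum msg_type { MSG_UNLOCK, MSG_GRANT };

struct lock {
	int owner_node;		/* node we expect these messages from */
	int is_master_copy;	/* nonzero if we hold the master copy */
};

static int validate_message(const struct lock *lk, enum msg_type t, int from)
{
	switch (t) {
	case MSG_UNLOCK:	/* requests are only valid on a master copy */
		if (!lk->is_master_copy || lk->owner_node != from)
			return -1;
		break;
	case MSG_GRANT:		/* grants are only valid on a process copy */
		if (lk->is_master_copy || lk->owner_node != from)
			return -1;
		break;
	}
	return 0;
}

int main(void)
{
	struct lock lk = { .owner_node = 4, .is_master_copy = 1 };

	printf("unlock from 4: %s\n",
	       validate_message(&lk, MSG_UNLOCK, 4) ? "rejected" : "ok");
	printf("unlock from 7: %s\n",
	       validate_message(&lk, MSG_UNLOCK, 7) ? "rejected" : "ok");
	return 0;
}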
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index ada04680a1e5..27b6ed302911 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -19,8 +19,6 @@ void dlm_print_lkb(struct dlm_lkb *lkb);
19void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms); 19void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms);
20void dlm_receive_buffer(struct dlm_header *hd, int nodeid); 20void dlm_receive_buffer(struct dlm_header *hd, int nodeid);
21int dlm_modes_compat(int mode1, int mode2); 21int dlm_modes_compat(int mode1, int mode2);
22int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
23 unsigned int flags, struct dlm_rsb **r_ret);
24void dlm_put_rsb(struct dlm_rsb *r); 22void dlm_put_rsb(struct dlm_rsb *r);
25void dlm_hold_rsb(struct dlm_rsb *r); 23void dlm_hold_rsb(struct dlm_rsb *r);
26int dlm_put_lkb(struct dlm_lkb *lkb); 24int dlm_put_lkb(struct dlm_lkb *lkb);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 5c108c49cb8c..b180fdc51085 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -24,14 +24,6 @@
24#include "recover.h" 24#include "recover.h"
25#include "requestqueue.h" 25#include "requestqueue.h"
26 26
27#ifdef CONFIG_DLM_DEBUG
28int dlm_create_debug_file(struct dlm_ls *ls);
29void dlm_delete_debug_file(struct dlm_ls *ls);
30#else
31static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; }
32static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
33#endif
34
35static int ls_count; 27static int ls_count;
36static struct mutex ls_lock; 28static struct mutex ls_lock;
37static struct list_head lslist; 29static struct list_head lslist;
@@ -684,9 +676,9 @@ static int release_lockspace(struct dlm_ls *ls, int force)
684 dlm_del_ast(lkb); 676 dlm_del_ast(lkb);
685 677
686 if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY) 678 if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY)
687 free_lvb(lkb->lkb_lvbptr); 679 dlm_free_lvb(lkb->lkb_lvbptr);
688 680
689 free_lkb(lkb); 681 dlm_free_lkb(lkb);
690 } 682 }
691 } 683 }
692 dlm_astd_resume(); 684 dlm_astd_resume();
@@ -704,7 +696,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
704 res_hashchain); 696 res_hashchain);
705 697
706 list_del(&rsb->res_hashchain); 698 list_del(&rsb->res_hashchain);
707 free_rsb(rsb); 699 dlm_free_rsb(rsb);
708 } 700 }
709 701
710 head = &ls->ls_rsbtbl[i].toss; 702 head = &ls->ls_rsbtbl[i].toss;
@@ -712,7 +704,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
712 rsb = list_entry(head->next, struct dlm_rsb, 704 rsb = list_entry(head->next, struct dlm_rsb,
713 res_hashchain); 705 res_hashchain);
714 list_del(&rsb->res_hashchain); 706 list_del(&rsb->res_hashchain);
715 free_rsb(rsb); 707 dlm_free_rsb(rsb);
716 } 708 }
717 } 709 }
718 710
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index e9923ca9c2d9..7c1e5e5cccd8 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -864,7 +864,7 @@ static void sctp_init_assoc(struct connection *con)
864static void tcp_connect_to_sock(struct connection *con) 864static void tcp_connect_to_sock(struct connection *con)
865{ 865{
866 int result = -EHOSTUNREACH; 866 int result = -EHOSTUNREACH;
867 struct sockaddr_storage saddr; 867 struct sockaddr_storage saddr, src_addr;
868 int addr_len; 868 int addr_len;
869 struct socket *sock; 869 struct socket *sock;
870 870
@@ -898,6 +898,17 @@ static void tcp_connect_to_sock(struct connection *con)
898 con->connect_action = tcp_connect_to_sock; 898 con->connect_action = tcp_connect_to_sock;
899 add_sock(sock, con); 899 add_sock(sock, con);
900 900
901 /* Bind to our cluster-known address connecting to avoid
902 routing problems */
903 memcpy(&src_addr, dlm_local_addr[0], sizeof(src_addr));
904 make_sockaddr(&src_addr, 0, &addr_len);
905 result = sock->ops->bind(sock, (struct sockaddr *) &src_addr,
906 addr_len);
907 if (result < 0) {
908 log_print("could not bind for connect: %d", result);
909 /* This *may* not indicate a critical error */
910 }
911
901 make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len); 912 make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len);
902 913
903 log_print("connecting to %d", con->nodeid); 914 log_print("connecting to %d", con->nodeid);
@@ -1426,6 +1437,8 @@ void dlm_lowcomms_stop(void)
1426 con = __nodeid2con(i, 0); 1437 con = __nodeid2con(i, 0);
1427 if (con) { 1438 if (con) {
1428 close_connection(con, true); 1439 close_connection(con, true);
1440 if (con->othercon)
1441 kmem_cache_free(con_cache, con->othercon);
1429 kmem_cache_free(con_cache, con); 1442 kmem_cache_free(con_cache, con);
1430 } 1443 }
1431 } 1444 }
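The tcp_connect_to_sock() hunk above binds the outgoing socket to the cluster-known local address (dlm_local_addr[0]) before connecting, so multi-homed nodes present a predictable source address to their peers; the bind failure is logged but treated as possibly non-fatal. A hedged userspace sketch of the same bind-before-connect step (the addresses and port below are examples only):

/* Pin the source address of an outgoing TCP connection. */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in src, dst;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&src, 0, sizeof(src));
	src.sin_family = AF_INET;
	src.sin_port = 0;			/* any local port */
	inet_pton(AF_INET, "192.0.2.10", &src.sin_addr);

	/* bind first, so the peer sees the address we advertise */
	if (bind(fd, (struct sockaddr *)&src, sizeof(src)) < 0)
		perror("bind (may not be fatal)");

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(21064);		/* example port */
	inet_pton(AF_INET, "192.0.2.20", &dst.sin_addr);

	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("connect");

	close(fd);
	return 0;
}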
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
index eca2907f2386..58487fb95a4c 100644
--- a/fs/dlm/main.c
+++ b/fs/dlm/main.c
@@ -18,16 +18,6 @@
18#include "memory.h" 18#include "memory.h"
19#include "config.h" 19#include "config.h"
20 20
21#ifdef CONFIG_DLM_DEBUG
22int dlm_register_debugfs(void);
23void dlm_unregister_debugfs(void);
24#else
25static inline int dlm_register_debugfs(void) { return 0; }
26static inline void dlm_unregister_debugfs(void) { }
27#endif
28int dlm_netlink_init(void);
29void dlm_netlink_exit(void);
30
31static int __init init_dlm(void) 21static int __init init_dlm(void)
32{ 22{
33 int error; 23 int error;
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index e9cdcab306e2..fa17f5a27883 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -70,7 +70,7 @@ static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb)
70 ls->ls_num_nodes--; 70 ls->ls_num_nodes--;
71} 71}
72 72
73static int dlm_is_member(struct dlm_ls *ls, int nodeid) 73int dlm_is_member(struct dlm_ls *ls, int nodeid)
74{ 74{
75 struct dlm_member *memb; 75 struct dlm_member *memb;
76 76
diff --git a/fs/dlm/member.h b/fs/dlm/member.h
index 927c08c19214..7a26fca1e0b5 100644
--- a/fs/dlm/member.h
+++ b/fs/dlm/member.h
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -19,6 +19,7 @@ void dlm_clear_members(struct dlm_ls *ls);
19void dlm_clear_members_gone(struct dlm_ls *ls); 19void dlm_clear_members_gone(struct dlm_ls *ls);
20int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out); 20int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out);
21int dlm_is_removed(struct dlm_ls *ls, int nodeid); 21int dlm_is_removed(struct dlm_ls *ls, int nodeid);
22int dlm_is_member(struct dlm_ls *ls, int nodeid);
22 23
23#endif /* __MEMBER_DOT_H__ */ 24#endif /* __MEMBER_DOT_H__ */
24 25
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index ecf0e5cb2035..f7783867491a 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -35,7 +35,7 @@ void dlm_memory_exit(void)
35 kmem_cache_destroy(lkb_cache); 35 kmem_cache_destroy(lkb_cache);
36} 36}
37 37
38char *allocate_lvb(struct dlm_ls *ls) 38char *dlm_allocate_lvb(struct dlm_ls *ls)
39{ 39{
40 char *p; 40 char *p;
41 41
@@ -43,7 +43,7 @@ char *allocate_lvb(struct dlm_ls *ls)
43 return p; 43 return p;
44} 44}
45 45
46void free_lvb(char *p) 46void dlm_free_lvb(char *p)
47{ 47{
48 kfree(p); 48 kfree(p);
49} 49}
@@ -51,7 +51,7 @@ void free_lvb(char *p)
51/* FIXME: have some minimal space built-in to rsb for the name and 51/* FIXME: have some minimal space built-in to rsb for the name and
52 kmalloc a separate name if needed, like dentries are done */ 52 kmalloc a separate name if needed, like dentries are done */
53 53
54struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen) 54struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen)
55{ 55{
56 struct dlm_rsb *r; 56 struct dlm_rsb *r;
57 57
@@ -61,14 +61,14 @@ struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen)
61 return r; 61 return r;
62} 62}
63 63
64void free_rsb(struct dlm_rsb *r) 64void dlm_free_rsb(struct dlm_rsb *r)
65{ 65{
66 if (r->res_lvbptr) 66 if (r->res_lvbptr)
67 free_lvb(r->res_lvbptr); 67 dlm_free_lvb(r->res_lvbptr);
68 kfree(r); 68 kfree(r);
69} 69}
70 70
71struct dlm_lkb *allocate_lkb(struct dlm_ls *ls) 71struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls)
72{ 72{
73 struct dlm_lkb *lkb; 73 struct dlm_lkb *lkb;
74 74
@@ -76,7 +76,7 @@ struct dlm_lkb *allocate_lkb(struct dlm_ls *ls)
76 return lkb; 76 return lkb;
77} 77}
78 78
79void free_lkb(struct dlm_lkb *lkb) 79void dlm_free_lkb(struct dlm_lkb *lkb)
80{ 80{
81 if (lkb->lkb_flags & DLM_IFL_USER) { 81 if (lkb->lkb_flags & DLM_IFL_USER) {
82 struct dlm_user_args *ua; 82 struct dlm_user_args *ua;
@@ -90,19 +90,3 @@ void free_lkb(struct dlm_lkb *lkb)
90 kmem_cache_free(lkb_cache, lkb); 90 kmem_cache_free(lkb_cache, lkb);
91} 91}
92 92
93struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen)
94{
95 struct dlm_direntry *de;
96
97 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,
98 printk("namelen = %d\n", namelen););
99
100 de = kzalloc(sizeof(*de) + namelen, GFP_KERNEL);
101 return de;
102}
103
104void free_direntry(struct dlm_direntry *de)
105{
106 kfree(de);
107}
108
diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h
index 6ead158ccc5c..485fb29143bd 100644
--- a/fs/dlm/memory.h
+++ b/fs/dlm/memory.h
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -16,14 +16,12 @@
16 16
17int dlm_memory_init(void); 17int dlm_memory_init(void);
18void dlm_memory_exit(void); 18void dlm_memory_exit(void);
19struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen); 19struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen);
20void free_rsb(struct dlm_rsb *r); 20void dlm_free_rsb(struct dlm_rsb *r);
21struct dlm_lkb *allocate_lkb(struct dlm_ls *ls); 21struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls);
22void free_lkb(struct dlm_lkb *l); 22void dlm_free_lkb(struct dlm_lkb *l);
23struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen); 23char *dlm_allocate_lvb(struct dlm_ls *ls);
24void free_direntry(struct dlm_direntry *de); 24void dlm_free_lvb(char *l);
25char *allocate_lvb(struct dlm_ls *ls);
26void free_lvb(char *l);
27 25
28#endif /* __MEMORY_DOT_H__ */ 26#endif /* __MEMORY_DOT_H__ */
29 27
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index f8c69dda16a0..e69926e984db 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -58,8 +58,12 @@ static void copy_from_cb(void *dst, const void *base, unsigned offset,
58int dlm_process_incoming_buffer(int nodeid, const void *base, 58int dlm_process_incoming_buffer(int nodeid, const void *base,
59 unsigned offset, unsigned len, unsigned limit) 59 unsigned offset, unsigned len, unsigned limit)
60{ 60{
61 unsigned char __tmp[DLM_INBUF_LEN]; 61 union {
62 struct dlm_header *msg = (struct dlm_header *) __tmp; 62 unsigned char __buf[DLM_INBUF_LEN];
63 /* this is to force proper alignment on some arches */
64 struct dlm_header dlm;
65 } __tmp;
66 struct dlm_header *msg = &__tmp.dlm;
63 int ret = 0; 67 int ret = 0;
64 int err = 0; 68 int err = 0;
65 uint16_t msglen; 69 uint16_t msglen;
@@ -100,8 +104,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base,
100 in the buffer on the stack (which should work for most 104 in the buffer on the stack (which should work for most
101 ordinary messages). */ 105 ordinary messages). */
102 106
103 if (msglen > sizeof(__tmp) && 107 if (msglen > DLM_INBUF_LEN && msg == &__tmp.dlm) {
104 msg == (struct dlm_header *) __tmp) {
105 msg = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL); 108 msg = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL);
106 if (msg == NULL) 109 if (msg == NULL)
107 return ret; 110 return ret;
@@ -119,7 +122,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base,
119 dlm_receive_buffer(msg, nodeid); 122 dlm_receive_buffer(msg, nodeid);
120 } 123 }
121 124
122 if (msg != (struct dlm_header *) __tmp) 125 if (msg != &__tmp.dlm)
123 kfree(msg); 126 kfree(msg);
124 127
125 return err ? err : ret; 128 return err ? err : ret;
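
The midcomms change is a portable C pattern: a union gives the on-stack receive buffer the alignment of the header type, and any message larger than the stack buffer falls back to a heap allocation. A stand-alone sketch of the same pattern, with hypothetical hdr/INBUF_LEN names and a trivial stand-in for dispatch:

#include <stdlib.h>
#include <string.h>

struct hdr {
	unsigned int h_length;
	unsigned int h_type;
};

#define INBUF_LEN 64

static int process_message(const void *data, size_t msglen)
{
	union {
		unsigned char buf[INBUF_LEN];
		struct hdr h;			/* forces proper alignment */
	} tmp;
	struct hdr *msg = &tmp.h;
	int ret;

	if (msglen < sizeof(struct hdr))
		return -1;			/* runt message */

	if (msglen > sizeof(tmp.buf)) {		/* too big for the stack copy */
		msg = malloc(msglen);
		if (!msg)
			return -1;
	}

	memcpy(msg, data, msglen);
	ret = (int)msg->h_type;			/* stand-in for real dispatch */

	if (msg != &tmp.h)
		free(msg);
	return ret;
}
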
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index ae2fd97fa4ad..026824cd3acb 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -197,11 +197,6 @@ static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
197 spin_unlock(&ls->ls_rcom_spin); 197 spin_unlock(&ls->ls_rcom_spin);
198} 198}
199 199
200static void receive_rcom_status_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
201{
202 receive_sync_reply(ls, rc_in);
203}
204
205int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) 200int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
206{ 201{
207 struct dlm_rcom *rc; 202 struct dlm_rcom *rc;
@@ -254,11 +249,6 @@ static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
254 send_rcom(ls, mh, rc); 249 send_rcom(ls, mh, rc);
255} 250}
256 251
257static void receive_rcom_names_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
258{
259 receive_sync_reply(ls, rc_in);
260}
261
262int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid) 252int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
263{ 253{
264 struct dlm_rcom *rc; 254 struct dlm_rcom *rc;
@@ -381,11 +371,6 @@ static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in)
381 send_rcom(ls, mh, rc); 371 send_rcom(ls, mh, rc);
382} 372}
383 373
384static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
385{
386 dlm_recover_process_copy(ls, rc_in);
387}
388
389/* If the lockspace doesn't exist then still send a status message 374/* If the lockspace doesn't exist then still send a status message
390 back; it's possible that it just doesn't have its global_id yet. */ 375 back; it's possible that it just doesn't have its global_id yet. */
391 376
@@ -481,11 +466,11 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
481 break; 466 break;
482 467
483 case DLM_RCOM_STATUS_REPLY: 468 case DLM_RCOM_STATUS_REPLY:
484 receive_rcom_status_reply(ls, rc); 469 receive_sync_reply(ls, rc);
485 break; 470 break;
486 471
487 case DLM_RCOM_NAMES_REPLY: 472 case DLM_RCOM_NAMES_REPLY:
488 receive_rcom_names_reply(ls, rc); 473 receive_sync_reply(ls, rc);
489 break; 474 break;
490 475
491 case DLM_RCOM_LOOKUP_REPLY: 476 case DLM_RCOM_LOOKUP_REPLY:
@@ -493,11 +478,11 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
493 break; 478 break;
494 479
495 case DLM_RCOM_LOCK_REPLY: 480 case DLM_RCOM_LOCK_REPLY:
496 receive_rcom_lock_reply(ls, rc); 481 dlm_recover_process_copy(ls, rc);
497 break; 482 break;
498 483
499 default: 484 default:
500 DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type);); 485 log_error(ls, "receive_rcom bad type %d", rc->rc_type);
501 } 486 }
502 out: 487 out:
503 return; 488 return;
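
With the trivial reply wrappers removed, dlm_receive_rcom() calls receive_sync_reply() and dlm_recover_process_copy() directly, and an unknown rc_type is now logged rather than tripping DLM_ASSERT(). A condensed sketch of the resulting dispatch, abbreviated to the cases touched by this patch:

static void example_receive_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
{
	switch (rc->rc_type) {
	case DLM_RCOM_STATUS_REPLY:
	case DLM_RCOM_NAMES_REPLY:
		receive_sync_reply(ls, rc);	/* former one-line wrappers */
		break;
	case DLM_RCOM_LOCK_REPLY:
		dlm_recover_process_copy(ls, rc);
		break;
	default:
		/* a malformed type from a remote node is logged, not asserted */
		log_error(ls, "receive_rcom bad type %d", rc->rc_type);
	}
}
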
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index c2cc7694cd16..df075dc300fa 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -629,7 +629,7 @@ static void recover_lvb(struct dlm_rsb *r)
629 goto out; 629 goto out;
630 630
631 if (!r->res_lvbptr) { 631 if (!r->res_lvbptr) {
632 r->res_lvbptr = allocate_lvb(r->res_ls); 632 r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
633 if (!r->res_lvbptr) 633 if (!r->res_lvbptr)
634 goto out; 634 goto out;
635 } 635 }
@@ -731,6 +731,20 @@ int dlm_create_root_list(struct dlm_ls *ls)
731 list_add(&r->res_root_list, &ls->ls_root_list); 731 list_add(&r->res_root_list, &ls->ls_root_list);
732 dlm_hold_rsb(r); 732 dlm_hold_rsb(r);
733 } 733 }
734
735 /* If we're using a directory, add tossed rsbs to the root
736 list; they'll have entries created in the new directory,
737 but no other recovery steps should do anything with them. */
738
739 if (dlm_no_directory(ls)) {
740 read_unlock(&ls->ls_rsbtbl[i].lock);
741 continue;
742 }
743
744 list_for_each_entry(r, &ls->ls_rsbtbl[i].toss, res_hashchain) {
745 list_add(&r->res_root_list, &ls->ls_root_list);
746 dlm_hold_rsb(r);
747 }
734 read_unlock(&ls->ls_rsbtbl[i].lock); 748 read_unlock(&ls->ls_rsbtbl[i].lock);
735 } 749 }
736 out: 750 out:
@@ -750,6 +764,11 @@ void dlm_release_root_list(struct dlm_ls *ls)
750 up_write(&ls->ls_root_sem); 764 up_write(&ls->ls_root_sem);
751} 765}
752 766
767/* If not using a directory, clear the entire toss list, there's no benefit to
768 caching the master value since it's fixed. If we are using a dir, keep the
769 rsb's we're the master of. Recovery will add them to the root list and from
770 there they'll be entered in the rebuilt directory. */
771
753void dlm_clear_toss_list(struct dlm_ls *ls) 772void dlm_clear_toss_list(struct dlm_ls *ls)
754{ 773{
755 struct dlm_rsb *r, *safe; 774 struct dlm_rsb *r, *safe;
@@ -759,8 +778,10 @@ void dlm_clear_toss_list(struct dlm_ls *ls)
759 write_lock(&ls->ls_rsbtbl[i].lock); 778 write_lock(&ls->ls_rsbtbl[i].lock);
760 list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss, 779 list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss,
761 res_hashchain) { 780 res_hashchain) {
762 list_del(&r->res_hashchain); 781 if (dlm_no_directory(ls) || !is_master(r)) {
763 free_rsb(r); 782 list_del(&r->res_hashchain);
783 dlm_free_rsb(r);
784 }
764 } 785 }
765 write_unlock(&ls->ls_rsbtbl[i].lock); 786 write_unlock(&ls->ls_rsbtbl[i].lock);
766 } 787 }
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 4b89e20eebe7..997f9531d594 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -67,17 +67,18 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
67 dlm_astd_resume(); 67 dlm_astd_resume();
68 68
69 /* 69 /*
70 * This list of root rsb's will be the basis of most of the recovery 70 * Free non-master tossed rsb's. Master rsb's are kept on toss
71 * routines. 71 * list and put on root list to be included in resdir recovery.
72 */ 72 */
73 73
74 dlm_create_root_list(ls); 74 dlm_clear_toss_list(ls);
75 75
76 /* 76 /*
77 * Free all the tossed rsb's so we don't have to recover them. 77 * This list of root rsb's will be the basis of most of the recovery
78 * routines.
78 */ 79 */
79 80
80 dlm_clear_toss_list(ls); 81 dlm_create_root_list(ls);
81 82
82 /* 83 /*
83 * Add or remove nodes from the lockspace's ls_nodes list. 84 * Add or remove nodes from the lockspace's ls_nodes list.
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 4f741546f4bb..7cbc6826239b 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -24,8 +24,7 @@
24#include "lvb_table.h" 24#include "lvb_table.h"
25#include "user.h" 25#include "user.h"
26 26
27static const char *name_prefix="dlm"; 27static const char name_prefix[] = "dlm";
28static struct miscdevice ctl_device;
29static const struct file_operations device_fops; 28static const struct file_operations device_fops;
30 29
31#ifdef CONFIG_COMPAT 30#ifdef CONFIG_COMPAT
@@ -82,7 +81,8 @@ struct dlm_lock_result32 {
82}; 81};
83 82
84static void compat_input(struct dlm_write_request *kb, 83static void compat_input(struct dlm_write_request *kb,
85 struct dlm_write_request32 *kb32) 84 struct dlm_write_request32 *kb32,
85 int max_namelen)
86{ 86{
87 kb->version[0] = kb32->version[0]; 87 kb->version[0] = kb32->version[0];
88 kb->version[1] = kb32->version[1]; 88 kb->version[1] = kb32->version[1];
@@ -112,7 +112,11 @@ static void compat_input(struct dlm_write_request *kb,
112 kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr; 112 kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr;
113 kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb; 113 kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb;
114 memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN); 114 memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN);
115 memcpy(kb->i.lock.name, kb32->i.lock.name, kb->i.lock.namelen); 115 if (kb->i.lock.namelen <= max_namelen)
116 memcpy(kb->i.lock.name, kb32->i.lock.name,
117 kb->i.lock.namelen);
118 else
119 kb->i.lock.namelen = max_namelen;
116 } 120 }
117} 121}
118 122
@@ -236,12 +240,12 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
236 spin_unlock(&proc->asts_spin); 240 spin_unlock(&proc->asts_spin);
237 241
238 if (eol) { 242 if (eol) {
239 spin_lock(&ua->proc->locks_spin); 243 spin_lock(&proc->locks_spin);
240 if (!list_empty(&lkb->lkb_ownqueue)) { 244 if (!list_empty(&lkb->lkb_ownqueue)) {
241 list_del_init(&lkb->lkb_ownqueue); 245 list_del_init(&lkb->lkb_ownqueue);
242 dlm_put_lkb(lkb); 246 dlm_put_lkb(lkb);
243 } 247 }
244 spin_unlock(&ua->proc->locks_spin); 248 spin_unlock(&proc->locks_spin);
245 } 249 }
246 out: 250 out:
247 mutex_unlock(&ls->ls_clear_proc_locks); 251 mutex_unlock(&ls->ls_clear_proc_locks);
@@ -529,7 +533,8 @@ static ssize_t device_write(struct file *file, const char __user *buf,
529 533
530 if (proc) 534 if (proc)
531 set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags); 535 set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags);
532 compat_input(kbuf, k32buf); 536 compat_input(kbuf, k32buf,
537 count - sizeof(struct dlm_write_request32));
533 kfree(k32buf); 538 kfree(k32buf);
534 } 539 }
535#endif 540#endif
@@ -896,14 +901,16 @@ static const struct file_operations ctl_device_fops = {
896 .owner = THIS_MODULE, 901 .owner = THIS_MODULE,
897}; 902};
898 903
904static struct miscdevice ctl_device = {
905 .name = "dlm-control",
906 .fops = &ctl_device_fops,
907 .minor = MISC_DYNAMIC_MINOR,
908};
909
899int dlm_user_init(void) 910int dlm_user_init(void)
900{ 911{
901 int error; 912 int error;
902 913
903 ctl_device.name = "dlm-control";
904 ctl_device.fops = &ctl_device_fops;
905 ctl_device.minor = MISC_DYNAMIC_MINOR;
906
907 error = misc_register(&ctl_device); 914 error = misc_register(&ctl_device);
908 if (error) 915 if (error)
909 log_print("misc_register failed for control device"); 916 log_print("misc_register failed for control device");
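
The compat_input() change is a bounds check: the lock name is copied only if the 32-bit request's namelen fits within what the caller actually wrote (count minus the fixed request header), otherwise only the recorded length is clamped. A stand-alone sketch of the same check; the struct and function names are hypothetical, and it assumes the destination has at least max_namelen bytes of name space, as the kernel caller arranges:

#include <string.h>

struct request {
	unsigned int namelen;
	char name[64];
};

static void copy_request_name(struct request *dst, const char *src,
			      unsigned int namelen, unsigned int max_namelen)
{
	if (namelen <= max_namelen) {
		memcpy(dst->name, src, namelen);
	} else {
		/* never copy past the data the caller supplied; the clamped
		   length lets later validation reject the request */
		namelen = max_namelen;
	}
	dst->namelen = namelen;
}
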
diff --git a/fs/dlm/util.c b/fs/dlm/util.c
index 963889cf6740..4d9c1f4e1bd1 100644
--- a/fs/dlm/util.c
+++ b/fs/dlm/util.c
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -14,6 +14,14 @@
14#include "rcom.h" 14#include "rcom.h"
15#include "util.h" 15#include "util.h"
16 16
17#define DLM_ERRNO_EDEADLK 35
18#define DLM_ERRNO_EBADR 53
19#define DLM_ERRNO_EBADSLT 57
20#define DLM_ERRNO_EPROTO 71
21#define DLM_ERRNO_EOPNOTSUPP 95
22#define DLM_ERRNO_ETIMEDOUT 110
23#define DLM_ERRNO_EINPROGRESS 115
24
17static void header_out(struct dlm_header *hd) 25static void header_out(struct dlm_header *hd)
18{ 26{
19 hd->h_version = cpu_to_le32(hd->h_version); 27 hd->h_version = cpu_to_le32(hd->h_version);
@@ -30,11 +38,54 @@ static void header_in(struct dlm_header *hd)
30 hd->h_length = le16_to_cpu(hd->h_length); 38 hd->h_length = le16_to_cpu(hd->h_length);
31} 39}
32 40
33void dlm_message_out(struct dlm_message *ms) 41/* higher errno values are inconsistent across architectures, so select
42 one set of values for on the wire */
43
44static int to_dlm_errno(int err)
45{
46 switch (err) {
47 case -EDEADLK:
48 return -DLM_ERRNO_EDEADLK;
49 case -EBADR:
50 return -DLM_ERRNO_EBADR;
51 case -EBADSLT:
52 return -DLM_ERRNO_EBADSLT;
53 case -EPROTO:
54 return -DLM_ERRNO_EPROTO;
55 case -EOPNOTSUPP:
56 return -DLM_ERRNO_EOPNOTSUPP;
57 case -ETIMEDOUT:
58 return -DLM_ERRNO_ETIMEDOUT;
59 case -EINPROGRESS:
60 return -DLM_ERRNO_EINPROGRESS;
61 }
62 return err;
63}
64
65static int from_dlm_errno(int err)
34{ 66{
35 struct dlm_header *hd = (struct dlm_header *) ms; 67 switch (err) {
68 case -DLM_ERRNO_EDEADLK:
69 return -EDEADLK;
70 case -DLM_ERRNO_EBADR:
71 return -EBADR;
72 case -DLM_ERRNO_EBADSLT:
73 return -EBADSLT;
74 case -DLM_ERRNO_EPROTO:
75 return -EPROTO;
76 case -DLM_ERRNO_EOPNOTSUPP:
77 return -EOPNOTSUPP;
78 case -DLM_ERRNO_ETIMEDOUT:
79 return -ETIMEDOUT;
80 case -DLM_ERRNO_EINPROGRESS:
81 return -EINPROGRESS;
82 }
83 return err;
84}
36 85
37 header_out(hd); 86void dlm_message_out(struct dlm_message *ms)
87{
88 header_out(&ms->m_header);
38 89
39 ms->m_type = cpu_to_le32(ms->m_type); 90 ms->m_type = cpu_to_le32(ms->m_type);
40 ms->m_nodeid = cpu_to_le32(ms->m_nodeid); 91 ms->m_nodeid = cpu_to_le32(ms->m_nodeid);
@@ -53,14 +104,12 @@ void dlm_message_out(struct dlm_message *ms)
53 ms->m_rqmode = cpu_to_le32(ms->m_rqmode); 104 ms->m_rqmode = cpu_to_le32(ms->m_rqmode);
54 ms->m_bastmode = cpu_to_le32(ms->m_bastmode); 105 ms->m_bastmode = cpu_to_le32(ms->m_bastmode);
55 ms->m_asts = cpu_to_le32(ms->m_asts); 106 ms->m_asts = cpu_to_le32(ms->m_asts);
56 ms->m_result = cpu_to_le32(ms->m_result); 107 ms->m_result = cpu_to_le32(to_dlm_errno(ms->m_result));
57} 108}
58 109
59void dlm_message_in(struct dlm_message *ms) 110void dlm_message_in(struct dlm_message *ms)
60{ 111{
61 struct dlm_header *hd = (struct dlm_header *) ms; 112 header_in(&ms->m_header);
62
63 header_in(hd);
64 113
65 ms->m_type = le32_to_cpu(ms->m_type); 114 ms->m_type = le32_to_cpu(ms->m_type);
66 ms->m_nodeid = le32_to_cpu(ms->m_nodeid); 115 ms->m_nodeid = le32_to_cpu(ms->m_nodeid);
@@ -79,7 +128,7 @@ void dlm_message_in(struct dlm_message *ms)
79 ms->m_rqmode = le32_to_cpu(ms->m_rqmode); 128 ms->m_rqmode = le32_to_cpu(ms->m_rqmode);
80 ms->m_bastmode = le32_to_cpu(ms->m_bastmode); 129 ms->m_bastmode = le32_to_cpu(ms->m_bastmode);
81 ms->m_asts = le32_to_cpu(ms->m_asts); 130 ms->m_asts = le32_to_cpu(ms->m_asts);
82 ms->m_result = le32_to_cpu(ms->m_result); 131 ms->m_result = from_dlm_errno(le32_to_cpu(ms->m_result));
83} 132}
84 133
85static void rcom_lock_out(struct rcom_lock *rl) 134static void rcom_lock_out(struct rcom_lock *rl)
@@ -126,10 +175,9 @@ static void rcom_config_in(struct rcom_config *rf)
126 175
127void dlm_rcom_out(struct dlm_rcom *rc) 176void dlm_rcom_out(struct dlm_rcom *rc)
128{ 177{
129 struct dlm_header *hd = (struct dlm_header *) rc;
130 int type = rc->rc_type; 178 int type = rc->rc_type;
131 179
132 header_out(hd); 180 header_out(&rc->rc_header);
133 181
134 rc->rc_type = cpu_to_le32(rc->rc_type); 182 rc->rc_type = cpu_to_le32(rc->rc_type);
135 rc->rc_result = cpu_to_le32(rc->rc_result); 183 rc->rc_result = cpu_to_le32(rc->rc_result);
@@ -137,7 +185,7 @@ void dlm_rcom_out(struct dlm_rcom *rc)
137 rc->rc_seq = cpu_to_le64(rc->rc_seq); 185 rc->rc_seq = cpu_to_le64(rc->rc_seq);
138 rc->rc_seq_reply = cpu_to_le64(rc->rc_seq_reply); 186 rc->rc_seq_reply = cpu_to_le64(rc->rc_seq_reply);
139 187
140 if (type == DLM_RCOM_LOCK) 188 if ((type == DLM_RCOM_LOCK) || (type == DLM_RCOM_LOCK_REPLY))
141 rcom_lock_out((struct rcom_lock *) rc->rc_buf); 189 rcom_lock_out((struct rcom_lock *) rc->rc_buf);
142 190
143 else if (type == DLM_RCOM_STATUS_REPLY) 191 else if (type == DLM_RCOM_STATUS_REPLY)
@@ -146,9 +194,9 @@ void dlm_rcom_out(struct dlm_rcom *rc)
146 194
147void dlm_rcom_in(struct dlm_rcom *rc) 195void dlm_rcom_in(struct dlm_rcom *rc)
148{ 196{
149 struct dlm_header *hd = (struct dlm_header *) rc; 197 int type;
150 198
151 header_in(hd); 199 header_in(&rc->rc_header);
152 200
153 rc->rc_type = le32_to_cpu(rc->rc_type); 201 rc->rc_type = le32_to_cpu(rc->rc_type);
154 rc->rc_result = le32_to_cpu(rc->rc_result); 202 rc->rc_result = le32_to_cpu(rc->rc_result);
@@ -156,10 +204,12 @@ void dlm_rcom_in(struct dlm_rcom *rc)
156 rc->rc_seq = le64_to_cpu(rc->rc_seq); 204 rc->rc_seq = le64_to_cpu(rc->rc_seq);
157 rc->rc_seq_reply = le64_to_cpu(rc->rc_seq_reply); 205 rc->rc_seq_reply = le64_to_cpu(rc->rc_seq_reply);
158 206
159 if (rc->rc_type == DLM_RCOM_LOCK) 207 type = rc->rc_type;
208
209 if ((type == DLM_RCOM_LOCK) || (type == DLM_RCOM_LOCK_REPLY))
160 rcom_lock_in((struct rcom_lock *) rc->rc_buf); 210 rcom_lock_in((struct rcom_lock *) rc->rc_buf);
161 211
162 else if (rc->rc_type == DLM_RCOM_STATUS_REPLY) 212 else if (type == DLM_RCOM_STATUS_REPLY)
163 rcom_config_in((struct rcom_config *) rc->rc_buf); 213 rcom_config_in((struct rcom_config *) rc->rc_buf);
164} 214}
165 215
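
The util.c change pins the errno values that travel in dlm messages: the numeric values of the higher errno codes are architecture-specific, so one fixed set is encoded on send and translated back on receive. A stand-alone round-trip sketch of the idea; only two codes are shown, and the DLM_ERRNO_* values are the ones added by the patch:

#include <errno.h>
#include <stdio.h>

#define DLM_ERRNO_EDEADLK	35
#define DLM_ERRNO_ETIMEDOUT	110

static int to_dlm_errno(int err)
{
	switch (err) {
	case -EDEADLK:   return -DLM_ERRNO_EDEADLK;
	case -ETIMEDOUT: return -DLM_ERRNO_ETIMEDOUT;
	}
	return err;				/* common low values pass through */
}

static int from_dlm_errno(int err)
{
	switch (err) {
	case -DLM_ERRNO_EDEADLK:   return -EDEADLK;
	case -DLM_ERRNO_ETIMEDOUT: return -ETIMEDOUT;
	}
	return err;
}

int main(void)
{
	int wire = to_dlm_errno(-ETIMEDOUT);

	printf("local %d -> wire %d -> local %d\n",
	       -ETIMEDOUT, wire, from_dlm_errno(wire));
	return 0;
}
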
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 0f69c416eebc..a5432bbbfb88 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -347,7 +347,8 @@ restart:
347 break; 347 break;
348 } 348 }
349 retry = __process_buffer(journal, jh, bhs,&batch_count); 349 retry = __process_buffer(journal, jh, bhs,&batch_count);
350 if (!retry && lock_need_resched(&journal->j_list_lock)){ 350 if (!retry && (need_resched() ||
351 spin_needbreak(&journal->j_list_lock))) {
351 spin_unlock(&journal->j_list_lock); 352 spin_unlock(&journal->j_list_lock);
352 retry = 1; 353 retry = 1;
353 break; 354 break;
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 610264b99a8e..31853eb65b4c 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -265,7 +265,7 @@ write_out_data:
265 put_bh(bh); 265 put_bh(bh);
266 } 266 }
267 267
268 if (lock_need_resched(&journal->j_list_lock)) { 268 if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
269 spin_unlock(&journal->j_list_lock); 269 spin_unlock(&journal->j_list_lock);
270 goto write_out_data; 270 goto write_out_data;
271 } 271 }
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 1b7f282c1ae9..6914598022ce 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -353,7 +353,8 @@ restart:
353 } 353 }
354 retry = __process_buffer(journal, jh, bhs, &batch_count, 354 retry = __process_buffer(journal, jh, bhs, &batch_count,
355 transaction); 355 transaction);
356 if (!retry && lock_need_resched(&journal->j_list_lock)){ 356 if (!retry && (need_resched() ||
357 spin_needbreak(&journal->j_list_lock))) {
357 spin_unlock(&journal->j_list_lock); 358 spin_unlock(&journal->j_list_lock);
358 retry = 1; 359 retry = 1;
359 break; 360 break;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index da8d0eb3b7b9..4f302d279279 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -341,7 +341,7 @@ write_out_data:
341 put_bh(bh); 341 put_bh(bh);
342 } 342 }
343 343
344 if (lock_need_resched(&journal->j_list_lock)) { 344 if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
345 spin_unlock(&journal->j_list_lock); 345 spin_unlock(&journal->j_list_lock);
346 goto write_out_data; 346 goto write_out_data;
347 } 347 }
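
All four jbd/jbd2 hunks replace the removed lock_need_resched() helper with an explicit need_resched() || spin_needbreak() test. A sketch of the resulting lock-break idiom; the loop condition and work item are placeholders, not jbd code:

	spin_lock(&journal->j_list_lock);
	while (more_buffers_pending(journal)) {		/* placeholder condition */
		process_one_buffer(journal);		/* placeholder work item */

		if (need_resched() ||
		    spin_needbreak(&journal->j_list_lock)) {
			spin_unlock(&journal->j_list_lock);
			cond_resched();			/* let others run or take the lock */
			spin_lock(&journal->j_list_lock);
		}
	}
	spin_unlock(&journal->j_list_lock);
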
diff --git a/include/acpi/reboot.h b/include/acpi/reboot.h
new file mode 100644
index 000000000000..8857f57e0b78
--- /dev/null
+++ b/include/acpi/reboot.h
@@ -0,0 +1,9 @@
1
2/*
3 * Dummy placeholder to make the EFI patches apply to the x86 tree.
4 * Andrew/Len, please just kill this file if you encounter it.
5 */
6#ifndef acpi_reboot
7# define acpi_reboot() do { } while (0)
8#endif
9
diff --git a/include/asm-alpha/agp.h b/include/asm-alpha/agp.h
index ef855a3bc0f5..26c179135293 100644
--- a/include/asm-alpha/agp.h
+++ b/include/asm-alpha/agp.h
@@ -7,7 +7,6 @@
7 7
8#define map_page_into_agp(page) 8#define map_page_into_agp(page)
9#define unmap_page_from_agp(page) 9#define unmap_page_from_agp(page)
10#define flush_agp_mappings()
11#define flush_agp_cache() mb() 10#define flush_agp_cache() mb()
12 11
13/* Convert a physical address to an address suitable for the GART. */ 12/* Convert a physical address to an address suitable for the GART. */
diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index d56fedbb457a..2632328d8646 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -31,14 +31,19 @@ struct bug_entry {
31#define BUG_ON(condition) do { if (unlikely(condition)) BUG(); } while(0) 31#define BUG_ON(condition) do { if (unlikely(condition)) BUG(); } while(0)
32#endif 32#endif
33 33
34#ifndef HAVE_ARCH_WARN_ON 34#ifndef __WARN
35#ifndef __ASSEMBLY__
36extern void warn_on_slowpath(const char *file, const int line);
37#define WANT_WARN_ON_SLOWPATH
38#endif
39#define __WARN() warn_on_slowpath(__FILE__, __LINE__)
40#endif
41
42#ifndef WARN_ON
35#define WARN_ON(condition) ({ \ 43#define WARN_ON(condition) ({ \
36 int __ret_warn_on = !!(condition); \ 44 int __ret_warn_on = !!(condition); \
37 if (unlikely(__ret_warn_on)) { \ 45 if (unlikely(__ret_warn_on)) \
38 printk("WARNING: at %s:%d %s()\n", __FILE__, \ 46 __WARN(); \
39 __LINE__, __FUNCTION__); \
40 dump_stack(); \
41 } \
42 unlikely(__ret_warn_on); \ 47 unlikely(__ret_warn_on); \
43}) 48})
44#endif 49#endif
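
The bug.h rework splits the slow path out of WARN_ON(): an architecture may supply its own __WARN(), and the generic fallback calls warn_on_slowpath(file, line) out of line instead of open-coding printk() plus dump_stack() at every call site. Call sites are unchanged; a small sketch of the common pattern, with an illustrative function name:

static int example_attach(void *res)
{
	/* WARN_ON() still evaluates to the tested condition, so it can
	   guard an early return while logging the unexpected case */
	if (WARN_ON(res == NULL))
		return -EINVAL;

	/* ... normal path ... */
	return 0;
}
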
diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index d85172e9ed45..4b8d31cda1a0 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -3,54 +3,79 @@
3#include <linux/compiler.h> 3#include <linux/compiler.h>
4#include <linux/threads.h> 4#include <linux/threads.h>
5 5
6#define __GENERIC_PER_CPU 6/*
7 * Determine the real variable name from the name visible in the
8 * kernel sources.
9 */
10#define per_cpu_var(var) per_cpu__##var
11
7#ifdef CONFIG_SMP 12#ifdef CONFIG_SMP
8 13
14/*
15 * per_cpu_offset() is the offset that has to be added to a
16 * percpu variable to get to the instance for a certain processor.
17 *
18 * Most arches use the __per_cpu_offset array for those offsets but
19 * some arches have their own ways of determining the offset (x86_64, s390).
20 */
21#ifndef __per_cpu_offset
9extern unsigned long __per_cpu_offset[NR_CPUS]; 22extern unsigned long __per_cpu_offset[NR_CPUS];
10 23
11#define per_cpu_offset(x) (__per_cpu_offset[x]) 24#define per_cpu_offset(x) (__per_cpu_offset[x])
25#endif
12 26
13/* Separate out the type, so (int[3], foo) works. */ 27/*
14#define DEFINE_PER_CPU(type, name) \ 28 * Determine the offset for the currently active processor.
15 __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name 29 * An arch may define __my_cpu_offset to provide a more effective
16 30 * means of obtaining the offset to the per cpu variables of the
17#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \ 31 * current processor.
18 __attribute__((__section__(".data.percpu.shared_aligned"))) \ 32 */
19 __typeof__(type) per_cpu__##name \ 33#ifndef __my_cpu_offset
20 ____cacheline_aligned_in_smp 34#define __my_cpu_offset per_cpu_offset(raw_smp_processor_id())
21 35#define my_cpu_offset per_cpu_offset(smp_processor_id())
22/* var is in discarded region: offset to particular copy we want */ 36#else
23#define per_cpu(var, cpu) (*({ \ 37#define my_cpu_offset __my_cpu_offset
24 extern int simple_identifier_##var(void); \ 38#endif
25 RELOC_HIDE(&per_cpu__##var, __per_cpu_offset[cpu]); })) 39
26#define __get_cpu_var(var) per_cpu(var, smp_processor_id()) 40/*
27#define __raw_get_cpu_var(var) per_cpu(var, raw_smp_processor_id()) 41 * Add a offset to a pointer but keep the pointer as is.
28 42 *
29/* A macro to avoid #include hell... */ 43 * Only S390 provides its own means of moving the pointer.
30#define percpu_modcopy(pcpudst, src, size) \ 44 */
31do { \ 45#ifndef SHIFT_PERCPU_PTR
32 unsigned int __i; \ 46#define SHIFT_PERCPU_PTR(__p, __offset) RELOC_HIDE((__p), (__offset))
33 for_each_possible_cpu(__i) \ 47#endif
34 memcpy((pcpudst)+__per_cpu_offset[__i], \
35 (src), (size)); \
36} while (0)
37#else /* ! SMP */
38 48
39#define DEFINE_PER_CPU(type, name) \ 49/*
40 __typeof__(type) per_cpu__##name 50 * A percpu variable may point to a discarded regions. The following are
51 * established ways to produce a usable pointer from the percpu variable
52 * offset.
53 */
54#define per_cpu(var, cpu) \
55 (*SHIFT_PERCPU_PTR(&per_cpu_var(var), per_cpu_offset(cpu)))
56#define __get_cpu_var(var) \
57 (*SHIFT_PERCPU_PTR(&per_cpu_var(var), my_cpu_offset))
58#define __raw_get_cpu_var(var) \
59 (*SHIFT_PERCPU_PTR(&per_cpu_var(var), __my_cpu_offset))
41 60
42#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
43 DEFINE_PER_CPU(type, name)
44 61
45#define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var)) 62#ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA
46#define __get_cpu_var(var) per_cpu__##var 63extern void setup_per_cpu_areas(void);
47#define __raw_get_cpu_var(var) per_cpu__##var 64#endif
65
66#else /* ! SMP */
67
68#define per_cpu(var, cpu) (*((void)(cpu), &per_cpu_var(var)))
69#define __get_cpu_var(var) per_cpu_var(var)
70#define __raw_get_cpu_var(var) per_cpu_var(var)
48 71
49#endif /* SMP */ 72#endif /* SMP */
50 73
51#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name 74#ifndef PER_CPU_ATTRIBUTES
75#define PER_CPU_ATTRIBUTES
76#endif
52 77
53#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var) 78#define DECLARE_PER_CPU(type, name) extern PER_CPU_ATTRIBUTES \
54#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var) 79 __typeof__(type) per_cpu_var(name)
55 80
56#endif /* _ASM_GENERIC_PERCPU_H_ */ 81#endif /* _ASM_GENERIC_PERCPU_H_ */
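
The generic percpu rework derives everything from two per-arch hooks, per_cpu_offset()/__my_cpu_offset, plus one pointer-shifting macro, SHIFT_PERCPU_PTR(); the DEFINE_PER_CPU()/EXPORT_PER_CPU_SYMBOL() definitions leave the per-arch headers and are expected to come from a common header after this series. The accessors keep their old form; a brief usage sketch with an illustrative variable:

DEFINE_PER_CPU(unsigned long, example_hits);

static void count_hit(void)
{
	int cpu = get_cpu();			/* disable preemption, get this CPU */

	per_cpu(example_hits, cpu)++;		/* explicit-CPU access */
	put_cpu();
}

static void count_hit_fast(void)
{
	__get_cpu_var(example_hits)++;		/* current-CPU access; the caller
						   must already be non-preemptible */
}
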
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 75f2bfab614f..6ce9f3ab928d 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -15,7 +15,6 @@
15 15
16#include <linux/swap.h> 16#include <linux/swap.h>
17#include <linux/quicklist.h> 17#include <linux/quicklist.h>
18#include <asm/pgalloc.h>
19#include <asm/tlbflush.h> 18#include <asm/tlbflush.h>
20 19
21/* 20/*
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 76df771be585..f784d2f34149 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -184,6 +184,7 @@
184 VMLINUX_SYMBOL(__start___param) = .; \ 184 VMLINUX_SYMBOL(__start___param) = .; \
185 *(__param) \ 185 *(__param) \
186 VMLINUX_SYMBOL(__stop___param) = .; \ 186 VMLINUX_SYMBOL(__stop___param) = .; \
187 . = ALIGN((align)); \
187 VMLINUX_SYMBOL(__end_rodata) = .; \ 188 VMLINUX_SYMBOL(__end_rodata) = .; \
188 } \ 189 } \
189 . = ALIGN((align)); 190 . = ALIGN((align));
diff --git a/include/asm-ia64/acpi.h b/include/asm-ia64/acpi.h
index 81bcd5e51789..cd1cc39b5599 100644
--- a/include/asm-ia64/acpi.h
+++ b/include/asm-ia64/acpi.h
@@ -127,6 +127,8 @@ extern int __devinitdata pxm_to_nid_map[MAX_PXM_DOMAINS];
127extern int __initdata nid_to_pxm_map[MAX_NUMNODES]; 127extern int __initdata nid_to_pxm_map[MAX_NUMNODES];
128#endif 128#endif
129 129
130#define acpi_unlazy_tlb(x)
131
130#endif /*__KERNEL__*/ 132#endif /*__KERNEL__*/
131 133
132#endif /*_ASM_ACPI_H*/ 134#endif /*_ASM_ACPI_H*/
diff --git a/include/asm-ia64/agp.h b/include/asm-ia64/agp.h
index 4e517f0e6afa..c11fdd8ab4d7 100644
--- a/include/asm-ia64/agp.h
+++ b/include/asm-ia64/agp.h
@@ -15,7 +15,6 @@
15 */ 15 */
16#define map_page_into_agp(page) /* nothing */ 16#define map_page_into_agp(page) /* nothing */
17#define unmap_page_from_agp(page) /* nothing */ 17#define unmap_page_from_agp(page) /* nothing */
18#define flush_agp_mappings() /* nothing */
19#define flush_agp_cache() mb() 18#define flush_agp_cache() mb()
20 19
21/* Convert a physical address to an address suitable for the GART. */ 20/* Convert a physical address to an address suitable for the GART. */
diff --git a/include/asm-ia64/percpu.h b/include/asm-ia64/percpu.h
index c4f1e328a5ba..0095bcf79848 100644
--- a/include/asm-ia64/percpu.h
+++ b/include/asm-ia64/percpu.h
@@ -16,28 +16,11 @@
16#include <linux/threads.h> 16#include <linux/threads.h>
17 17
18#ifdef HAVE_MODEL_SMALL_ATTRIBUTE 18#ifdef HAVE_MODEL_SMALL_ATTRIBUTE
19# define __SMALL_ADDR_AREA __attribute__((__model__ (__small__))) 19# define PER_CPU_ATTRIBUTES __attribute__((__model__ (__small__)))
20#else
21# define __SMALL_ADDR_AREA
22#endif 20#endif
23 21
24#define DECLARE_PER_CPU(type, name) \ 22#define DECLARE_PER_CPU(type, name) \
25 extern __SMALL_ADDR_AREA __typeof__(type) per_cpu__##name 23 extern PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
26
27/* Separate out the type, so (int[3], foo) works. */
28#define DEFINE_PER_CPU(type, name) \
29 __attribute__((__section__(".data.percpu"))) \
30 __SMALL_ADDR_AREA __typeof__(type) per_cpu__##name
31
32#ifdef CONFIG_SMP
33#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
34 __attribute__((__section__(".data.percpu.shared_aligned"))) \
35 __SMALL_ADDR_AREA __typeof__(type) per_cpu__##name \
36 ____cacheline_aligned_in_smp
37#else
38#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
39 DEFINE_PER_CPU(type, name)
40#endif
41 24
42/* 25/*
43 * Pretty much a literal copy of asm-generic/percpu.h, except that percpu_modcopy() is an 26 * Pretty much a literal copy of asm-generic/percpu.h, except that percpu_modcopy() is an
@@ -68,9 +51,6 @@ extern void *per_cpu_init(void);
68 51
69#endif /* SMP */ 52#endif /* SMP */
70 53
71#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
72#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
73
74/* 54/*
75 * Be extremely careful when taking the address of this variable! Due to virtual 55 * Be extremely careful when taking the address of this variable! Due to virtual
76 * remapping, it is different from the canonical address returned by __get_cpu_var(var)! 56 * remapping, it is different from the canonical address returned by __get_cpu_var(var)!
diff --git a/include/asm-m32r/signal.h b/include/asm-m32r/signal.h
index 937258686ba5..1a607066bc64 100644
--- a/include/asm-m32r/signal.h
+++ b/include/asm-m32r/signal.h
@@ -157,7 +157,7 @@ typedef struct sigaltstack {
157#undef __HAVE_ARCH_SIG_BITOPS 157#undef __HAVE_ARCH_SIG_BITOPS
158 158
159struct pt_regs; 159struct pt_regs;
160extern int FASTCALL(do_signal(struct pt_regs *regs, sigset_t *oldset)); 160extern int do_signal(struct pt_regs *regs, sigset_t *oldset);
161 161
162#define ptrace_signal_deliver(regs, cookie) do { } while (0) 162#define ptrace_signal_deliver(regs, cookie) do { } while (0)
163 163
diff --git a/include/asm-parisc/agp.h b/include/asm-parisc/agp.h
index 9f61d4eb6c01..9651660da639 100644
--- a/include/asm-parisc/agp.h
+++ b/include/asm-parisc/agp.h
@@ -9,7 +9,6 @@
9 9
10#define map_page_into_agp(page) /* nothing */ 10#define map_page_into_agp(page) /* nothing */
11#define unmap_page_from_agp(page) /* nothing */ 11#define unmap_page_from_agp(page) /* nothing */
12#define flush_agp_mappings() /* nothing */
13#define flush_agp_cache() mb() 12#define flush_agp_cache() mb()
14 13
15/* Convert a physical address to an address suitable for the GART. */ 14/* Convert a physical address to an address suitable for the GART. */
diff --git a/include/asm-powerpc/agp.h b/include/asm-powerpc/agp.h
index e5ccaca2f5a4..86455c4c31ee 100644
--- a/include/asm-powerpc/agp.h
+++ b/include/asm-powerpc/agp.h
@@ -6,7 +6,6 @@
6 6
7#define map_page_into_agp(page) 7#define map_page_into_agp(page)
8#define unmap_page_from_agp(page) 8#define unmap_page_from_agp(page)
9#define flush_agp_mappings()
10#define flush_agp_cache() mb() 9#define flush_agp_cache() mb()
11 10
12/* Convert a physical address to an address suitable for the GART. */ 11/* Convert a physical address to an address suitable for the GART. */
diff --git a/include/asm-powerpc/percpu.h b/include/asm-powerpc/percpu.h
index 6b229626d3ff..cc1cbf656b02 100644
--- a/include/asm-powerpc/percpu.h
+++ b/include/asm-powerpc/percpu.h
@@ -16,15 +16,6 @@
16#define __my_cpu_offset() get_paca()->data_offset 16#define __my_cpu_offset() get_paca()->data_offset
17#define per_cpu_offset(x) (__per_cpu_offset(x)) 17#define per_cpu_offset(x) (__per_cpu_offset(x))
18 18
19/* Separate out the type, so (int[3], foo) works. */
20#define DEFINE_PER_CPU(type, name) \
21 __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
22
23#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
24 __attribute__((__section__(".data.percpu.shared_aligned"))) \
25 __typeof__(type) per_cpu__##name \
26 ____cacheline_aligned_in_smp
27
28/* var is in discarded region: offset to particular copy we want */ 19/* var is in discarded region: offset to particular copy we want */
29#define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu))) 20#define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu)))
30#define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset())) 21#define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset()))
@@ -43,11 +34,6 @@ extern void setup_per_cpu_areas(void);
43 34
44#else /* ! SMP */ 35#else /* ! SMP */
45 36
46#define DEFINE_PER_CPU(type, name) \
47 __typeof__(type) per_cpu__##name
48#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
49 DEFINE_PER_CPU(type, name)
50
51#define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var)) 37#define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var))
52#define __get_cpu_var(var) per_cpu__##var 38#define __get_cpu_var(var) per_cpu__##var
53#define __raw_get_cpu_var(var) per_cpu__##var 39#define __raw_get_cpu_var(var) per_cpu__##var
@@ -56,9 +42,6 @@ extern void setup_per_cpu_areas(void);
56 42
57#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name 43#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name
58 44
59#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
60#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
61
62#else 45#else
63#include <asm-generic/percpu.h> 46#include <asm-generic/percpu.h>
64#endif 47#endif
diff --git a/include/asm-powerpc/ptrace.h b/include/asm-powerpc/ptrace.h
index 13fccc5a4119..3063363f6799 100644
--- a/include/asm-powerpc/ptrace.h
+++ b/include/asm-powerpc/ptrace.h
@@ -119,6 +119,13 @@ do { \
119} while (0) 119} while (0)
120#endif /* __powerpc64__ */ 120#endif /* __powerpc64__ */
121 121
122/*
123 * These are defined as per linux/ptrace.h, which see.
124 */
125#define arch_has_single_step() (1)
126extern void user_enable_single_step(struct task_struct *);
127extern void user_disable_single_step(struct task_struct *);
128
122#endif /* __ASSEMBLY__ */ 129#endif /* __ASSEMBLY__ */
123 130
124#endif /* __KERNEL__ */ 131#endif /* __KERNEL__ */
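
powerpc now advertises generic single-step support through arch_has_single_step() and user_enable_single_step()/user_disable_single_step(), as described in linux/ptrace.h. A hedged sketch of how arch-independent ptrace code can use the hooks; the function and the error code choice are illustrative:

static int example_step_child(struct task_struct *child)
{
	if (!arch_has_single_step())
		return -EIO;			/* illustrative error choice */

	user_enable_single_step(child);
	/* ... resume the child and wait for the trap ... */
	user_disable_single_step(child);
	return 0;
}
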
diff --git a/include/asm-s390/percpu.h b/include/asm-s390/percpu.h
index 545857e64443..2d676a873858 100644
--- a/include/asm-s390/percpu.h
+++ b/include/asm-s390/percpu.h
@@ -4,8 +4,6 @@
4#include <linux/compiler.h> 4#include <linux/compiler.h>
5#include <asm/lowcore.h> 5#include <asm/lowcore.h>
6 6
7#define __GENERIC_PER_CPU
8
9/* 7/*
10 * s390 uses its own implementation for per cpu data, the offset of 8 * s390 uses its own implementation for per cpu data, the offset of
11 * the cpu local data area is cached in the cpu's lowcore memory. 9 * the cpu local data area is cached in the cpu's lowcore memory.
@@ -36,16 +34,6 @@
36 34
37extern unsigned long __per_cpu_offset[NR_CPUS]; 35extern unsigned long __per_cpu_offset[NR_CPUS];
38 36
39/* Separate out the type, so (int[3], foo) works. */
40#define DEFINE_PER_CPU(type, name) \
41 __attribute__((__section__(".data.percpu"))) \
42 __typeof__(type) per_cpu__##name
43
44#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
45 __attribute__((__section__(".data.percpu.shared_aligned"))) \
46 __typeof__(type) per_cpu__##name \
47 ____cacheline_aligned_in_smp
48
49#define __get_cpu_var(var) __reloc_hide(var,S390_lowcore.percpu_offset) 37#define __get_cpu_var(var) __reloc_hide(var,S390_lowcore.percpu_offset)
50#define __raw_get_cpu_var(var) __reloc_hide(var,S390_lowcore.percpu_offset) 38#define __raw_get_cpu_var(var) __reloc_hide(var,S390_lowcore.percpu_offset)
51#define per_cpu(var,cpu) __reloc_hide(var,__per_cpu_offset[cpu]) 39#define per_cpu(var,cpu) __reloc_hide(var,__per_cpu_offset[cpu])
@@ -62,11 +50,6 @@ do { \
62 50
63#else /* ! SMP */ 51#else /* ! SMP */
64 52
65#define DEFINE_PER_CPU(type, name) \
66 __typeof__(type) per_cpu__##name
67#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
68 DEFINE_PER_CPU(type, name)
69
70#define __get_cpu_var(var) __reloc_hide(var,0) 53#define __get_cpu_var(var) __reloc_hide(var,0)
71#define __raw_get_cpu_var(var) __reloc_hide(var,0) 54#define __raw_get_cpu_var(var) __reloc_hide(var,0)
72#define per_cpu(var,cpu) __reloc_hide(var,0) 55#define per_cpu(var,cpu) __reloc_hide(var,0)
@@ -75,7 +58,4 @@ do { \
75 58
76#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name 59#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name
77 60
78#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
79#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
80
81#endif /* __ARCH_S390_PERCPU__ */ 61#endif /* __ARCH_S390_PERCPU__ */
diff --git a/include/asm-sparc64/agp.h b/include/asm-sparc64/agp.h
index 58f8cb6ae767..e9fcf0e781ea 100644
--- a/include/asm-sparc64/agp.h
+++ b/include/asm-sparc64/agp.h
@@ -5,7 +5,6 @@
5 5
6#define map_page_into_agp(page) 6#define map_page_into_agp(page)
7#define unmap_page_from_agp(page) 7#define unmap_page_from_agp(page)
8#define flush_agp_mappings()
9#define flush_agp_cache() mb() 8#define flush_agp_cache() mb()
10 9
11/* Convert a physical address to an address suitable for the GART. */ 10/* Convert a physical address to an address suitable for the GART. */
diff --git a/include/asm-sparc64/percpu.h b/include/asm-sparc64/percpu.h
index a1f53a4da405..c7e52decba98 100644
--- a/include/asm-sparc64/percpu.h
+++ b/include/asm-sparc64/percpu.h
@@ -16,15 +16,6 @@ extern unsigned long __per_cpu_shift;
16 (__per_cpu_base + ((unsigned long)(__cpu) << __per_cpu_shift)) 16 (__per_cpu_base + ((unsigned long)(__cpu) << __per_cpu_shift))
17#define per_cpu_offset(x) (__per_cpu_offset(x)) 17#define per_cpu_offset(x) (__per_cpu_offset(x))
18 18
19/* Separate out the type, so (int[3], foo) works. */
20#define DEFINE_PER_CPU(type, name) \
21 __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
22
23#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
24 __attribute__((__section__(".data.percpu.shared_aligned"))) \
25 __typeof__(type) per_cpu__##name \
26 ____cacheline_aligned_in_smp
27
28/* var is in discarded region: offset to particular copy we want */ 19/* var is in discarded region: offset to particular copy we want */
29#define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu))) 20#define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu)))
30#define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __local_per_cpu_offset)) 21#define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __local_per_cpu_offset))
@@ -41,10 +32,6 @@ do { \
41#else /* ! SMP */ 32#else /* ! SMP */
42 33
43#define real_setup_per_cpu_areas() do { } while (0) 34#define real_setup_per_cpu_areas() do { } while (0)
44#define DEFINE_PER_CPU(type, name) \
45 __typeof__(type) per_cpu__##name
46#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
47 DEFINE_PER_CPU(type, name)
48 35
49#define per_cpu(var, cpu) (*((void)cpu, &per_cpu__##var)) 36#define per_cpu(var, cpu) (*((void)cpu, &per_cpu__##var))
50#define __get_cpu_var(var) per_cpu__##var 37#define __get_cpu_var(var) per_cpu__##var
@@ -54,7 +41,4 @@ do { \
54 41
55#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name 42#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name
56 43
57#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
58#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
59
60#endif /* __ARCH_SPARC64_PERCPU__ */ 44#endif /* __ARCH_SPARC64_PERCPU__ */
diff --git a/include/asm-um/asm.h b/include/asm-um/asm.h
new file mode 100644
index 000000000000..af1269a1e9eb
--- /dev/null
+++ b/include/asm-um/asm.h
@@ -0,0 +1,6 @@
1#ifndef __UM_ASM_H
2#define __UM_ASM_H
3
4#include "asm/arch/asm.h"
5
6#endif
diff --git a/include/asm-um/linkage.h b/include/asm-um/linkage.h
index 78b862472b36..cdb3024a699a 100644
--- a/include/asm-um/linkage.h
+++ b/include/asm-um/linkage.h
@@ -6,7 +6,6 @@
6 6
7/* <linux/linkage.h> will pick sane defaults */ 7/* <linux/linkage.h> will pick sane defaults */
8#ifdef CONFIG_GPROF 8#ifdef CONFIG_GPROF
9#undef FASTCALL
10#undef fastcall 9#undef fastcall
11#endif 10#endif
12 11
diff --git a/include/asm-um/nops.h b/include/asm-um/nops.h
new file mode 100644
index 000000000000..814e9bf5dea6
--- /dev/null
+++ b/include/asm-um/nops.h
@@ -0,0 +1,6 @@
1#ifndef __UM_NOPS_H
2#define __UM_NOPS_H
3
4#include "asm/arch/nops.h"
5
6#endif
diff --git a/include/asm-x86/Kbuild b/include/asm-x86/Kbuild
index 12db5a1cdd74..3c6f0f80e827 100644
--- a/include/asm-x86/Kbuild
+++ b/include/asm-x86/Kbuild
@@ -3,21 +3,20 @@ include include/asm-generic/Kbuild.asm
3header-y += boot.h 3header-y += boot.h
4header-y += bootparam.h 4header-y += bootparam.h
5header-y += debugreg.h 5header-y += debugreg.h
6header-y += kvm.h
6header-y += ldt.h 7header-y += ldt.h
7header-y += msr-index.h 8header-y += msr-index.h
8header-y += prctl.h 9header-y += prctl.h
9header-y += ptrace-abi.h 10header-y += ptrace-abi.h
10header-y += sigcontext32.h 11header-y += sigcontext32.h
11header-y += ucontext.h 12header-y += ucontext.h
12header-y += vsyscall32.h
13 13
14unifdef-y += e820.h 14unifdef-y += e820.h
15unifdef-y += ist.h 15unifdef-y += ist.h
16unifdef-y += mce.h 16unifdef-y += mce.h
17unifdef-y += msr.h 17unifdef-y += msr.h
18unifdef-y += mtrr.h 18unifdef-y += mtrr.h
19unifdef-y += page_32.h 19unifdef-y += page.h
20unifdef-y += page_64.h
21unifdef-y += posix_types_32.h 20unifdef-y += posix_types_32.h
22unifdef-y += posix_types_64.h 21unifdef-y += posix_types_64.h
23unifdef-y += ptrace.h 22unifdef-y += ptrace.h
diff --git a/include/asm-x86/acpi.h b/include/asm-x86/acpi.h
index f8a89793ac8c..98a9ca266531 100644
--- a/include/asm-x86/acpi.h
+++ b/include/asm-x86/acpi.h
@@ -1,13 +1,123 @@
1#ifndef _ASM_X86_ACPI_H 1#ifndef _ASM_X86_ACPI_H
2#define _ASM_X86_ACPI_H 2#define _ASM_X86_ACPI_H
3 3
4#ifdef CONFIG_X86_32 4/*
5# include "acpi_32.h" 5 * Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
6#else 6 * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
7# include "acpi_64.h" 7 *
8#endif 8 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 *
24 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
25 */
26#include <acpi/pdc_intel.h>
9 27
28#include <asm/numa.h>
10#include <asm/processor.h> 29#include <asm/processor.h>
30#include <asm/mmu.h>
31
32#define COMPILER_DEPENDENT_INT64 long long
33#define COMPILER_DEPENDENT_UINT64 unsigned long long
34
35/*
36 * Calling conventions:
37 *
38 * ACPI_SYSTEM_XFACE - Interfaces to host OS (handlers, threads)
39 * ACPI_EXTERNAL_XFACE - External ACPI interfaces
40 * ACPI_INTERNAL_XFACE - Internal ACPI interfaces
41 * ACPI_INTERNAL_VAR_XFACE - Internal variable-parameter list interfaces
42 */
43#define ACPI_SYSTEM_XFACE
44#define ACPI_EXTERNAL_XFACE
45#define ACPI_INTERNAL_XFACE
46#define ACPI_INTERNAL_VAR_XFACE
47
48/* Asm macros */
49
50#define ACPI_ASM_MACROS
51#define BREAKPOINT3
52#define ACPI_DISABLE_IRQS() local_irq_disable()
53#define ACPI_ENABLE_IRQS() local_irq_enable()
54#define ACPI_FLUSH_CPU_CACHE() wbinvd()
55
56int __acpi_acquire_global_lock(unsigned int *lock);
57int __acpi_release_global_lock(unsigned int *lock);
58
59#define ACPI_ACQUIRE_GLOBAL_LOCK(facs, Acq) \
60 ((Acq) = __acpi_acquire_global_lock(&facs->global_lock))
61
62#define ACPI_RELEASE_GLOBAL_LOCK(facs, Acq) \
63 ((Acq) = __acpi_release_global_lock(&facs->global_lock))
64
65/*
66 * Math helper asm macros
67 */
68#define ACPI_DIV_64_BY_32(n_hi, n_lo, d32, q32, r32) \
69 asm("divl %2;" \
70 :"=a"(q32), "=d"(r32) \
71 :"r"(d32), \
72 "0"(n_lo), "1"(n_hi))
73
74
75#define ACPI_SHIFT_RIGHT_64(n_hi, n_lo) \
76 asm("shrl $1,%2 ;" \
77 "rcrl $1,%3;" \
78 :"=r"(n_hi), "=r"(n_lo) \
79 :"0"(n_hi), "1"(n_lo))
80
81#ifdef CONFIG_ACPI
82extern int acpi_lapic;
83extern int acpi_ioapic;
84extern int acpi_noirq;
85extern int acpi_strict;
86extern int acpi_disabled;
87extern int acpi_ht;
88extern int acpi_pci_disabled;
89extern int acpi_skip_timer_override;
90extern int acpi_use_timer_override;
91
92static inline void disable_acpi(void)
93{
94 acpi_disabled = 1;
95 acpi_ht = 0;
96 acpi_pci_disabled = 1;
97 acpi_noirq = 1;
98}
99
100/* Fixmap pages to reserve for ACPI boot-time tables (see fixmap.h) */
101#define FIX_ACPI_PAGES 4
102
103extern int acpi_gsi_to_irq(u32 gsi, unsigned int *irq);
104
105static inline void acpi_noirq_set(void) { acpi_noirq = 1; }
106static inline void acpi_disable_pci(void)
107{
108 acpi_pci_disabled = 1;
109 acpi_noirq_set();
110}
111extern int acpi_irq_balance_set(char *str);
112
113/* routines for saving/restoring kernel state */
114extern int acpi_save_state_mem(void);
115extern void acpi_restore_state_mem(void);
116
117extern unsigned long acpi_wakeup_address;
118
119/* early initialization routine */
120extern void acpi_reserve_bootmem(void);
11 121
12/* 122/*
13 * Check if the CPU can handle C2 and deeper 123 * Check if the CPU can handle C2 and deeper
@@ -29,4 +139,35 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
29 return max_cstate; 139 return max_cstate;
30} 140}
31 141
142#else /* !CONFIG_ACPI */
143
144#define acpi_lapic 0
145#define acpi_ioapic 0
146static inline void acpi_noirq_set(void) { }
147static inline void acpi_disable_pci(void) { }
148static inline void disable_acpi(void) { }
149
150#endif /* !CONFIG_ACPI */
151
152#define ARCH_HAS_POWER_INIT 1
153
154struct bootnode;
155
156#ifdef CONFIG_ACPI_NUMA
157extern int acpi_numa;
158extern int acpi_scan_nodes(unsigned long start, unsigned long end);
159#ifdef CONFIG_X86_64
160# define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
161#endif
162extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
163 int num_nodes);
164#else
165static inline void acpi_fake_nodes(const struct bootnode *fake_nodes,
166 int num_nodes)
167{
168}
32#endif 169#endif
170
171#define acpi_unlazy_tlb(x) leave_mm(x)
172
173#endif /*__X86_ASM_ACPI_H*/
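
The former acpi_32.h/acpi_64.h are folded into this single header; the CONFIG_ACPI=n branch keeps empty static inline stubs so callers build without ifdefs of their own. A small sketch of code relying on those stubs; the function itself is illustrative:

static void example_platform_quirk(void)
{
	/* compiles to nothing when CONFIG_ACPI is not set, because the
	   header provides an empty inline stub */
	acpi_disable_pci();
}
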
diff --git a/include/asm-x86/acpi_32.h b/include/asm-x86/acpi_32.h
deleted file mode 100644
index 723493e6c851..000000000000
--- a/include/asm-x86/acpi_32.h
+++ /dev/null
@@ -1,143 +0,0 @@
1/*
2 * asm-i386/acpi.h
3 *
4 * Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
5 * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
6 *
7 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 *
23 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24 */
25
26#ifndef _ASM_ACPI_H
27#define _ASM_ACPI_H
28
29#ifdef __KERNEL__
30
31#include <acpi/pdc_intel.h>
32
33#include <asm/system.h> /* defines cmpxchg */
34
35#define COMPILER_DEPENDENT_INT64 long long
36#define COMPILER_DEPENDENT_UINT64 unsigned long long
37
38/*
39 * Calling conventions:
40 *
41 * ACPI_SYSTEM_XFACE - Interfaces to host OS (handlers, threads)
42 * ACPI_EXTERNAL_XFACE - External ACPI interfaces
43 * ACPI_INTERNAL_XFACE - Internal ACPI interfaces
44 * ACPI_INTERNAL_VAR_XFACE - Internal variable-parameter list interfaces
45 */
46#define ACPI_SYSTEM_XFACE
47#define ACPI_EXTERNAL_XFACE
48#define ACPI_INTERNAL_XFACE
49#define ACPI_INTERNAL_VAR_XFACE
50
51/* Asm macros */
52
53#define ACPI_ASM_MACROS
54#define BREAKPOINT3
55#define ACPI_DISABLE_IRQS() local_irq_disable()
56#define ACPI_ENABLE_IRQS() local_irq_enable()
57#define ACPI_FLUSH_CPU_CACHE() wbinvd()
58
59int __acpi_acquire_global_lock(unsigned int *lock);
60int __acpi_release_global_lock(unsigned int *lock);
61
62#define ACPI_ACQUIRE_GLOBAL_LOCK(facs, Acq) \
63 ((Acq) = __acpi_acquire_global_lock(&facs->global_lock))
64
65#define ACPI_RELEASE_GLOBAL_LOCK(facs, Acq) \
66 ((Acq) = __acpi_release_global_lock(&facs->global_lock))
67
68/*
69 * Math helper asm macros
70 */
71#define ACPI_DIV_64_BY_32(n_hi, n_lo, d32, q32, r32) \
72 asm("divl %2;" \
73 :"=a"(q32), "=d"(r32) \
74 :"r"(d32), \
75 "0"(n_lo), "1"(n_hi))
76
77
78#define ACPI_SHIFT_RIGHT_64(n_hi, n_lo) \
79 asm("shrl $1,%2;" \
80 "rcrl $1,%3;" \
81 :"=r"(n_hi), "=r"(n_lo) \
82 :"0"(n_hi), "1"(n_lo))
83
84extern void early_quirks(void);
85
86#ifdef CONFIG_ACPI
87extern int acpi_lapic;
88extern int acpi_ioapic;
89extern int acpi_noirq;
90extern int acpi_strict;
91extern int acpi_disabled;
92extern int acpi_ht;
93extern int acpi_pci_disabled;
94static inline void disable_acpi(void)
95{
96 acpi_disabled = 1;
97 acpi_ht = 0;
98 acpi_pci_disabled = 1;
99 acpi_noirq = 1;
100}
101
102/* Fixmap pages to reserve for ACPI boot-time tables (see fixmap.h) */
103#define FIX_ACPI_PAGES 4
104
105extern int acpi_gsi_to_irq(u32 gsi, unsigned int *irq);
106
107#ifdef CONFIG_X86_IO_APIC
108extern int acpi_skip_timer_override;
109extern int acpi_use_timer_override;
110#endif
111
112static inline void acpi_noirq_set(void) { acpi_noirq = 1; }
113static inline void acpi_disable_pci(void)
114{
115 acpi_pci_disabled = 1;
116 acpi_noirq_set();
117}
118extern int acpi_irq_balance_set(char *str);
119
120/* routines for saving/restoring kernel state */
121extern int acpi_save_state_mem(void);
122extern void acpi_restore_state_mem(void);
123
124extern unsigned long acpi_wakeup_address;
125
126/* early initialization routine */
127extern void acpi_reserve_bootmem(void);
128
129#else /* !CONFIG_ACPI */
130
131#define acpi_lapic 0
132#define acpi_ioapic 0
133static inline void acpi_noirq_set(void) { }
134static inline void acpi_disable_pci(void) { }
135static inline void disable_acpi(void) { }
136
137#endif /* !CONFIG_ACPI */
138
139#define ARCH_HAS_POWER_INIT 1
140
141#endif /*__KERNEL__*/
142
143#endif /*_ASM_ACPI_H*/
diff --git a/include/asm-x86/acpi_64.h b/include/asm-x86/acpi_64.h
deleted file mode 100644
index 98173357dd89..000000000000
--- a/include/asm-x86/acpi_64.h
+++ /dev/null
@@ -1,153 +0,0 @@
1/*
2 * asm-x86_64/acpi.h
3 *
4 * Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
5 * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
6 *
7 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 *
23 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24 */
25
26#ifndef _ASM_ACPI_H
27#define _ASM_ACPI_H
28
29#ifdef __KERNEL__
30
31#include <acpi/pdc_intel.h>
32#include <asm/numa.h>
33
34#define COMPILER_DEPENDENT_INT64 long long
35#define COMPILER_DEPENDENT_UINT64 unsigned long long
36
37/*
38 * Calling conventions:
39 *
40 * ACPI_SYSTEM_XFACE - Interfaces to host OS (handlers, threads)
41 * ACPI_EXTERNAL_XFACE - External ACPI interfaces
42 * ACPI_INTERNAL_XFACE - Internal ACPI interfaces
43 * ACPI_INTERNAL_VAR_XFACE - Internal variable-parameter list interfaces
44 */
45#define ACPI_SYSTEM_XFACE
46#define ACPI_EXTERNAL_XFACE
47#define ACPI_INTERNAL_XFACE
48#define ACPI_INTERNAL_VAR_XFACE
49
50/* Asm macros */
51
52#define ACPI_ASM_MACROS
53#define BREAKPOINT3
54#define ACPI_DISABLE_IRQS() local_irq_disable()
55#define ACPI_ENABLE_IRQS() local_irq_enable()
56#define ACPI_FLUSH_CPU_CACHE() wbinvd()
57
58int __acpi_acquire_global_lock(unsigned int *lock);
59int __acpi_release_global_lock(unsigned int *lock);
60
61#define ACPI_ACQUIRE_GLOBAL_LOCK(facs, Acq) \
62 ((Acq) = __acpi_acquire_global_lock(&facs->global_lock))
63
64#define ACPI_RELEASE_GLOBAL_LOCK(facs, Acq) \
65 ((Acq) = __acpi_release_global_lock(&facs->global_lock))
66
67/*
68 * Math helper asm macros
69 */
70#define ACPI_DIV_64_BY_32(n_hi, n_lo, d32, q32, r32) \
71 asm("divl %2;" \
72 :"=a"(q32), "=d"(r32) \
73 :"r"(d32), \
74 "0"(n_lo), "1"(n_hi))
75
76
77#define ACPI_SHIFT_RIGHT_64(n_hi, n_lo) \
78 asm("shrl $1,%2;" \
79 "rcrl $1,%3;" \
80 :"=r"(n_hi), "=r"(n_lo) \
81 :"0"(n_hi), "1"(n_lo))
82
83#ifdef CONFIG_ACPI
84extern int acpi_lapic;
85extern int acpi_ioapic;
86extern int acpi_noirq;
87extern int acpi_strict;
88extern int acpi_disabled;
89extern int acpi_pci_disabled;
90extern int acpi_ht;
91static inline void disable_acpi(void)
92{
93 acpi_disabled = 1;
94 acpi_ht = 0;
95 acpi_pci_disabled = 1;
96 acpi_noirq = 1;
97}
98
99/* Fixmap pages to reserve for ACPI boot-time tables (see fixmap.h) */
100#define FIX_ACPI_PAGES 4
101
102extern int acpi_gsi_to_irq(u32 gsi, unsigned int *irq);
103static inline void acpi_noirq_set(void) { acpi_noirq = 1; }
104static inline void acpi_disable_pci(void)
105{
106 acpi_pci_disabled = 1;
107 acpi_noirq_set();
108}
109extern int acpi_irq_balance_set(char *str);
110
111/* routines for saving/restoring kernel state */
112extern int acpi_save_state_mem(void);
113extern void acpi_restore_state_mem(void);
114
115extern unsigned long acpi_wakeup_address;
116
117/* early initialization routine */
118extern void acpi_reserve_bootmem(void);
119
120#else /* !CONFIG_ACPI */
121
122#define acpi_lapic 0
123#define acpi_ioapic 0
124static inline void acpi_noirq_set(void) { }
125static inline void acpi_disable_pci(void) { }
126
127#endif /* !CONFIG_ACPI */
128
129extern int acpi_numa;
130extern int acpi_scan_nodes(unsigned long start, unsigned long end);
131#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
132
133extern int acpi_disabled;
134extern int acpi_pci_disabled;
135
136#define ARCH_HAS_POWER_INIT 1
137
138extern int acpi_skip_timer_override;
139extern int acpi_use_timer_override;
140
141#ifdef CONFIG_ACPI_NUMA
142extern void __init acpi_fake_nodes(const struct bootnode *fake_nodes,
143 int num_nodes);
144#else
145static inline void acpi_fake_nodes(const struct bootnode *fake_nodes,
146 int num_nodes)
147{
148}
149#endif
150
151#endif /*__KERNEL__*/
152
153#endif /*_ASM_ACPI_H*/
diff --git a/include/asm-x86/agp.h b/include/asm-x86/agp.h
index 62df2a9e7130..e4004a9f6a9a 100644
--- a/include/asm-x86/agp.h
+++ b/include/asm-x86/agp.h
@@ -12,13 +12,8 @@
12 * page. This avoids data corruption on some CPUs. 12 * page. This avoids data corruption on some CPUs.
13 */ 13 */
14 14
15/* 15#define map_page_into_agp(page) set_pages_uc(page, 1)
16 * Caller's responsibility to call global_flush_tlb() for performance 16#define unmap_page_from_agp(page) set_pages_wb(page, 1)
17 * reasons
18 */
19#define map_page_into_agp(page) change_page_attr(page, 1, PAGE_KERNEL_NOCACHE)
20#define unmap_page_from_agp(page) change_page_attr(page, 1, PAGE_KERNEL)
21#define flush_agp_mappings() global_flush_tlb()
22 17
23/* 18/*
24 * Could use CLFLUSH here if the cpu supports it. But then it would 19 * Could use CLFLUSH here if the cpu supports it. But then it would
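
For the agp.h hunk above: map_page_into_agp()/unmap_page_from_agp() now map to set_pages_uc()/set_pages_wb(), which appear to perform their own cache/TLB maintenance, so the separate flush_agp_mappings()/global_flush_tlb() step goes away. A hedged caller sketch, with the example_* names invented for illustration:

	/* Illustrative sketch only; assumes set_pages_uc()/set_pages_wb()
	 * flush as needed, which is why no flush_agp_mappings() remains. */
	struct page;

	static void example_agp_bind(struct page *page)
	{
		map_page_into_agp(page);	/* page becomes uncached */
		/* ... program the GART/GTT entry for this page ... */
	}

	static void example_agp_unbind(struct page *page)
	{
		/* ... clear the GART/GTT entry ... */
		unmap_page_from_agp(page);	/* back to write-back caching */
	}
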
diff --git a/include/asm-x86/alternative.h b/include/asm-x86/alternative.h
index 9eef6a32a130..d8bacf3c4b08 100644
--- a/include/asm-x86/alternative.h
+++ b/include/asm-x86/alternative.h
@@ -1,5 +1,161 @@
1#ifdef CONFIG_X86_32 1#ifndef _ASM_X86_ALTERNATIVE_H
2# include "alternative_32.h" 2#define _ASM_X86_ALTERNATIVE_H
3
4#include <linux/types.h>
5#include <linux/stddef.h>
6#include <asm/asm.h>
7
8/*
9 * Alternative inline assembly for SMP.
10 *
11 * The LOCK_PREFIX macro defined here replaces the LOCK and
12 * LOCK_PREFIX macros used everywhere in the source tree.
13 *
14 * SMP alternatives use the same data structures as the other
15 * alternatives and the X86_FEATURE_UP flag to indicate the case of a
16 * UP system running an SMP kernel. The existing apply_alternatives()
17 * works fine for patching an SMP kernel for UP.
18 *
19 * The SMP alternative tables can be kept after boot and contain both
20 * UP and SMP versions of the instructions to allow switching back to
21 * SMP at runtime, when hotplugging in a new CPU, which is especially
22 * useful in virtualized environments.
23 *
24 * The very common lock prefix is handled as a special case in a
25 * separate table, which is a pure address list without replacement ptr
26 * and size information. That keeps the table sizes small.
27 */
28
29#ifdef CONFIG_SMP
30#define LOCK_PREFIX \
31 ".section .smp_locks,\"a\"\n" \
32 _ASM_ALIGN "\n" \
33 _ASM_PTR "661f\n" /* address */ \
34 ".previous\n" \
35 "661:\n\tlock; "
36
37#else /* ! CONFIG_SMP */
38#define LOCK_PREFIX ""
39#endif
40
41/* This must be included *after* the definition of LOCK_PREFIX */
42#include <asm/cpufeature.h>
43
44struct alt_instr {
45 u8 *instr; /* original instruction */
46 u8 *replacement;
47 u8 cpuid; /* cpuid bit set for replacement */
48 u8 instrlen; /* length of original instruction */
49 u8 replacementlen; /* length of new instruction, <= instrlen */
50 u8 pad1;
51#ifdef CONFIG_X86_64
52 u32 pad2;
53#endif
54};
55
56extern void alternative_instructions(void);
57extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
58
59struct module;
60
61#ifdef CONFIG_SMP
62extern void alternatives_smp_module_add(struct module *mod, char *name,
63 void *locks, void *locks_end,
64 void *text, void *text_end);
65extern void alternatives_smp_module_del(struct module *mod);
66extern void alternatives_smp_switch(int smp);
67#else
68static inline void alternatives_smp_module_add(struct module *mod, char *name,
69 void *locks, void *locks_end,
70 void *text, void *text_end) {}
71static inline void alternatives_smp_module_del(struct module *mod) {}
72static inline void alternatives_smp_switch(int smp) {}
73#endif /* CONFIG_SMP */
74
75/*
76 * Alternative instructions for different CPU types or capabilities.
77 *
78 * This allows the use of optimized instructions even on generic binary
79 * kernels.
80 *
81 * The length of oldinstr must be greater than or equal to the length
82 * of newinstr; it can be padded with nops as needed.
83 *
84 * For non-barrier-like inlines, please define new variants
85 * without volatile and memory clobber.
86 */
87#define alternative(oldinstr, newinstr, feature) \
88 asm volatile ("661:\n\t" oldinstr "\n662:\n" \
89 ".section .altinstructions,\"a\"\n" \
90 _ASM_ALIGN "\n" \
91 _ASM_PTR "661b\n" /* label */ \
92 _ASM_PTR "663f\n" /* new instruction */ \
93 " .byte %c0\n" /* feature bit */ \
94 " .byte 662b-661b\n" /* sourcelen */ \
95 " .byte 664f-663f\n" /* replacementlen */ \
96 ".previous\n" \
97 ".section .altinstr_replacement,\"ax\"\n" \
98 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
99 ".previous" :: "i" (feature) : "memory")
100
101/*
102 * Alternative inline assembly with input.
103 *
104 * Peculiarities:
105 * No memory clobber here.
106 * Argument numbers start with 1.
107 * Best to use fixed-size constraints (like (%1) ... "r").
108 * If you use variable-sized constraints like "m" or "g" in the
109 * replacement, make sure to pad to the worst-case length.
110 */
111#define alternative_input(oldinstr, newinstr, feature, input...) \
112 asm volatile ("661:\n\t" oldinstr "\n662:\n" \
113 ".section .altinstructions,\"a\"\n" \
114 _ASM_ALIGN "\n" \
115 _ASM_PTR "661b\n" /* label */ \
116 _ASM_PTR "663f\n" /* new instruction */ \
117 " .byte %c0\n" /* feature bit */ \
118 " .byte 662b-661b\n" /* sourcelen */ \
119 " .byte 664f-663f\n" /* replacementlen */ \
120 ".previous\n" \
121 ".section .altinstr_replacement,\"ax\"\n" \
122 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
123 ".previous" :: "i" (feature), ##input)
124
125/* Like alternative_input, but with a single output argument */
126#define alternative_io(oldinstr, newinstr, feature, output, input...) \
127 asm volatile ("661:\n\t" oldinstr "\n662:\n" \
128 ".section .altinstructions,\"a\"\n" \
129 _ASM_ALIGN "\n" \
130 _ASM_PTR "661b\n" /* label */ \
131 _ASM_PTR "663f\n" /* new instruction */ \
132 " .byte %c[feat]\n" /* feature bit */ \
133 " .byte 662b-661b\n" /* sourcelen */ \
134 " .byte 664f-663f\n" /* replacementlen */ \
135 ".previous\n" \
136 ".section .altinstr_replacement,\"ax\"\n" \
137 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
138 ".previous" : output : [feat] "i" (feature), ##input)
139
140/*
141 * Use this macro if you need more than one output parameter
142 * in alternative_io.
143 */
144#define ASM_OUTPUT2(a, b) a, b
145
146struct paravirt_patch_site;
147#ifdef CONFIG_PARAVIRT
148void apply_paravirt(struct paravirt_patch_site *start,
149 struct paravirt_patch_site *end);
3#else 150#else
4# include "alternative_64.h" 151static inline void
152apply_paravirt(struct paravirt_patch_site *start,
153 struct paravirt_patch_site *end)
154{}
155#define __parainstructions NULL
156#define __parainstructions_end NULL
5#endif 157#endif
158
159extern void text_poke(void *addr, unsigned char *opcode, int len);
160
161#endif /* _ASM_X86_ALTERNATIVE_H */
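
Typical uses of the interfaces consolidated above, written out as a hedged sketch (the example_* names are invented; ASM_NOP4 is assumed to be available as a 4-byte nop from <asm/nops.h>, and the old instruction must be at least as long as its replacement, per the comments in the header):

	/* Illustrative sketch only -- how callers normally use this header. */
	static inline void example_atomic_inc(int *v)
	{
		/* LOCK_PREFIX emits "lock;" on SMP builds and records the
		 * address in .smp_locks so it can be patched out on UP. */
		asm volatile(LOCK_PREFIX "incl %0" : "+m" (*v));
	}

	static inline void example_prefetch(const void *x)
	{
		/* Replace a 4-byte nop with prefetchnta when the CPU has SSE;
		 * argument numbering starts at %1 because %0 is the feature. */
		alternative_input(ASM_NOP4, "prefetchnta (%1)", X86_FEATURE_XMM,
				  "r" (x));
	}
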
diff --git a/include/asm-x86/alternative_32.h b/include/asm-x86/alternative_32.h
deleted file mode 100644
index bda6c810c0f4..000000000000
--- a/include/asm-x86/alternative_32.h
+++ /dev/null
@@ -1,154 +0,0 @@
1#ifndef _I386_ALTERNATIVE_H
2#define _I386_ALTERNATIVE_H
3
4#include <asm/types.h>
5#include <linux/stddef.h>
6#include <linux/types.h>
7
8struct alt_instr {
9 u8 *instr; /* original instruction */
10 u8 *replacement;
11 u8 cpuid; /* cpuid bit set for replacement */
12 u8 instrlen; /* length of original instruction */
13 u8 replacementlen; /* length of new instruction, <= instrlen */
14 u8 pad;
15};
16
17extern void alternative_instructions(void);
18extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
19
20struct module;
21#ifdef CONFIG_SMP
22extern void alternatives_smp_module_add(struct module *mod, char *name,
23 void *locks, void *locks_end,
24 void *text, void *text_end);
25extern void alternatives_smp_module_del(struct module *mod);
26extern void alternatives_smp_switch(int smp);
27#else
28static inline void alternatives_smp_module_add(struct module *mod, char *name,
29 void *locks, void *locks_end,
30 void *text, void *text_end) {}
31static inline void alternatives_smp_module_del(struct module *mod) {}
32static inline void alternatives_smp_switch(int smp) {}
33#endif /* CONFIG_SMP */
34
35/*
36 * Alternative instructions for different CPU types or capabilities.
37 *
38 * This allows to use optimized instructions even on generic binary
39 * kernels.
40 *
41 * length of oldinstr must be longer or equal the length of newinstr
42 * It can be padded with nops as needed.
43 *
44 * For non barrier like inlines please define new variants
45 * without volatile and memory clobber.
46 */
47#define alternative(oldinstr, newinstr, feature) \
48 asm volatile ("661:\n\t" oldinstr "\n662:\n" \
49 ".section .altinstructions,\"a\"\n" \
50 " .align 4\n" \
51 " .long 661b\n" /* label */ \
52 " .long 663f\n" /* new instruction */ \
53 " .byte %c0\n" /* feature bit */ \
54 " .byte 662b-661b\n" /* sourcelen */ \
55 " .byte 664f-663f\n" /* replacementlen */ \
56 ".previous\n" \
57 ".section .altinstr_replacement,\"ax\"\n" \
58 "663:\n\t" newinstr "\n664:\n" /* replacement */\
59 ".previous" :: "i" (feature) : "memory")
60
61/*
62 * Alternative inline assembly with input.
63 *
64 * Pecularities:
65 * No memory clobber here.
66 * Argument numbers start with 1.
67 * Best is to use constraints that are fixed size (like (%1) ... "r")
68 * If you use variable sized constraints like "m" or "g" in the
69 * replacement make sure to pad to the worst case length.
70 */
71#define alternative_input(oldinstr, newinstr, feature, input...) \
72 asm volatile ("661:\n\t" oldinstr "\n662:\n" \
73 ".section .altinstructions,\"a\"\n" \
74 " .align 4\n" \
75 " .long 661b\n" /* label */ \
76 " .long 663f\n" /* new instruction */ \
77 " .byte %c0\n" /* feature bit */ \
78 " .byte 662b-661b\n" /* sourcelen */ \
79 " .byte 664f-663f\n" /* replacementlen */ \
80 ".previous\n" \
81 ".section .altinstr_replacement,\"ax\"\n" \
82 "663:\n\t" newinstr "\n664:\n" /* replacement */\
83 ".previous" :: "i" (feature), ##input)
84
85/* Like alternative_input, but with a single output argument */
86#define alternative_io(oldinstr, newinstr, feature, output, input...) \
87 asm volatile ("661:\n\t" oldinstr "\n662:\n" \
88 ".section .altinstructions,\"a\"\n" \
89 " .align 4\n" \
90 " .long 661b\n" /* label */ \
91 " .long 663f\n" /* new instruction */ \
92 " .byte %c[feat]\n" /* feature bit */ \
93 " .byte 662b-661b\n" /* sourcelen */ \
94 " .byte 664f-663f\n" /* replacementlen */ \
95 ".previous\n" \
96 ".section .altinstr_replacement,\"ax\"\n" \
97 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
98 ".previous" : output : [feat] "i" (feature), ##input)
99
100/*
101 * use this macro(s) if you need more than one output parameter
102 * in alternative_io
103 */
104#define ASM_OUTPUT2(a, b) a, b
105
106/*
107 * Alternative inline assembly for SMP.
108 *
109 * The LOCK_PREFIX macro defined here replaces the LOCK and
110 * LOCK_PREFIX macros used everywhere in the source tree.
111 *
112 * SMP alternatives use the same data structures as the other
113 * alternatives and the X86_FEATURE_UP flag to indicate the case of a
114 * UP system running a SMP kernel. The existing apply_alternatives()
115 * works fine for patching a SMP kernel for UP.
116 *
117 * The SMP alternative tables can be kept after boot and contain both
118 * UP and SMP versions of the instructions to allow switching back to
119 * SMP at runtime, when hotplugging in a new CPU, which is especially
120 * useful in virtualized environments.
121 *
122 * The very common lock prefix is handled as special case in a
123 * separate table which is a pure address list without replacement ptr
124 * and size information. That keeps the table sizes small.
125 */
126
127#ifdef CONFIG_SMP
128#define LOCK_PREFIX \
129 ".section .smp_locks,\"a\"\n" \
130 " .align 4\n" \
131 " .long 661f\n" /* address */ \
132 ".previous\n" \
133 "661:\n\tlock; "
134
135#else /* ! CONFIG_SMP */
136#define LOCK_PREFIX ""
137#endif
138
139struct paravirt_patch_site;
140#ifdef CONFIG_PARAVIRT
141void apply_paravirt(struct paravirt_patch_site *start,
142 struct paravirt_patch_site *end);
143#else
144static inline void
145apply_paravirt(struct paravirt_patch_site *start,
146 struct paravirt_patch_site *end)
147{}
148#define __parainstructions NULL
149#define __parainstructions_end NULL
150#endif
151
152extern void text_poke(void *addr, unsigned char *opcode, int len);
153
154#endif /* _I386_ALTERNATIVE_H */
diff --git a/include/asm-x86/alternative_64.h b/include/asm-x86/alternative_64.h
deleted file mode 100644
index ab161e810151..000000000000
--- a/include/asm-x86/alternative_64.h
+++ /dev/null
@@ -1,159 +0,0 @@
1#ifndef _X86_64_ALTERNATIVE_H
2#define _X86_64_ALTERNATIVE_H
3
4#ifdef __KERNEL__
5
6#include <linux/types.h>
7#include <linux/stddef.h>
8
9/*
10 * Alternative inline assembly for SMP.
11 *
12 * The LOCK_PREFIX macro defined here replaces the LOCK and
13 * LOCK_PREFIX macros used everywhere in the source tree.
14 *
15 * SMP alternatives use the same data structures as the other
16 * alternatives and the X86_FEATURE_UP flag to indicate the case of a
17 * UP system running a SMP kernel. The existing apply_alternatives()
18 * works fine for patching a SMP kernel for UP.
19 *
20 * The SMP alternative tables can be kept after boot and contain both
21 * UP and SMP versions of the instructions to allow switching back to
22 * SMP at runtime, when hotplugging in a new CPU, which is especially
23 * useful in virtualized environments.
24 *
25 * The very common lock prefix is handled as special case in a
26 * separate table which is a pure address list without replacement ptr
27 * and size information. That keeps the table sizes small.
28 */
29
30#ifdef CONFIG_SMP
31#define LOCK_PREFIX \
32 ".section .smp_locks,\"a\"\n" \
33 " .align 8\n" \
34 " .quad 661f\n" /* address */ \
35 ".previous\n" \
36 "661:\n\tlock; "
37
38#else /* ! CONFIG_SMP */
39#define LOCK_PREFIX ""
40#endif
41
42/* This must be included *after* the definition of LOCK_PREFIX */
43#include <asm/cpufeature.h>
44
45struct alt_instr {
46 u8 *instr; /* original instruction */
47 u8 *replacement;
48 u8 cpuid; /* cpuid bit set for replacement */
49 u8 instrlen; /* length of original instruction */
50 u8 replacementlen; /* length of new instruction, <= instrlen */
51 u8 pad[5];
52};
53
54extern void alternative_instructions(void);
55extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
56
57struct module;
58
59#ifdef CONFIG_SMP
60extern void alternatives_smp_module_add(struct module *mod, char *name,
61 void *locks, void *locks_end,
62 void *text, void *text_end);
63extern void alternatives_smp_module_del(struct module *mod);
64extern void alternatives_smp_switch(int smp);
65#else
66static inline void alternatives_smp_module_add(struct module *mod, char *name,
67 void *locks, void *locks_end,
68 void *text, void *text_end) {}
69static inline void alternatives_smp_module_del(struct module *mod) {}
70static inline void alternatives_smp_switch(int smp) {}
71#endif
72
73#endif
74
75/*
76 * Alternative instructions for different CPU types or capabilities.
77 *
78 * This allows to use optimized instructions even on generic binary
79 * kernels.
80 *
81 * length of oldinstr must be longer or equal the length of newinstr
82 * It can be padded with nops as needed.
83 *
84 * For non barrier like inlines please define new variants
85 * without volatile and memory clobber.
86 */
87#define alternative(oldinstr, newinstr, feature) \
88 asm volatile ("661:\n\t" oldinstr "\n662:\n" \
89 ".section .altinstructions,\"a\"\n" \
90 " .align 8\n" \
91 " .quad 661b\n" /* label */ \
92 " .quad 663f\n" /* new instruction */ \
93 " .byte %c0\n" /* feature bit */ \
94 " .byte 662b-661b\n" /* sourcelen */ \
95 " .byte 664f-663f\n" /* replacementlen */ \
96 ".previous\n" \
97 ".section .altinstr_replacement,\"ax\"\n" \
98 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
99 ".previous" :: "i" (feature) : "memory")
100
101/*
102 * Alternative inline assembly with input.
103 *
104 * Pecularities:
105 * No memory clobber here.
106 * Argument numbers start with 1.
107 * Best is to use constraints that are fixed size (like (%1) ... "r")
108 * If you use variable sized constraints like "m" or "g" in the
109 * replacement make sure to pad to the worst case length.
110 */
111#define alternative_input(oldinstr, newinstr, feature, input...) \
112 asm volatile ("661:\n\t" oldinstr "\n662:\n" \
113 ".section .altinstructions,\"a\"\n" \
114 " .align 8\n" \
115 " .quad 661b\n" /* label */ \
116 " .quad 663f\n" /* new instruction */ \
117 " .byte %c0\n" /* feature bit */ \
118 " .byte 662b-661b\n" /* sourcelen */ \
119 " .byte 664f-663f\n" /* replacementlen */ \
120 ".previous\n" \
121 ".section .altinstr_replacement,\"ax\"\n" \
122 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
123 ".previous" :: "i" (feature), ##input)
124
125/* Like alternative_input, but with a single output argument */
126#define alternative_io(oldinstr, newinstr, feature, output, input...) \
127 asm volatile ("661:\n\t" oldinstr "\n662:\n" \
128 ".section .altinstructions,\"a\"\n" \
129 " .align 8\n" \
130 " .quad 661b\n" /* label */ \
131 " .quad 663f\n" /* new instruction */ \
132 " .byte %c[feat]\n" /* feature bit */ \
133 " .byte 662b-661b\n" /* sourcelen */ \
134 " .byte 664f-663f\n" /* replacementlen */ \
135 ".previous\n" \
136 ".section .altinstr_replacement,\"ax\"\n" \
137 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
138 ".previous" : output : [feat] "i" (feature), ##input)
139
140/*
141 * use this macro(s) if you need more than one output parameter
142 * in alternative_io
143 */
144#define ASM_OUTPUT2(a, b) a, b
145
146struct paravirt_patch;
147#ifdef CONFIG_PARAVIRT
148void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end);
149#else
150static inline void
151apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end)
152{}
153#define __parainstructions NULL
154#define __parainstructions_end NULL
155#endif
156
157extern void text_poke(void *addr, unsigned char *opcode, int len);
158
159#endif /* _X86_64_ALTERNATIVE_H */
diff --git a/include/asm-x86/apic.h b/include/asm-x86/apic.h
index 9fbcc0bd2ac4..bcfc07fd3661 100644
--- a/include/asm-x86/apic.h
+++ b/include/asm-x86/apic.h
@@ -1,5 +1,140 @@
1#ifdef CONFIG_X86_32 1#ifndef _ASM_X86_APIC_H
2# include "apic_32.h" 2#define _ASM_X86_APIC_H
3
4#include <linux/pm.h>
5#include <linux/delay.h>
6#include <asm/fixmap.h>
7#include <asm/apicdef.h>
8#include <asm/processor.h>
9#include <asm/system.h>
10
11#define ARCH_APICTIMER_STOPS_ON_C3 1
12
13#define Dprintk(x...)
14
15/*
16 * Debugging macros
17 */
18#define APIC_QUIET 0
19#define APIC_VERBOSE 1
20#define APIC_DEBUG 2
21
22/*
23 * Define the default level of output to be very little
24 * This can be turned up by using apic=verbose for more
25 * information and apic=debug for _lots_ of information.
26 * apic_verbosity is defined in apic.c
27 */
28#define apic_printk(v, s, a...) do { \
29 if ((v) <= apic_verbosity) \
30 printk(s, ##a); \
31 } while (0)
32
33
34extern void generic_apic_probe(void);
35
36#ifdef CONFIG_X86_LOCAL_APIC
37
38extern int apic_verbosity;
39extern int timer_over_8254;
40extern int local_apic_timer_c2_ok;
41extern int local_apic_timer_disabled;
42
43extern int apic_runs_main_timer;
44extern int ioapic_force;
45extern int disable_apic;
46extern int disable_apic_timer;
47extern unsigned boot_cpu_id;
48
49/*
50 * Basic functions accessing APICs.
51 */
52#ifdef CONFIG_PARAVIRT
53#include <asm/paravirt.h>
3#else 54#else
4# include "apic_64.h" 55#define apic_write native_apic_write
56#define apic_write_atomic native_apic_write_atomic
57#define apic_read native_apic_read
58#define setup_boot_clock setup_boot_APIC_clock
59#define setup_secondary_clock setup_secondary_APIC_clock
5#endif 60#endif
61
62static inline void native_apic_write(unsigned long reg, u32 v)
63{
64 *((volatile u32 *)(APIC_BASE + reg)) = v;
65}
66
67static inline void native_apic_write_atomic(unsigned long reg, u32 v)
68{
69 (void) xchg((u32*)(APIC_BASE + reg), v);
70}
71
72static inline u32 native_apic_read(unsigned long reg)
73{
74 return *((volatile u32 *)(APIC_BASE + reg));
75}
76
77extern void apic_wait_icr_idle(void);
78extern u32 safe_apic_wait_icr_idle(void);
79extern int get_physical_broadcast(void);
80
81#ifdef CONFIG_X86_GOOD_APIC
82# define FORCE_READ_AROUND_WRITE 0
83# define apic_read_around(x)
84# define apic_write_around(x, y) apic_write((x), (y))
85#else
86# define FORCE_READ_AROUND_WRITE 1
87# define apic_read_around(x) apic_read(x)
88# define apic_write_around(x, y) apic_write_atomic((x), (y))
89#endif
90
91static inline void ack_APIC_irq(void)
92{
93 /*
94 * ack_APIC_irq() actually gets compiled as a single instruction:
95 * - a single rmw on Pentium/82489DX
96 * - a single write on P6+ cores (CONFIG_X86_GOOD_APIC)
97 * ... yummie.
98 */
99
100 /* Docs say use 0 for future compatibility */
101 apic_write_around(APIC_EOI, 0);
102}
103
104extern int lapic_get_maxlvt(void);
105extern void clear_local_APIC(void);
106extern void connect_bsp_APIC(void);
107extern void disconnect_bsp_APIC(int virt_wire_setup);
108extern void disable_local_APIC(void);
109extern void lapic_shutdown(void);
110extern int verify_local_APIC(void);
111extern void cache_APIC_registers(void);
112extern void sync_Arb_IDs(void);
113extern void init_bsp_APIC(void);
114extern void setup_local_APIC(void);
115extern void end_local_APIC_setup(void);
116extern void init_apic_mappings(void);
117extern void setup_boot_APIC_clock(void);
118extern void setup_secondary_APIC_clock(void);
119extern int APIC_init_uniprocessor(void);
120extern void enable_NMI_through_LVT0(void);
121
122/*
123 * On 32bit this is mach-xxx local
124 */
125#ifdef CONFIG_X86_64
126extern void setup_apic_routing(void);
127#endif
128
129extern u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask);
130extern u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask);
131
132extern int apic_is_clustered_box(void);
133
134#else /* !CONFIG_X86_LOCAL_APIC */
135static inline void lapic_shutdown(void) { }
136#define local_apic_timer_c2_ok 1
137
138#endif /* !CONFIG_X86_LOCAL_APIC */
139
140#endif /* _ASM_X86_APIC_H */
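
A short usage sketch for the accessors declared above (illustrative only; example_lapic_version is an invented name, and APIC_LVR/GET_APIC_VERSION/GET_APIC_MAXLVT come from <asm/apicdef.h>, which this header includes):

	/* Illustrative sketch only. Reads the local APIC version register
	 * through apic_read() (or the paravirt override) and logs it at
	 * "apic=verbose" level via apic_printk(). */
	static inline unsigned int example_lapic_version(void)
	{
		unsigned int lvr = apic_read(APIC_LVR);

		apic_printk(APIC_VERBOSE, "local APIC version: 0x%x, max LVT: %u\n",
			    GET_APIC_VERSION(lvr), GET_APIC_MAXLVT(lvr));
		return GET_APIC_VERSION(lvr);
	}
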
diff --git a/include/asm-x86/apic_32.h b/include/asm-x86/apic_32.h
deleted file mode 100644
index be158b27d54b..000000000000
--- a/include/asm-x86/apic_32.h
+++ /dev/null
@@ -1,127 +0,0 @@
1#ifndef __ASM_APIC_H
2#define __ASM_APIC_H
3
4#include <linux/pm.h>
5#include <linux/delay.h>
6#include <asm/fixmap.h>
7#include <asm/apicdef.h>
8#include <asm/processor.h>
9#include <asm/system.h>
10
11#define Dprintk(x...)
12
13/*
14 * Debugging macros
15 */
16#define APIC_QUIET 0
17#define APIC_VERBOSE 1
18#define APIC_DEBUG 2
19
20extern int apic_verbosity;
21
22/*
23 * Define the default level of output to be very little
24 * This can be turned up by using apic=verbose for more
25 * information and apic=debug for _lots_ of information.
26 * apic_verbosity is defined in apic.c
27 */
28#define apic_printk(v, s, a...) do { \
29 if ((v) <= apic_verbosity) \
30 printk(s, ##a); \
31 } while (0)
32
33
34extern void generic_apic_probe(void);
35
36#ifdef CONFIG_X86_LOCAL_APIC
37
38/*
39 * Basic functions accessing APICs.
40 */
41#ifdef CONFIG_PARAVIRT
42#include <asm/paravirt.h>
43#else
44#define apic_write native_apic_write
45#define apic_write_atomic native_apic_write_atomic
46#define apic_read native_apic_read
47#define setup_boot_clock setup_boot_APIC_clock
48#define setup_secondary_clock setup_secondary_APIC_clock
49#endif
50
51static __inline fastcall void native_apic_write(unsigned long reg,
52 unsigned long v)
53{
54 *((volatile unsigned long *)(APIC_BASE+reg)) = v;
55}
56
57static __inline fastcall void native_apic_write_atomic(unsigned long reg,
58 unsigned long v)
59{
60 xchg((volatile unsigned long *)(APIC_BASE+reg), v);
61}
62
63static __inline fastcall unsigned long native_apic_read(unsigned long reg)
64{
65 return *((volatile unsigned long *)(APIC_BASE+reg));
66}
67
68void apic_wait_icr_idle(void);
69unsigned long safe_apic_wait_icr_idle(void);
70int get_physical_broadcast(void);
71
72#ifdef CONFIG_X86_GOOD_APIC
73# define FORCE_READ_AROUND_WRITE 0
74# define apic_read_around(x)
75# define apic_write_around(x,y) apic_write((x),(y))
76#else
77# define FORCE_READ_AROUND_WRITE 1
78# define apic_read_around(x) apic_read(x)
79# define apic_write_around(x,y) apic_write_atomic((x),(y))
80#endif
81
82static inline void ack_APIC_irq(void)
83{
84 /*
85 * ack_APIC_irq() actually gets compiled as a single instruction:
86 * - a single rmw on Pentium/82489DX
87 * - a single write on P6+ cores (CONFIG_X86_GOOD_APIC)
88 * ... yummie.
89 */
90
91 /* Docs say use 0 for future compatibility */
92 apic_write_around(APIC_EOI, 0);
93}
94
95extern int lapic_get_maxlvt(void);
96extern void clear_local_APIC(void);
97extern void connect_bsp_APIC (void);
98extern void disconnect_bsp_APIC (int virt_wire_setup);
99extern void disable_local_APIC (void);
100extern void lapic_shutdown (void);
101extern int verify_local_APIC (void);
102extern void cache_APIC_registers (void);
103extern void sync_Arb_IDs (void);
104extern void init_bsp_APIC (void);
105extern void setup_local_APIC (void);
106extern void init_apic_mappings (void);
107extern void smp_local_timer_interrupt (void);
108extern void setup_boot_APIC_clock (void);
109extern void setup_secondary_APIC_clock (void);
110extern int APIC_init_uniprocessor (void);
111
112extern void enable_NMI_through_LVT0 (void * dummy);
113
114#define ARCH_APICTIMER_STOPS_ON_C3 1
115
116extern int timer_over_8254;
117extern int local_apic_timer_c2_ok;
118
119extern int local_apic_timer_disabled;
120
121#else /* !CONFIG_X86_LOCAL_APIC */
122static inline void lapic_shutdown(void) { }
123#define local_apic_timer_c2_ok 1
124
125#endif /* !CONFIG_X86_LOCAL_APIC */
126
127#endif /* __ASM_APIC_H */
diff --git a/include/asm-x86/apic_64.h b/include/asm-x86/apic_64.h
deleted file mode 100644
index 2747a11a2b19..000000000000
--- a/include/asm-x86/apic_64.h
+++ /dev/null
@@ -1,102 +0,0 @@
1#ifndef __ASM_APIC_H
2#define __ASM_APIC_H
3
4#include <linux/pm.h>
5#include <linux/delay.h>
6#include <asm/fixmap.h>
7#include <asm/apicdef.h>
8#include <asm/system.h>
9
10#define Dprintk(x...)
11
12/*
13 * Debugging macros
14 */
15#define APIC_QUIET 0
16#define APIC_VERBOSE 1
17#define APIC_DEBUG 2
18
19extern int apic_verbosity;
20extern int apic_runs_main_timer;
21extern int ioapic_force;
22extern int disable_apic_timer;
23
24/*
25 * Define the default level of output to be very little
26 * This can be turned up by using apic=verbose for more
27 * information and apic=debug for _lots_ of information.
28 * apic_verbosity is defined in apic.c
29 */
30#define apic_printk(v, s, a...) do { \
31 if ((v) <= apic_verbosity) \
32 printk(s, ##a); \
33 } while (0)
34
35struct pt_regs;
36
37/*
38 * Basic functions accessing APICs.
39 */
40
41static __inline void apic_write(unsigned long reg, unsigned int v)
42{
43 *((volatile unsigned int *)(APIC_BASE+reg)) = v;
44}
45
46static __inline unsigned int apic_read(unsigned long reg)
47{
48 return *((volatile unsigned int *)(APIC_BASE+reg));
49}
50
51extern void apic_wait_icr_idle(void);
52extern unsigned int safe_apic_wait_icr_idle(void);
53
54static inline void ack_APIC_irq(void)
55{
56 /*
57 * ack_APIC_irq() actually gets compiled as a single instruction:
58 * - a single rmw on Pentium/82489DX
59 * - a single write on P6+ cores (CONFIG_X86_GOOD_APIC)
60 * ... yummie.
61 */
62
63 /* Docs say use 0 for future compatibility */
64 apic_write(APIC_EOI, 0);
65}
66
67extern int get_maxlvt (void);
68extern void clear_local_APIC (void);
69extern void connect_bsp_APIC (void);
70extern void disconnect_bsp_APIC (int virt_wire_setup);
71extern void disable_local_APIC (void);
72extern void lapic_shutdown (void);
73extern int verify_local_APIC (void);
74extern void cache_APIC_registers (void);
75extern void sync_Arb_IDs (void);
76extern void init_bsp_APIC (void);
77extern void setup_local_APIC (void);
78extern void init_apic_mappings (void);
79extern void smp_local_timer_interrupt (void);
80extern void setup_boot_APIC_clock (void);
81extern void setup_secondary_APIC_clock (void);
82extern int APIC_init_uniprocessor (void);
83extern void setup_apic_routing(void);
84
85extern void setup_APIC_extended_lvt(unsigned char lvt_off, unsigned char vector,
86 unsigned char msg_type, unsigned char mask);
87
88extern int apic_is_clustered_box(void);
89
90#define K8_APIC_EXT_LVT_BASE 0x500
91#define K8_APIC_EXT_INT_MSG_FIX 0x0
92#define K8_APIC_EXT_INT_MSG_SMI 0x2
93#define K8_APIC_EXT_INT_MSG_NMI 0x4
94#define K8_APIC_EXT_INT_MSG_EXT 0x7
95#define K8_APIC_EXT_LVT_ENTRY_THRESHOLD 0
96
97#define ARCH_APICTIMER_STOPS_ON_C3 1
98
99extern unsigned boot_cpu_id;
100extern int local_apic_timer_c2_ok;
101
102#endif /* __ASM_APIC_H */
diff --git a/include/asm-x86/apicdef.h b/include/asm-x86/apicdef.h
index 4542c220bf4d..550af7a6f88e 100644
--- a/include/asm-x86/apicdef.h
+++ b/include/asm-x86/apicdef.h
@@ -1,5 +1,413 @@
1#ifndef _ASM_X86_APICDEF_H
2#define _ASM_X86_APICDEF_H
3
4/*
5 * Constants for various Intel APICs. (local APIC, IOAPIC, etc.)
6 *
7 * Alan Cox <Alan.Cox@linux.org>, 1995.
8 * Ingo Molnar <mingo@redhat.com>, 1999, 2000
9 */
10
11#define APIC_DEFAULT_PHYS_BASE 0xfee00000
12
13#define APIC_ID 0x20
14
15#ifdef CONFIG_X86_64
16# define APIC_ID_MASK (0xFFu<<24)
17# define GET_APIC_ID(x) (((x)>>24)&0xFFu)
18# define SET_APIC_ID(x) (((x)<<24))
19#endif
20
21#define APIC_LVR 0x30
22#define APIC_LVR_MASK 0xFF00FF
23#define GET_APIC_VERSION(x) ((x)&0xFFu)
24#define GET_APIC_MAXLVT(x) (((x)>>16)&0xFFu)
25#define APIC_INTEGRATED(x) ((x)&0xF0u)
26#define APIC_XAPIC(x) ((x) >= 0x14)
27#define APIC_TASKPRI 0x80
28#define APIC_TPRI_MASK 0xFFu
29#define APIC_ARBPRI 0x90
30#define APIC_ARBPRI_MASK 0xFFu
31#define APIC_PROCPRI 0xA0
32#define APIC_EOI 0xB0
33#define APIC_EIO_ACK 0x0
34#define APIC_RRR 0xC0
35#define APIC_LDR 0xD0
36#define APIC_LDR_MASK (0xFFu<<24)
37#define GET_APIC_LOGICAL_ID(x) (((x)>>24)&0xFFu)
38#define SET_APIC_LOGICAL_ID(x) (((x)<<24))
39#define APIC_ALL_CPUS 0xFFu
40#define APIC_DFR 0xE0
41#define APIC_DFR_CLUSTER 0x0FFFFFFFul
42#define APIC_DFR_FLAT 0xFFFFFFFFul
43#define APIC_SPIV 0xF0
44#define APIC_SPIV_FOCUS_DISABLED (1<<9)
45#define APIC_SPIV_APIC_ENABLED (1<<8)
46#define APIC_ISR 0x100
47#define APIC_ISR_NR 0x8 /* Number of 32 bit ISR registers. */
48#define APIC_TMR 0x180
49#define APIC_IRR 0x200
50#define APIC_ESR 0x280
51#define APIC_ESR_SEND_CS 0x00001
52#define APIC_ESR_RECV_CS 0x00002
53#define APIC_ESR_SEND_ACC 0x00004
54#define APIC_ESR_RECV_ACC 0x00008
55#define APIC_ESR_SENDILL 0x00020
56#define APIC_ESR_RECVILL 0x00040
57#define APIC_ESR_ILLREGA 0x00080
58#define APIC_ICR 0x300
59#define APIC_DEST_SELF 0x40000
60#define APIC_DEST_ALLINC 0x80000
61#define APIC_DEST_ALLBUT 0xC0000
62#define APIC_ICR_RR_MASK 0x30000
63#define APIC_ICR_RR_INVALID 0x00000
64#define APIC_ICR_RR_INPROG 0x10000
65#define APIC_ICR_RR_VALID 0x20000
66#define APIC_INT_LEVELTRIG 0x08000
67#define APIC_INT_ASSERT 0x04000
68#define APIC_ICR_BUSY 0x01000
69#define APIC_DEST_LOGICAL 0x00800
70#define APIC_DEST_PHYSICAL 0x00000
71#define APIC_DM_FIXED 0x00000
72#define APIC_DM_LOWEST 0x00100
73#define APIC_DM_SMI 0x00200
74#define APIC_DM_REMRD 0x00300
75#define APIC_DM_NMI 0x00400
76#define APIC_DM_INIT 0x00500
77#define APIC_DM_STARTUP 0x00600
78#define APIC_DM_EXTINT 0x00700
79#define APIC_VECTOR_MASK 0x000FF
80#define APIC_ICR2 0x310
81#define GET_APIC_DEST_FIELD(x) (((x)>>24)&0xFF)
82#define SET_APIC_DEST_FIELD(x) ((x)<<24)
83#define APIC_LVTT 0x320
84#define APIC_LVTTHMR 0x330
85#define APIC_LVTPC 0x340
86#define APIC_LVT0 0x350
87#define APIC_LVT_TIMER_BASE_MASK (0x3<<18)
88#define GET_APIC_TIMER_BASE(x) (((x)>>18)&0x3)
89#define SET_APIC_TIMER_BASE(x) (((x)<<18))
90#define APIC_TIMER_BASE_CLKIN 0x0
91#define APIC_TIMER_BASE_TMBASE 0x1
92#define APIC_TIMER_BASE_DIV 0x2
93#define APIC_LVT_TIMER_PERIODIC (1<<17)
94#define APIC_LVT_MASKED (1<<16)
95#define APIC_LVT_LEVEL_TRIGGER (1<<15)
96#define APIC_LVT_REMOTE_IRR (1<<14)
97#define APIC_INPUT_POLARITY (1<<13)
98#define APIC_SEND_PENDING (1<<12)
99#define APIC_MODE_MASK 0x700
100#define GET_APIC_DELIVERY_MODE(x) (((x)>>8)&0x7)
101#define SET_APIC_DELIVERY_MODE(x, y) (((x)&~0x700)|((y)<<8))
102#define APIC_MODE_FIXED 0x0
103#define APIC_MODE_NMI 0x4
104#define APIC_MODE_EXTINT 0x7
105#define APIC_LVT1 0x360
106#define APIC_LVTERR 0x370
107#define APIC_TMICT 0x380
108#define APIC_TMCCT 0x390
109#define APIC_TDCR 0x3E0
110#define APIC_TDR_DIV_TMBASE (1<<2)
111#define APIC_TDR_DIV_1 0xB
112#define APIC_TDR_DIV_2 0x0
113#define APIC_TDR_DIV_4 0x1
114#define APIC_TDR_DIV_8 0x2
115#define APIC_TDR_DIV_16 0x3
116#define APIC_TDR_DIV_32 0x8
117#define APIC_TDR_DIV_64 0x9
118#define APIC_TDR_DIV_128 0xA
119#define APIC_EILVT0 0x500
120#define APIC_EILVT_NR_AMD_K8 1 /* Number of extended interrupts */
121#define APIC_EILVT_NR_AMD_10H 4
122#define APIC_EILVT_LVTOFF(x) (((x)>>4)&0xF)
123#define APIC_EILVT_MSG_FIX 0x0
124#define APIC_EILVT_MSG_SMI 0x2
125#define APIC_EILVT_MSG_NMI 0x4
126#define APIC_EILVT_MSG_EXT 0x7
127#define APIC_EILVT_MASKED (1<<16)
128#define APIC_EILVT1 0x510
129#define APIC_EILVT2 0x520
130#define APIC_EILVT3 0x530
131
132#define APIC_BASE (fix_to_virt(FIX_APIC_BASE))
133
1#ifdef CONFIG_X86_32 134#ifdef CONFIG_X86_32
2# include "apicdef_32.h" 135# define MAX_IO_APICS 64
3#else 136#else
4# include "apicdef_64.h" 137# define MAX_IO_APICS 128
138# define MAX_LOCAL_APIC 256
139#endif
140
141/*
142 * All x86-64 systems are xAPIC compatible.
143 * In the following, "apicid" is a physical APIC ID.
144 */
145#define XAPIC_DEST_CPUS_SHIFT 4
146#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
147#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
148#define APIC_CLUSTER(apicid) ((apicid) & XAPIC_DEST_CLUSTER_MASK)
149#define APIC_CLUSTERID(apicid) (APIC_CLUSTER(apicid) >> XAPIC_DEST_CPUS_SHIFT)
150#define APIC_CPUID(apicid) ((apicid) & XAPIC_DEST_CPUS_MASK)
151#define NUM_APIC_CLUSTERS ((BAD_APICID + 1) >> XAPIC_DEST_CPUS_SHIFT)
152
153/*
154 * the local APIC register structure, memory mapped. Not terribly well
155 * tested, but we might eventually use this one in the future - the
156 * problem why we cannot use it right now is the P5 APIC, it has an
157 * errata which cannot take 8-bit reads and writes, only 32-bit ones ...
158 */
159#define u32 unsigned int
160
161struct local_apic {
162
163/*000*/ struct { u32 __reserved[4]; } __reserved_01;
164
165/*010*/ struct { u32 __reserved[4]; } __reserved_02;
166
167/*020*/ struct { /* APIC ID Register */
168 u32 __reserved_1 : 24,
169 phys_apic_id : 4,
170 __reserved_2 : 4;
171 u32 __reserved[3];
172 } id;
173
174/*030*/ const
175 struct { /* APIC Version Register */
176 u32 version : 8,
177 __reserved_1 : 8,
178 max_lvt : 8,
179 __reserved_2 : 8;
180 u32 __reserved[3];
181 } version;
182
183/*040*/ struct { u32 __reserved[4]; } __reserved_03;
184
185/*050*/ struct { u32 __reserved[4]; } __reserved_04;
186
187/*060*/ struct { u32 __reserved[4]; } __reserved_05;
188
189/*070*/ struct { u32 __reserved[4]; } __reserved_06;
190
191/*080*/ struct { /* Task Priority Register */
192 u32 priority : 8,
193 __reserved_1 : 24;
194 u32 __reserved_2[3];
195 } tpr;
196
197/*090*/ const
198 struct { /* Arbitration Priority Register */
199 u32 priority : 8,
200 __reserved_1 : 24;
201 u32 __reserved_2[3];
202 } apr;
203
204/*0A0*/ const
205 struct { /* Processor Priority Register */
206 u32 priority : 8,
207 __reserved_1 : 24;
208 u32 __reserved_2[3];
209 } ppr;
210
211/*0B0*/ struct { /* End Of Interrupt Register */
212 u32 eoi;
213 u32 __reserved[3];
214 } eoi;
215
216/*0C0*/ struct { u32 __reserved[4]; } __reserved_07;
217
218/*0D0*/ struct { /* Logical Destination Register */
219 u32 __reserved_1 : 24,
220 logical_dest : 8;
221 u32 __reserved_2[3];
222 } ldr;
223
224/*0E0*/ struct { /* Destination Format Register */
225 u32 __reserved_1 : 28,
226 model : 4;
227 u32 __reserved_2[3];
228 } dfr;
229
230/*0F0*/ struct { /* Spurious Interrupt Vector Register */
231 u32 spurious_vector : 8,
232 apic_enabled : 1,
233 focus_cpu : 1,
234 __reserved_2 : 22;
235 u32 __reserved_3[3];
236 } svr;
237
238/*100*/ struct { /* In Service Register */
239/*170*/ u32 bitfield;
240 u32 __reserved[3];
241 } isr [8];
242
243/*180*/ struct { /* Trigger Mode Register */
244/*1F0*/ u32 bitfield;
245 u32 __reserved[3];
246 } tmr [8];
247
248/*200*/ struct { /* Interrupt Request Register */
249/*270*/ u32 bitfield;
250 u32 __reserved[3];
251 } irr [8];
252
253/*280*/ union { /* Error Status Register */
254 struct {
255 u32 send_cs_error : 1,
256 receive_cs_error : 1,
257 send_accept_error : 1,
258 receive_accept_error : 1,
259 __reserved_1 : 1,
260 send_illegal_vector : 1,
261 receive_illegal_vector : 1,
262 illegal_register_address : 1,
263 __reserved_2 : 24;
264 u32 __reserved_3[3];
265 } error_bits;
266 struct {
267 u32 errors;
268 u32 __reserved_3[3];
269 } all_errors;
270 } esr;
271
272/*290*/ struct { u32 __reserved[4]; } __reserved_08;
273
274/*2A0*/ struct { u32 __reserved[4]; } __reserved_09;
275
276/*2B0*/ struct { u32 __reserved[4]; } __reserved_10;
277
278/*2C0*/ struct { u32 __reserved[4]; } __reserved_11;
279
280/*2D0*/ struct { u32 __reserved[4]; } __reserved_12;
281
282/*2E0*/ struct { u32 __reserved[4]; } __reserved_13;
283
284/*2F0*/ struct { u32 __reserved[4]; } __reserved_14;
285
286/*300*/ struct { /* Interrupt Command Register 1 */
287 u32 vector : 8,
288 delivery_mode : 3,
289 destination_mode : 1,
290 delivery_status : 1,
291 __reserved_1 : 1,
292 level : 1,
293 trigger : 1,
294 __reserved_2 : 2,
295 shorthand : 2,
296 __reserved_3 : 12;
297 u32 __reserved_4[3];
298 } icr1;
299
300/*310*/ struct { /* Interrupt Command Register 2 */
301 union {
302 u32 __reserved_1 : 24,
303 phys_dest : 4,
304 __reserved_2 : 4;
305 u32 __reserved_3 : 24,
306 logical_dest : 8;
307 } dest;
308 u32 __reserved_4[3];
309 } icr2;
310
311/*320*/ struct { /* LVT - Timer */
312 u32 vector : 8,
313 __reserved_1 : 4,
314 delivery_status : 1,
315 __reserved_2 : 3,
316 mask : 1,
317 timer_mode : 1,
318 __reserved_3 : 14;
319 u32 __reserved_4[3];
320 } lvt_timer;
321
322/*330*/ struct { /* LVT - Thermal Sensor */
323 u32 vector : 8,
324 delivery_mode : 3,
325 __reserved_1 : 1,
326 delivery_status : 1,
327 __reserved_2 : 3,
328 mask : 1,
329 __reserved_3 : 15;
330 u32 __reserved_4[3];
331 } lvt_thermal;
332
333/*340*/ struct { /* LVT - Performance Counter */
334 u32 vector : 8,
335 delivery_mode : 3,
336 __reserved_1 : 1,
337 delivery_status : 1,
338 __reserved_2 : 3,
339 mask : 1,
340 __reserved_3 : 15;
341 u32 __reserved_4[3];
342 } lvt_pc;
343
344/*350*/ struct { /* LVT - LINT0 */
345 u32 vector : 8,
346 delivery_mode : 3,
347 __reserved_1 : 1,
348 delivery_status : 1,
349 polarity : 1,
350 remote_irr : 1,
351 trigger : 1,
352 mask : 1,
353 __reserved_2 : 15;
354 u32 __reserved_3[3];
355 } lvt_lint0;
356
357/*360*/ struct { /* LVT - LINT1 */
358 u32 vector : 8,
359 delivery_mode : 3,
360 __reserved_1 : 1,
361 delivery_status : 1,
362 polarity : 1,
363 remote_irr : 1,
364 trigger : 1,
365 mask : 1,
366 __reserved_2 : 15;
367 u32 __reserved_3[3];
368 } lvt_lint1;
369
370/*370*/ struct { /* LVT - Error */
371 u32 vector : 8,
372 __reserved_1 : 4,
373 delivery_status : 1,
374 __reserved_2 : 3,
375 mask : 1,
376 __reserved_3 : 15;
377 u32 __reserved_4[3];
378 } lvt_error;
379
380/*380*/ struct { /* Timer Initial Count Register */
381 u32 initial_count;
382 u32 __reserved_2[3];
383 } timer_icr;
384
385/*390*/ const
386 struct { /* Timer Current Count Register */
387 u32 curr_count;
388 u32 __reserved_2[3];
389 } timer_ccr;
390
391/*3A0*/ struct { u32 __reserved[4]; } __reserved_16;
392
393/*3B0*/ struct { u32 __reserved[4]; } __reserved_17;
394
395/*3C0*/ struct { u32 __reserved[4]; } __reserved_18;
396
397/*3D0*/ struct { u32 __reserved[4]; } __reserved_19;
398
399/*3E0*/ struct { /* Timer Divide Configuration Register */
400 u32 divisor : 4,
401 __reserved_1 : 28;
402 u32 __reserved_2[3];
403 } timer_dcr;
404
405/*3F0*/ struct { u32 __reserved[4]; } __reserved_20;
406
407} __attribute__ ((packed));
408
409#undef u32
410
411#define BAD_APICID 0xFFu
412
5#endif 413#endif
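
A sketch of how the ICR definitions above compose when sending an IPI (illustrative only; example_send_fixed_ipi is an invented name, and apic_write() comes from <asm/apic.h>, not this header):

	/* Illustrative sketch only: program ICR2 with the physical
	 * destination, then ICR with delivery mode and vector; the write to
	 * ICR triggers the IPI. Real senders also wait for APIC_ICR_BUSY
	 * to clear before and/or after issuing the command. */
	static inline void example_send_fixed_ipi(unsigned int apicid,
						  unsigned int vector)
	{
		apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
		apic_write(APIC_ICR, APIC_DEST_PHYSICAL | APIC_DM_FIXED |
				     (vector & APIC_VECTOR_MASK));
	}
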
diff --git a/include/asm-x86/apicdef_32.h b/include/asm-x86/apicdef_32.h
deleted file mode 100644
index 9f6995341fdc..000000000000
--- a/include/asm-x86/apicdef_32.h
+++ /dev/null
@@ -1,375 +0,0 @@
1#ifndef __ASM_APICDEF_H
2#define __ASM_APICDEF_H
3
4/*
5 * Constants for various Intel APICs. (local APIC, IOAPIC, etc.)
6 *
7 * Alan Cox <Alan.Cox@linux.org>, 1995.
8 * Ingo Molnar <mingo@redhat.com>, 1999, 2000
9 */
10
11#define APIC_DEFAULT_PHYS_BASE 0xfee00000
12
13#define APIC_ID 0x20
14#define APIC_LVR 0x30
15#define APIC_LVR_MASK 0xFF00FF
16#define GET_APIC_VERSION(x) ((x)&0xFF)
17#define GET_APIC_MAXLVT(x) (((x)>>16)&0xFF)
18#define APIC_INTEGRATED(x) ((x)&0xF0)
19#define APIC_XAPIC(x) ((x) >= 0x14)
20#define APIC_TASKPRI 0x80
21#define APIC_TPRI_MASK 0xFF
22#define APIC_ARBPRI 0x90
23#define APIC_ARBPRI_MASK 0xFF
24#define APIC_PROCPRI 0xA0
25#define APIC_EOI 0xB0
26#define APIC_EIO_ACK 0x0 /* Write this to the EOI register */
27#define APIC_RRR 0xC0
28#define APIC_LDR 0xD0
29#define APIC_LDR_MASK (0xFF<<24)
30#define GET_APIC_LOGICAL_ID(x) (((x)>>24)&0xFF)
31#define SET_APIC_LOGICAL_ID(x) (((x)<<24))
32#define APIC_ALL_CPUS 0xFF
33#define APIC_DFR 0xE0
34#define APIC_DFR_CLUSTER 0x0FFFFFFFul
35#define APIC_DFR_FLAT 0xFFFFFFFFul
36#define APIC_SPIV 0xF0
37#define APIC_SPIV_FOCUS_DISABLED (1<<9)
38#define APIC_SPIV_APIC_ENABLED (1<<8)
39#define APIC_ISR 0x100
40#define APIC_ISR_NR 0x8 /* Number of 32 bit ISR registers. */
41#define APIC_TMR 0x180
42#define APIC_IRR 0x200
43#define APIC_ESR 0x280
44#define APIC_ESR_SEND_CS 0x00001
45#define APIC_ESR_RECV_CS 0x00002
46#define APIC_ESR_SEND_ACC 0x00004
47#define APIC_ESR_RECV_ACC 0x00008
48#define APIC_ESR_SENDILL 0x00020
49#define APIC_ESR_RECVILL 0x00040
50#define APIC_ESR_ILLREGA 0x00080
51#define APIC_ICR 0x300
52#define APIC_DEST_SELF 0x40000
53#define APIC_DEST_ALLINC 0x80000
54#define APIC_DEST_ALLBUT 0xC0000
55#define APIC_ICR_RR_MASK 0x30000
56#define APIC_ICR_RR_INVALID 0x00000
57#define APIC_ICR_RR_INPROG 0x10000
58#define APIC_ICR_RR_VALID 0x20000
59#define APIC_INT_LEVELTRIG 0x08000
60#define APIC_INT_ASSERT 0x04000
61#define APIC_ICR_BUSY 0x01000
62#define APIC_DEST_LOGICAL 0x00800
63#define APIC_DM_FIXED 0x00000
64#define APIC_DM_LOWEST 0x00100
65#define APIC_DM_SMI 0x00200
66#define APIC_DM_REMRD 0x00300
67#define APIC_DM_NMI 0x00400
68#define APIC_DM_INIT 0x00500
69#define APIC_DM_STARTUP 0x00600
70#define APIC_DM_EXTINT 0x00700
71#define APIC_VECTOR_MASK 0x000FF
72#define APIC_ICR2 0x310
73#define GET_APIC_DEST_FIELD(x) (((x)>>24)&0xFF)
74#define SET_APIC_DEST_FIELD(x) ((x)<<24)
75#define APIC_LVTT 0x320
76#define APIC_LVTTHMR 0x330
77#define APIC_LVTPC 0x340
78#define APIC_LVT0 0x350
79#define APIC_LVT_TIMER_BASE_MASK (0x3<<18)
80#define GET_APIC_TIMER_BASE(x) (((x)>>18)&0x3)
81#define SET_APIC_TIMER_BASE(x) (((x)<<18))
82#define APIC_TIMER_BASE_CLKIN 0x0
83#define APIC_TIMER_BASE_TMBASE 0x1
84#define APIC_TIMER_BASE_DIV 0x2
85#define APIC_LVT_TIMER_PERIODIC (1<<17)
86#define APIC_LVT_MASKED (1<<16)
87#define APIC_LVT_LEVEL_TRIGGER (1<<15)
88#define APIC_LVT_REMOTE_IRR (1<<14)
89#define APIC_INPUT_POLARITY (1<<13)
90#define APIC_SEND_PENDING (1<<12)
91#define APIC_MODE_MASK 0x700
92#define GET_APIC_DELIVERY_MODE(x) (((x)>>8)&0x7)
93#define SET_APIC_DELIVERY_MODE(x,y) (((x)&~0x700)|((y)<<8))
94#define APIC_MODE_FIXED 0x0
95#define APIC_MODE_NMI 0x4
96#define APIC_MODE_EXTINT 0x7
97#define APIC_LVT1 0x360
98#define APIC_LVTERR 0x370
99#define APIC_TMICT 0x380
100#define APIC_TMCCT 0x390
101#define APIC_TDCR 0x3E0
102#define APIC_TDR_DIV_TMBASE (1<<2)
103#define APIC_TDR_DIV_1 0xB
104#define APIC_TDR_DIV_2 0x0
105#define APIC_TDR_DIV_4 0x1
106#define APIC_TDR_DIV_8 0x2
107#define APIC_TDR_DIV_16 0x3
108#define APIC_TDR_DIV_32 0x8
109#define APIC_TDR_DIV_64 0x9
110#define APIC_TDR_DIV_128 0xA
111
112#define APIC_BASE (fix_to_virt(FIX_APIC_BASE))
113
114#define MAX_IO_APICS 64
115
116/*
117 * the local APIC register structure, memory mapped. Not terribly well
118 * tested, but we might eventually use this one in the future - the
119 * problem why we cannot use it right now is the P5 APIC, it has an
120 * errata which cannot take 8-bit reads and writes, only 32-bit ones ...
121 */
122#define u32 unsigned int
123
124
125struct local_apic {
126
127/*000*/ struct { u32 __reserved[4]; } __reserved_01;
128
129/*010*/ struct { u32 __reserved[4]; } __reserved_02;
130
131/*020*/ struct { /* APIC ID Register */
132 u32 __reserved_1 : 24,
133 phys_apic_id : 4,
134 __reserved_2 : 4;
135 u32 __reserved[3];
136 } id;
137
138/*030*/ const
139 struct { /* APIC Version Register */
140 u32 version : 8,
141 __reserved_1 : 8,
142 max_lvt : 8,
143 __reserved_2 : 8;
144 u32 __reserved[3];
145 } version;
146
147/*040*/ struct { u32 __reserved[4]; } __reserved_03;
148
149/*050*/ struct { u32 __reserved[4]; } __reserved_04;
150
151/*060*/ struct { u32 __reserved[4]; } __reserved_05;
152
153/*070*/ struct { u32 __reserved[4]; } __reserved_06;
154
155/*080*/ struct { /* Task Priority Register */
156 u32 priority : 8,
157 __reserved_1 : 24;
158 u32 __reserved_2[3];
159 } tpr;
160
161/*090*/ const
162 struct { /* Arbitration Priority Register */
163 u32 priority : 8,
164 __reserved_1 : 24;
165 u32 __reserved_2[3];
166 } apr;
167
168/*0A0*/ const
169 struct { /* Processor Priority Register */
170 u32 priority : 8,
171 __reserved_1 : 24;
172 u32 __reserved_2[3];
173 } ppr;
174
175/*0B0*/ struct { /* End Of Interrupt Register */
176 u32 eoi;
177 u32 __reserved[3];
178 } eoi;
179
180/*0C0*/ struct { u32 __reserved[4]; } __reserved_07;
181
182/*0D0*/ struct { /* Logical Destination Register */
183 u32 __reserved_1 : 24,
184 logical_dest : 8;
185 u32 __reserved_2[3];
186 } ldr;
187
188/*0E0*/ struct { /* Destination Format Register */
189 u32 __reserved_1 : 28,
190 model : 4;
191 u32 __reserved_2[3];
192 } dfr;
193
194/*0F0*/ struct { /* Spurious Interrupt Vector Register */
195 u32 spurious_vector : 8,
196 apic_enabled : 1,
197 focus_cpu : 1,
198 __reserved_2 : 22;
199 u32 __reserved_3[3];
200 } svr;
201
202/*100*/ struct { /* In Service Register */
203/*170*/ u32 bitfield;
204 u32 __reserved[3];
205 } isr [8];
206
207/*180*/ struct { /* Trigger Mode Register */
208/*1F0*/ u32 bitfield;
209 u32 __reserved[3];
210 } tmr [8];
211
212/*200*/ struct { /* Interrupt Request Register */
213/*270*/ u32 bitfield;
214 u32 __reserved[3];
215 } irr [8];
216
217/*280*/ union { /* Error Status Register */
218 struct {
219 u32 send_cs_error : 1,
220 receive_cs_error : 1,
221 send_accept_error : 1,
222 receive_accept_error : 1,
223 __reserved_1 : 1,
224 send_illegal_vector : 1,
225 receive_illegal_vector : 1,
226 illegal_register_address : 1,
227 __reserved_2 : 24;
228 u32 __reserved_3[3];
229 } error_bits;
230 struct {
231 u32 errors;
232 u32 __reserved_3[3];
233 } all_errors;
234 } esr;
235
236/*290*/ struct { u32 __reserved[4]; } __reserved_08;
237
238/*2A0*/ struct { u32 __reserved[4]; } __reserved_09;
239
240/*2B0*/ struct { u32 __reserved[4]; } __reserved_10;
241
242/*2C0*/ struct { u32 __reserved[4]; } __reserved_11;
243
244/*2D0*/ struct { u32 __reserved[4]; } __reserved_12;
245
246/*2E0*/ struct { u32 __reserved[4]; } __reserved_13;
247
248/*2F0*/ struct { u32 __reserved[4]; } __reserved_14;
249
250/*300*/ struct { /* Interrupt Command Register 1 */
251 u32 vector : 8,
252 delivery_mode : 3,
253 destination_mode : 1,
254 delivery_status : 1,
255 __reserved_1 : 1,
256 level : 1,
257 trigger : 1,
258 __reserved_2 : 2,
259 shorthand : 2,
260 __reserved_3 : 12;
261 u32 __reserved_4[3];
262 } icr1;
263
264/*310*/ struct { /* Interrupt Command Register 2 */
265 union {
266 u32 __reserved_1 : 24,
267 phys_dest : 4,
268 __reserved_2 : 4;
269 u32 __reserved_3 : 24,
270 logical_dest : 8;
271 } dest;
272 u32 __reserved_4[3];
273 } icr2;
274
275/*320*/ struct { /* LVT - Timer */
276 u32 vector : 8,
277 __reserved_1 : 4,
278 delivery_status : 1,
279 __reserved_2 : 3,
280 mask : 1,
281 timer_mode : 1,
282 __reserved_3 : 14;
283 u32 __reserved_4[3];
284 } lvt_timer;
285
286/*330*/ struct { /* LVT - Thermal Sensor */
287 u32 vector : 8,
288 delivery_mode : 3,
289 __reserved_1 : 1,
290 delivery_status : 1,
291 __reserved_2 : 3,
292 mask : 1,
293 __reserved_3 : 15;
294 u32 __reserved_4[3];
295 } lvt_thermal;
296
297/*340*/ struct { /* LVT - Performance Counter */
298 u32 vector : 8,
299 delivery_mode : 3,
300 __reserved_1 : 1,
301 delivery_status : 1,
302 __reserved_2 : 3,
303 mask : 1,
304 __reserved_3 : 15;
305 u32 __reserved_4[3];
306 } lvt_pc;
307
308/*350*/ struct { /* LVT - LINT0 */
309 u32 vector : 8,
310 delivery_mode : 3,
311 __reserved_1 : 1,
312 delivery_status : 1,
313 polarity : 1,
314 remote_irr : 1,
315 trigger : 1,
316 mask : 1,
317 __reserved_2 : 15;
318 u32 __reserved_3[3];
319 } lvt_lint0;
320
321/*360*/ struct { /* LVT - LINT1 */
322 u32 vector : 8,
323 delivery_mode : 3,
324 __reserved_1 : 1,
325 delivery_status : 1,
326 polarity : 1,
327 remote_irr : 1,
328 trigger : 1,
329 mask : 1,
330 __reserved_2 : 15;
331 u32 __reserved_3[3];
332 } lvt_lint1;
333
334/*370*/ struct { /* LVT - Error */
335 u32 vector : 8,
336 __reserved_1 : 4,
337 delivery_status : 1,
338 __reserved_2 : 3,
339 mask : 1,
340 __reserved_3 : 15;
341 u32 __reserved_4[3];
342 } lvt_error;
343
344/*380*/ struct { /* Timer Initial Count Register */
345 u32 initial_count;
346 u32 __reserved_2[3];
347 } timer_icr;
348
349/*390*/ const
350 struct { /* Timer Current Count Register */
351 u32 curr_count;
352 u32 __reserved_2[3];
353 } timer_ccr;
354
355/*3A0*/ struct { u32 __reserved[4]; } __reserved_16;
356
357/*3B0*/ struct { u32 __reserved[4]; } __reserved_17;
358
359/*3C0*/ struct { u32 __reserved[4]; } __reserved_18;
360
361/*3D0*/ struct { u32 __reserved[4]; } __reserved_19;
362
363/*3E0*/ struct { /* Timer Divide Configuration Register */
364 u32 divisor : 4,
365 __reserved_1 : 28;
366 u32 __reserved_2[3];
367 } timer_dcr;
368
369/*3F0*/ struct { u32 __reserved[4]; } __reserved_20;
370
371} __attribute__ ((packed));
372
373#undef u32
374
375#endif
diff --git a/include/asm-x86/apicdef_64.h b/include/asm-x86/apicdef_64.h
deleted file mode 100644
index 1dd40067c67c..000000000000
--- a/include/asm-x86/apicdef_64.h
+++ /dev/null
@@ -1,392 +0,0 @@
1#ifndef __ASM_APICDEF_H
2#define __ASM_APICDEF_H
3
4/*
5 * Constants for various Intel APICs. (local APIC, IOAPIC, etc.)
6 *
7 * Alan Cox <Alan.Cox@linux.org>, 1995.
8 * Ingo Molnar <mingo@redhat.com>, 1999, 2000
9 */
10
11#define APIC_DEFAULT_PHYS_BASE 0xfee00000
12
13#define APIC_ID 0x20
14#define APIC_ID_MASK (0xFFu<<24)
15#define GET_APIC_ID(x) (((x)>>24)&0xFFu)
16#define SET_APIC_ID(x) (((x)<<24))
17#define APIC_LVR 0x30
18#define APIC_LVR_MASK 0xFF00FF
19#define GET_APIC_VERSION(x) ((x)&0xFFu)
20#define GET_APIC_MAXLVT(x) (((x)>>16)&0xFFu)
21#define APIC_INTEGRATED(x) ((x)&0xF0u)
22#define APIC_TASKPRI 0x80
23#define APIC_TPRI_MASK 0xFFu
24#define APIC_ARBPRI 0x90
25#define APIC_ARBPRI_MASK 0xFFu
26#define APIC_PROCPRI 0xA0
27#define APIC_EOI 0xB0
28#define APIC_EIO_ACK 0x0 /* Write this to the EOI register */
29#define APIC_RRR 0xC0
30#define APIC_LDR 0xD0
31#define APIC_LDR_MASK (0xFFu<<24)
32#define GET_APIC_LOGICAL_ID(x) (((x)>>24)&0xFFu)
33#define SET_APIC_LOGICAL_ID(x) (((x)<<24))
34#define APIC_ALL_CPUS 0xFFu
35#define APIC_DFR 0xE0
36#define APIC_DFR_CLUSTER 0x0FFFFFFFul
37#define APIC_DFR_FLAT 0xFFFFFFFFul
38#define APIC_SPIV 0xF0
39#define APIC_SPIV_FOCUS_DISABLED (1<<9)
40#define APIC_SPIV_APIC_ENABLED (1<<8)
41#define APIC_ISR 0x100
42#define APIC_ISR_NR 0x8 /* Number of 32 bit ISR registers. */
43#define APIC_TMR 0x180
44#define APIC_IRR 0x200
45#define APIC_ESR 0x280
46#define APIC_ESR_SEND_CS 0x00001
47#define APIC_ESR_RECV_CS 0x00002
48#define APIC_ESR_SEND_ACC 0x00004
49#define APIC_ESR_RECV_ACC 0x00008
50#define APIC_ESR_SENDILL 0x00020
51#define APIC_ESR_RECVILL 0x00040
52#define APIC_ESR_ILLREGA 0x00080
53#define APIC_ICR 0x300
54#define APIC_DEST_SELF 0x40000
55#define APIC_DEST_ALLINC 0x80000
56#define APIC_DEST_ALLBUT 0xC0000
57#define APIC_ICR_RR_MASK 0x30000
58#define APIC_ICR_RR_INVALID 0x00000
59#define APIC_ICR_RR_INPROG 0x10000
60#define APIC_ICR_RR_VALID 0x20000
61#define APIC_INT_LEVELTRIG 0x08000
62#define APIC_INT_ASSERT 0x04000
63#define APIC_ICR_BUSY 0x01000
64#define APIC_DEST_LOGICAL 0x00800
65#define APIC_DEST_PHYSICAL 0x00000
66#define APIC_DM_FIXED 0x00000
67#define APIC_DM_LOWEST 0x00100
68#define APIC_DM_SMI 0x00200
69#define APIC_DM_REMRD 0x00300
70#define APIC_DM_NMI 0x00400
71#define APIC_DM_INIT 0x00500
72#define APIC_DM_STARTUP 0x00600
73#define APIC_DM_EXTINT 0x00700
74#define APIC_VECTOR_MASK 0x000FF
75#define APIC_ICR2 0x310
76#define GET_APIC_DEST_FIELD(x) (((x)>>24)&0xFF)
77#define SET_APIC_DEST_FIELD(x) ((x)<<24)
78#define APIC_LVTT 0x320
79#define APIC_LVTTHMR 0x330
80#define APIC_LVTPC 0x340
81#define APIC_LVT0 0x350
82#define APIC_LVT_TIMER_BASE_MASK (0x3<<18)
83#define GET_APIC_TIMER_BASE(x) (((x)>>18)&0x3)
84#define SET_APIC_TIMER_BASE(x) (((x)<<18))
85#define APIC_TIMER_BASE_CLKIN 0x0
86#define APIC_TIMER_BASE_TMBASE 0x1
87#define APIC_TIMER_BASE_DIV 0x2
88#define APIC_LVT_TIMER_PERIODIC (1<<17)
89#define APIC_LVT_MASKED (1<<16)
90#define APIC_LVT_LEVEL_TRIGGER (1<<15)
91#define APIC_LVT_REMOTE_IRR (1<<14)
92#define APIC_INPUT_POLARITY (1<<13)
93#define APIC_SEND_PENDING (1<<12)
94#define APIC_MODE_MASK 0x700
95#define GET_APIC_DELIVERY_MODE(x) (((x)>>8)&0x7)
96#define SET_APIC_DELIVERY_MODE(x,y) (((x)&~0x700)|((y)<<8))
97#define APIC_MODE_FIXED 0x0
98#define APIC_MODE_NMI 0x4
99#define APIC_MODE_EXTINT 0x7
100#define APIC_LVT1 0x360
101#define APIC_LVTERR 0x370
102#define APIC_TMICT 0x380
103#define APIC_TMCCT 0x390
104#define APIC_TDCR 0x3E0
105#define APIC_TDR_DIV_TMBASE (1<<2)
106#define APIC_TDR_DIV_1 0xB
107#define APIC_TDR_DIV_2 0x0
108#define APIC_TDR_DIV_4 0x1
109#define APIC_TDR_DIV_8 0x2
110#define APIC_TDR_DIV_16 0x3
111#define APIC_TDR_DIV_32 0x8
112#define APIC_TDR_DIV_64 0x9
113#define APIC_TDR_DIV_128 0xA
114
115#define APIC_BASE (fix_to_virt(FIX_APIC_BASE))
116
117#define MAX_IO_APICS 128
118#define MAX_LOCAL_APIC 256
119
120/*
121 * All x86-64 systems are xAPIC compatible.
122 * In the following, "apicid" is a physical APIC ID.
123 */
124#define XAPIC_DEST_CPUS_SHIFT 4
125#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
126#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
127#define APIC_CLUSTER(apicid) ((apicid) & XAPIC_DEST_CLUSTER_MASK)
128#define APIC_CLUSTERID(apicid) (APIC_CLUSTER(apicid) >> XAPIC_DEST_CPUS_SHIFT)
129#define APIC_CPUID(apicid) ((apicid) & XAPIC_DEST_CPUS_MASK)
130#define NUM_APIC_CLUSTERS ((BAD_APICID + 1) >> XAPIC_DEST_CPUS_SHIFT)
131
132/*
133 * the local APIC register structure, memory mapped. Not terribly well
134 * tested, but we might eventually use this one in the future - the
135 * problem why we cannot use it right now is the P5 APIC, it has an
136 * errata which cannot take 8-bit reads and writes, only 32-bit ones ...
137 */
138#define u32 unsigned int
139
140struct local_apic {
141
142/*000*/ struct { u32 __reserved[4]; } __reserved_01;
143
144/*010*/ struct { u32 __reserved[4]; } __reserved_02;
145
146/*020*/ struct { /* APIC ID Register */
147 u32 __reserved_1 : 24,
148 phys_apic_id : 4,
149 __reserved_2 : 4;
150 u32 __reserved[3];
151 } id;
152
153/*030*/ const
154 struct { /* APIC Version Register */
155 u32 version : 8,
156 __reserved_1 : 8,
157 max_lvt : 8,
158 __reserved_2 : 8;
159 u32 __reserved[3];
160 } version;
161
162/*040*/ struct { u32 __reserved[4]; } __reserved_03;
163
164/*050*/ struct { u32 __reserved[4]; } __reserved_04;
165
166/*060*/ struct { u32 __reserved[4]; } __reserved_05;
167
168/*070*/ struct { u32 __reserved[4]; } __reserved_06;
169
170/*080*/ struct { /* Task Priority Register */
171 u32 priority : 8,
172 __reserved_1 : 24;
173 u32 __reserved_2[3];
174 } tpr;
175
176/*090*/ const
177 struct { /* Arbitration Priority Register */
178 u32 priority : 8,
179 __reserved_1 : 24;
180 u32 __reserved_2[3];
181 } apr;
182
183/*0A0*/ const
184 struct { /* Processor Priority Register */
185 u32 priority : 8,
186 __reserved_1 : 24;
187 u32 __reserved_2[3];
188 } ppr;
189
190/*0B0*/ struct { /* End Of Interrupt Register */
191 u32 eoi;
192 u32 __reserved[3];
193 } eoi;
194
195/*0C0*/ struct { u32 __reserved[4]; } __reserved_07;
196
197/*0D0*/ struct { /* Logical Destination Register */
198 u32 __reserved_1 : 24,
199 logical_dest : 8;
200 u32 __reserved_2[3];
201 } ldr;
202
203/*0E0*/ struct { /* Destination Format Register */
204 u32 __reserved_1 : 28,
205 model : 4;
206 u32 __reserved_2[3];
207 } dfr;
208
209/*0F0*/ struct { /* Spurious Interrupt Vector Register */
210 u32 spurious_vector : 8,
211 apic_enabled : 1,
212 focus_cpu : 1,
213 __reserved_2 : 22;
214 u32 __reserved_3[3];
215 } svr;
216
217/*100*/ struct { /* In Service Register */
218/*170*/ u32 bitfield;
219 u32 __reserved[3];
220 } isr [8];
221
222/*180*/ struct { /* Trigger Mode Register */
223/*1F0*/ u32 bitfield;
224 u32 __reserved[3];
225 } tmr [8];
226
227/*200*/ struct { /* Interrupt Request Register */
228/*270*/ u32 bitfield;
229 u32 __reserved[3];
230 } irr [8];
231
232/*280*/ union { /* Error Status Register */
233 struct {
234 u32 send_cs_error : 1,
235 receive_cs_error : 1,
236 send_accept_error : 1,
237 receive_accept_error : 1,
238 __reserved_1 : 1,
239 send_illegal_vector : 1,
240 receive_illegal_vector : 1,
241 illegal_register_address : 1,
242 __reserved_2 : 24;
243 u32 __reserved_3[3];
244 } error_bits;
245 struct {
246 u32 errors;
247 u32 __reserved_3[3];
248 } all_errors;
249 } esr;
250
251/*290*/ struct { u32 __reserved[4]; } __reserved_08;
252
253/*2A0*/ struct { u32 __reserved[4]; } __reserved_09;
254
255/*2B0*/ struct { u32 __reserved[4]; } __reserved_10;
256
257/*2C0*/ struct { u32 __reserved[4]; } __reserved_11;
258
259/*2D0*/ struct { u32 __reserved[4]; } __reserved_12;
260
261/*2E0*/ struct { u32 __reserved[4]; } __reserved_13;
262
263/*2F0*/ struct { u32 __reserved[4]; } __reserved_14;
264
265/*300*/ struct { /* Interrupt Command Register 1 */
266 u32 vector : 8,
267 delivery_mode : 3,
268 destination_mode : 1,
269 delivery_status : 1,
270 __reserved_1 : 1,
271 level : 1,
272 trigger : 1,
273 __reserved_2 : 2,
274 shorthand : 2,
275 __reserved_3 : 12;
276 u32 __reserved_4[3];
277 } icr1;
278
279/*310*/ struct { /* Interrupt Command Register 2 */
280 union {
281 u32 __reserved_1 : 24,
282 phys_dest : 4,
283 __reserved_2 : 4;
284 u32 __reserved_3 : 24,
285 logical_dest : 8;
286 } dest;
287 u32 __reserved_4[3];
288 } icr2;
289
290/*320*/ struct { /* LVT - Timer */
291 u32 vector : 8,
292 __reserved_1 : 4,
293 delivery_status : 1,
294 __reserved_2 : 3,
295 mask : 1,
296 timer_mode : 1,
297 __reserved_3 : 14;
298 u32 __reserved_4[3];
299 } lvt_timer;
300
301/*330*/ struct { /* LVT - Thermal Sensor */
302 u32 vector : 8,
303 delivery_mode : 3,
304 __reserved_1 : 1,
305 delivery_status : 1,
306 __reserved_2 : 3,
307 mask : 1,
308 __reserved_3 : 15;
309 u32 __reserved_4[3];
310 } lvt_thermal;
311
312/*340*/ struct { /* LVT - Performance Counter */
313 u32 vector : 8,
314 delivery_mode : 3,
315 __reserved_1 : 1,
316 delivery_status : 1,
317 __reserved_2 : 3,
318 mask : 1,
319 __reserved_3 : 15;
320 u32 __reserved_4[3];
321 } lvt_pc;
322
323/*350*/ struct { /* LVT - LINT0 */
324 u32 vector : 8,
325 delivery_mode : 3,
326 __reserved_1 : 1,
327 delivery_status : 1,
328 polarity : 1,
329 remote_irr : 1,
330 trigger : 1,
331 mask : 1,
332 __reserved_2 : 15;
333 u32 __reserved_3[3];
334 } lvt_lint0;
335
336/*360*/ struct { /* LVT - LINT1 */
337 u32 vector : 8,
338 delivery_mode : 3,
339 __reserved_1 : 1,
340 delivery_status : 1,
341 polarity : 1,
342 remote_irr : 1,
343 trigger : 1,
344 mask : 1,
345 __reserved_2 : 15;
346 u32 __reserved_3[3];
347 } lvt_lint1;
348
349/*370*/ struct { /* LVT - Error */
350 u32 vector : 8,
351 __reserved_1 : 4,
352 delivery_status : 1,
353 __reserved_2 : 3,
354 mask : 1,
355 __reserved_3 : 15;
356 u32 __reserved_4[3];
357 } lvt_error;
358
359/*380*/ struct { /* Timer Initial Count Register */
360 u32 initial_count;
361 u32 __reserved_2[3];
362 } timer_icr;
363
364/*390*/ const
365 struct { /* Timer Current Count Register */
366 u32 curr_count;
367 u32 __reserved_2[3];
368 } timer_ccr;
369
370/*3A0*/ struct { u32 __reserved[4]; } __reserved_16;
371
372/*3B0*/ struct { u32 __reserved[4]; } __reserved_17;
373
374/*3C0*/ struct { u32 __reserved[4]; } __reserved_18;
375
376/*3D0*/ struct { u32 __reserved[4]; } __reserved_19;
377
378/*3E0*/ struct { /* Timer Divide Configuration Register */
379 u32 divisor : 4,
380 __reserved_1 : 28;
381 u32 __reserved_2[3];
382 } timer_dcr;
383
384/*3F0*/ struct { u32 __reserved[4]; } __reserved_20;
385
386} __attribute__ ((packed));
387
388#undef u32
389
390#define BAD_APICID 0xFFu
391
392#endif
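
As an aside (illustration only, not part of the patch): the XAPIC_DEST_* and APIC_CLUSTER*/APIC_CPUID macros above split a physical xAPIC ID into a 4-bit cluster number and a 4-bit CPU number within that cluster. A minimal user-space sketch, with the macros copied from the header and a hypothetical test value:

	#include <stdio.h>

	#define XAPIC_DEST_CPUS_SHIFT   4
	#define XAPIC_DEST_CPUS_MASK    ((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
	#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
	#define APIC_CLUSTER(apicid)    ((apicid) & XAPIC_DEST_CLUSTER_MASK)
	#define APIC_CLUSTERID(apicid)  (APIC_CLUSTER(apicid) >> XAPIC_DEST_CPUS_SHIFT)
	#define APIC_CPUID(apicid)      ((apicid) & XAPIC_DEST_CPUS_MASK)

	int main(void)
	{
		unsigned int apicid = 0x23;	/* hypothetical: cluster 2, CPU 3 */

		/* prints "cluster 2, cpu 3" */
		printf("cluster %u, cpu %u\n",
		       APIC_CLUSTERID(apicid), APIC_CPUID(apicid));
		return 0;
	}
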
diff --git a/include/asm-x86/arch_hooks.h b/include/asm-x86/arch_hooks.h
index a8c1fca9726d..768aee8a04ef 100644
--- a/include/asm-x86/arch_hooks.h
+++ b/include/asm-x86/arch_hooks.h
@@ -6,7 +6,7 @@
6/* 6/*
7 * linux/include/asm/arch_hooks.h 7 * linux/include/asm/arch_hooks.h
8 * 8 *
9 * define the architecture specific hooks 9 * define the architecture specific hooks
10 */ 10 */
11 11
12/* these aren't arch hooks, they are generic routines 12/* these aren't arch hooks, they are generic routines
@@ -24,7 +24,4 @@ extern void trap_init_hook(void);
24extern void time_init_hook(void); 24extern void time_init_hook(void);
25extern void mca_nmi_hook(void); 25extern void mca_nmi_hook(void);
26 26
27extern int setup_early_printk(char *);
28extern void early_printk(const char *fmt, ...) __attribute__((format(printf,1,2)));
29
30#endif 27#endif
diff --git a/include/asm-x86/asm.h b/include/asm-x86/asm.h
new file mode 100644
index 000000000000..1a6980a60fc6
--- /dev/null
+++ b/include/asm-x86/asm.h
@@ -0,0 +1,32 @@
1#ifndef _ASM_X86_ASM_H
2#define _ASM_X86_ASM_H
3
4#ifdef CONFIG_X86_32
5/* 32 bits */
6
7# define _ASM_PTR " .long "
8# define _ASM_ALIGN " .balign 4 "
9# define _ASM_MOV_UL " movl "
10
11# define _ASM_INC " incl "
12# define _ASM_DEC " decl "
13# define _ASM_ADD " addl "
14# define _ASM_SUB " subl "
15# define _ASM_XADD " xaddl "
16
17#else
18/* 64 bits */
19
20# define _ASM_PTR " .quad "
21# define _ASM_ALIGN " .balign 8 "
22# define _ASM_MOV_UL " movq "
23
24# define _ASM_INC " incq "
25# define _ASM_DEC " decq "
26# define _ASM_ADD " addq "
27# define _ASM_SUB " subq "
28# define _ASM_XADD " xaddq "
29
30#endif /* CONFIG_X86_32 */
31
32#endif /* _ASM_X86_ASM_H */
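
The _ASM_PTR/_ASM_ALIGN strings in the new asm.h are meant to be pasted into asm() templates so that one source line emits a native-word directive on both 32-bit and 64-bit builds. A hedged user-space sketch of the string-splicing idea (it keys off __x86_64__ instead of CONFIG_X86_32, and "some_label" is made up):

	#include <stdio.h>

	#ifdef __x86_64__
	# define _ASM_PTR	" .quad "
	# define _ASM_ALIGN	" .balign 8 "
	#else
	# define _ASM_PTR	" .long "
	# define _ASM_ALIGN	" .balign 4 "
	#endif

	int main(void)
	{
		/* The strings are glued into an asm() template at compile time;
		 * e.g. a table entry holding one native-sized pointer: */
		const char *entry = _ASM_ALIGN "\n" _ASM_PTR "some_label\n";

		puts(entry);
		return 0;
	}
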
diff --git a/include/asm-x86/bitops.h b/include/asm-x86/bitops.h
index 07e3f6d4fe47..1a23ce1a5697 100644
--- a/include/asm-x86/bitops.h
+++ b/include/asm-x86/bitops.h
@@ -1,5 +1,321 @@
1#ifndef _ASM_X86_BITOPS_H
2#define _ASM_X86_BITOPS_H
3
4/*
5 * Copyright 1992, Linus Torvalds.
6 */
7
8#ifndef _LINUX_BITOPS_H
9#error only <linux/bitops.h> can be included directly
10#endif
11
12#include <linux/compiler.h>
13#include <asm/alternative.h>
14
15/*
16 * These have to be done with inline assembly: that way the bit-setting
17 * is guaranteed to be atomic. All bit operations return 0 if the bit
18 * was cleared before the operation and != 0 if it was not.
19 *
20 * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
21 */
22
23#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1)
24/* Technically wrong, but this avoids compilation errors on some gcc
25 versions. */
26#define ADDR "=m" (*(volatile long *) addr)
27#else
28#define ADDR "+m" (*(volatile long *) addr)
29#endif
30
31/**
32 * set_bit - Atomically set a bit in memory
33 * @nr: the bit to set
34 * @addr: the address to start counting from
35 *
36 * This function is atomic and may not be reordered. See __set_bit()
37 * if you do not require the atomic guarantees.
38 *
39 * Note: there are no guarantees that this function will not be reordered
40 * on non x86 architectures, so if you are writing portable code,
41 * make sure not to rely on its reordering guarantees.
42 *
43 * Note that @nr may be almost arbitrarily large; this function is not
44 * restricted to acting on a single-word quantity.
45 */
46static inline void set_bit(int nr, volatile void *addr)
47{
48 asm volatile(LOCK_PREFIX "bts %1,%0"
49 : ADDR
50 : "Ir" (nr) : "memory");
51}
52
53/**
54 * __set_bit - Set a bit in memory
55 * @nr: the bit to set
56 * @addr: the address to start counting from
57 *
58 * Unlike set_bit(), this function is non-atomic and may be reordered.
59 * If it's called on the same region of memory simultaneously, the effect
60 * may be that only one operation succeeds.
61 */
62static inline void __set_bit(int nr, volatile void *addr)
63{
64 asm volatile("bts %1,%0"
65 : ADDR
66 : "Ir" (nr) : "memory");
67}
68
69
70/**
71 * clear_bit - Clears a bit in memory
72 * @nr: Bit to clear
73 * @addr: Address to start counting from
74 *
75 * clear_bit() is atomic and may not be reordered. However, it does
76 * not contain a memory barrier, so if it is used for locking purposes,
77 * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
78 * in order to ensure changes are visible on other processors.
79 */
80static inline void clear_bit(int nr, volatile void *addr)
81{
82 asm volatile(LOCK_PREFIX "btr %1,%0"
83 : ADDR
84 : "Ir" (nr));
85}
86
87/*
88 * clear_bit_unlock - Clears a bit in memory
89 * @nr: Bit to clear
90 * @addr: Address to start counting from
91 *
92 * clear_bit() is atomic and implies release semantics before the memory
93 * operation. It can be used for an unlock.
94 */
95static inline void clear_bit_unlock(unsigned nr, volatile void *addr)
96{
97 barrier();
98 clear_bit(nr, addr);
99}
100
101static inline void __clear_bit(int nr, volatile void *addr)
102{
103 asm volatile("btr %1,%0" : ADDR : "Ir" (nr));
104}
105
106/*
107 * __clear_bit_unlock - Clears a bit in memory
108 * @nr: Bit to clear
109 * @addr: Address to start counting from
110 *
111 * __clear_bit() is non-atomic and implies release semantics before the memory
112 * operation. It can be used for an unlock if no other CPUs can concurrently
113 * modify other bits in the word.
114 *
115 * No memory barrier is required here, because x86 cannot reorder stores past
116 * older loads. Same principle as spin_unlock.
117 */
118static inline void __clear_bit_unlock(unsigned nr, volatile void *addr)
119{
120 barrier();
121 __clear_bit(nr, addr);
122}
123
124#define smp_mb__before_clear_bit() barrier()
125#define smp_mb__after_clear_bit() barrier()
126
127/**
128 * __change_bit - Toggle a bit in memory
129 * @nr: the bit to change
130 * @addr: the address to start counting from
131 *
132 * Unlike change_bit(), this function is non-atomic and may be reordered.
133 * If it's called on the same region of memory simultaneously, the effect
134 * may be that only one operation succeeds.
135 */
136static inline void __change_bit(int nr, volatile void *addr)
137{
138 asm volatile("btc %1,%0" : ADDR : "Ir" (nr));
139}
140
141/**
142 * change_bit - Toggle a bit in memory
143 * @nr: Bit to change
144 * @addr: Address to start counting from
145 *
146 * change_bit() is atomic and may not be reordered.
147 * Note that @nr may be almost arbitrarily large; this function is not
148 * restricted to acting on a single-word quantity.
149 */
150static inline void change_bit(int nr, volatile void *addr)
151{
152 asm volatile(LOCK_PREFIX "btc %1,%0"
153 : ADDR : "Ir" (nr));
154}
155
156/**
157 * test_and_set_bit - Set a bit and return its old value
158 * @nr: Bit to set
159 * @addr: Address to count from
160 *
161 * This operation is atomic and cannot be reordered.
162 * It also implies a memory barrier.
163 */
164static inline int test_and_set_bit(int nr, volatile void *addr)
165{
166 int oldbit;
167
168 asm volatile(LOCK_PREFIX "bts %2,%1\n\t"
169 "sbb %0,%0"
170 : "=r" (oldbit), ADDR
171 : "Ir" (nr) : "memory");
172
173 return oldbit;
174}
175
176/**
177 * test_and_set_bit_lock - Set a bit and return its old value for lock
178 * @nr: Bit to set
179 * @addr: Address to count from
180 *
181 * This is the same as test_and_set_bit on x86.
182 */
183static inline int test_and_set_bit_lock(int nr, volatile void *addr)
184{
185 return test_and_set_bit(nr, addr);
186}
187
188/**
189 * __test_and_set_bit - Set a bit and return its old value
190 * @nr: Bit to set
191 * @addr: Address to count from
192 *
193 * This operation is non-atomic and can be reordered.
194 * If two examples of this operation race, one can appear to succeed
195 * but actually fail. You must protect multiple accesses with a lock.
196 */
197static inline int __test_and_set_bit(int nr, volatile void *addr)
198{
199 int oldbit;
200
201 asm("bts %2,%1\n\t"
202 "sbb %0,%0"
203 : "=r" (oldbit), ADDR
204 : "Ir" (nr));
205 return oldbit;
206}
207
208/**
209 * test_and_clear_bit - Clear a bit and return its old value
210 * @nr: Bit to clear
211 * @addr: Address to count from
212 *
213 * This operation is atomic and cannot be reordered.
214 * It also implies a memory barrier.
215 */
216static inline int test_and_clear_bit(int nr, volatile void *addr)
217{
218 int oldbit;
219
220 asm volatile(LOCK_PREFIX "btr %2,%1\n\t"
221 "sbb %0,%0"
222 : "=r" (oldbit), ADDR
223 : "Ir" (nr) : "memory");
224
225 return oldbit;
226}
227
228/**
229 * __test_and_clear_bit - Clear a bit and return its old value
230 * @nr: Bit to clear
231 * @addr: Address to count from
232 *
233 * This operation is non-atomic and can be reordered.
234 * If two examples of this operation race, one can appear to succeed
235 * but actually fail. You must protect multiple accesses with a lock.
236 */
237static inline int __test_and_clear_bit(int nr, volatile void *addr)
238{
239 int oldbit;
240
241 asm volatile("btr %2,%1\n\t"
242 "sbb %0,%0"
243 : "=r" (oldbit), ADDR
244 : "Ir" (nr));
245 return oldbit;
246}
247
248/* WARNING: non atomic and it can be reordered! */
249static inline int __test_and_change_bit(int nr, volatile void *addr)
250{
251 int oldbit;
252
253 asm volatile("btc %2,%1\n\t"
254 "sbb %0,%0"
255 : "=r" (oldbit), ADDR
256 : "Ir" (nr) : "memory");
257
258 return oldbit;
259}
260
261/**
262 * test_and_change_bit - Change a bit and return its old value
263 * @nr: Bit to change
264 * @addr: Address to count from
265 *
266 * This operation is atomic and cannot be reordered.
267 * It also implies a memory barrier.
268 */
269static inline int test_and_change_bit(int nr, volatile void *addr)
270{
271 int oldbit;
272
273 asm volatile(LOCK_PREFIX "btc %2,%1\n\t"
274 "sbb %0,%0"
275 : "=r" (oldbit), ADDR
276 : "Ir" (nr) : "memory");
277
278 return oldbit;
279}
280
281static inline int constant_test_bit(int nr, const volatile void *addr)
282{
283 return ((1UL << (nr % BITS_PER_LONG)) &
284 (((unsigned long *)addr)[nr / BITS_PER_LONG])) != 0;
285}
286
287static inline int variable_test_bit(int nr, volatile const void *addr)
288{
289 int oldbit;
290
291 asm volatile("bt %2,%1\n\t"
292 "sbb %0,%0"
293 : "=r" (oldbit)
294 : "m" (*(unsigned long *)addr), "Ir" (nr));
295
296 return oldbit;
297}
298
299#if 0 /* Fool kernel-doc since it doesn't do macros yet */
300/**
301 * test_bit - Determine whether a bit is set
302 * @nr: bit number to test
303 * @addr: Address to start counting from
304 */
305static int test_bit(int nr, const volatile unsigned long *addr);
306#endif
307
308#define test_bit(nr,addr) \
309 (__builtin_constant_p(nr) ? \
310 constant_test_bit((nr),(addr)) : \
311 variable_test_bit((nr),(addr)))
312
313#undef ADDR
314
1#ifdef CONFIG_X86_32 315#ifdef CONFIG_X86_32
2# include "bitops_32.h" 316# include "bitops_32.h"
3#else 317#else
4# include "bitops_64.h" 318# include "bitops_64.h"
5#endif 319#endif
320
321#endif /* _ASM_X86_BITOPS_H */
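
The bt/bts/btr/btc instructions used above copy the addressed bit into the carry flag, and the following "sbb %0,%0" turns that flag into 0 or -1, which is how the test_and_* helpers return the old bit without a separate compare. A non-atomic C analogue of test_and_set_bit (illustration only; the real helper is a single locked bts, so the read and the set happen atomically):

	int test_and_set_bit_sketch(int nr, unsigned long *addr)
	{
		unsigned long *word = addr + nr / (8 * sizeof(unsigned long));
		unsigned long mask = 1UL << (nr % (8 * sizeof(unsigned long)));
		int oldbit = (*word & mask) != 0;	/* what bt + sbb compute */

		*word |= mask;				/* what bts performs */
		return oldbit;
	}
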
diff --git a/include/asm-x86/bitops_32.h b/include/asm-x86/bitops_32.h
index 0b40f6d20bea..e4d75fcf9c03 100644
--- a/include/asm-x86/bitops_32.h
+++ b/include/asm-x86/bitops_32.h
@@ -5,320 +5,12 @@
5 * Copyright 1992, Linus Torvalds. 5 * Copyright 1992, Linus Torvalds.
6 */ 6 */
7 7
8#ifndef _LINUX_BITOPS_H
9#error only <linux/bitops.h> can be included directly
10#endif
11
12#include <linux/compiler.h>
13#include <asm/alternative.h>
14
15/*
16 * These have to be done with inline assembly: that way the bit-setting
17 * is guaranteed to be atomic. All bit operations return 0 if the bit
18 * was cleared before the operation and != 0 if it was not.
19 *
20 * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
21 */
22
23#define ADDR (*(volatile long *) addr)
24
25/**
26 * set_bit - Atomically set a bit in memory
27 * @nr: the bit to set
28 * @addr: the address to start counting from
29 *
30 * This function is atomic and may not be reordered. See __set_bit()
31 * if you do not require the atomic guarantees.
32 *
33 * Note: there are no guarantees that this function will not be reordered
34 * on non x86 architectures, so if you are writing portable code,
35 * make sure not to rely on its reordering guarantees.
36 *
37 * Note that @nr may be almost arbitrarily large; this function is not
38 * restricted to acting on a single-word quantity.
39 */
40static inline void set_bit(int nr, volatile unsigned long * addr)
41{
42 __asm__ __volatile__( LOCK_PREFIX
43 "btsl %1,%0"
44 :"+m" (ADDR)
45 :"Ir" (nr));
46}
47
48/**
49 * __set_bit - Set a bit in memory
50 * @nr: the bit to set
51 * @addr: the address to start counting from
52 *
53 * Unlike set_bit(), this function is non-atomic and may be reordered.
54 * If it's called on the same region of memory simultaneously, the effect
55 * may be that only one operation succeeds.
56 */
57static inline void __set_bit(int nr, volatile unsigned long * addr)
58{
59 __asm__(
60 "btsl %1,%0"
61 :"+m" (ADDR)
62 :"Ir" (nr));
63}
64
65/**
66 * clear_bit - Clears a bit in memory
67 * @nr: Bit to clear
68 * @addr: Address to start counting from
69 *
70 * clear_bit() is atomic and may not be reordered. However, it does
71 * not contain a memory barrier, so if it is used for locking purposes,
72 * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
73 * in order to ensure changes are visible on other processors.
74 */
75static inline void clear_bit(int nr, volatile unsigned long * addr)
76{
77 __asm__ __volatile__( LOCK_PREFIX
78 "btrl %1,%0"
79 :"+m" (ADDR)
80 :"Ir" (nr));
81}
82
83/*
84 * clear_bit_unlock - Clears a bit in memory
85 * @nr: Bit to clear
86 * @addr: Address to start counting from
87 *
88 * clear_bit() is atomic and implies release semantics before the memory
89 * operation. It can be used for an unlock.
90 */
91static inline void clear_bit_unlock(unsigned long nr, volatile unsigned long *addr)
92{
93 barrier();
94 clear_bit(nr, addr);
95}
96
97static inline void __clear_bit(int nr, volatile unsigned long * addr)
98{
99 __asm__ __volatile__(
100 "btrl %1,%0"
101 :"+m" (ADDR)
102 :"Ir" (nr));
103}
104
105/*
106 * __clear_bit_unlock - Clears a bit in memory
107 * @nr: Bit to clear
108 * @addr: Address to start counting from
109 *
110 * __clear_bit() is non-atomic and implies release semantics before the memory
111 * operation. It can be used for an unlock if no other CPUs can concurrently
112 * modify other bits in the word.
113 *
114 * No memory barrier is required here, because x86 cannot reorder stores past
115 * older loads. Same principle as spin_unlock.
116 */
117static inline void __clear_bit_unlock(unsigned long nr, volatile unsigned long *addr)
118{
119 barrier();
120 __clear_bit(nr, addr);
121}
122
123#define smp_mb__before_clear_bit() barrier()
124#define smp_mb__after_clear_bit() barrier()
125
126/**
127 * __change_bit - Toggle a bit in memory
128 * @nr: the bit to change
129 * @addr: the address to start counting from
130 *
131 * Unlike change_bit(), this function is non-atomic and may be reordered.
132 * If it's called on the same region of memory simultaneously, the effect
133 * may be that only one operation succeeds.
134 */
135static inline void __change_bit(int nr, volatile unsigned long * addr)
136{
137 __asm__ __volatile__(
138 "btcl %1,%0"
139 :"+m" (ADDR)
140 :"Ir" (nr));
141}
142
143/**
144 * change_bit - Toggle a bit in memory
145 * @nr: Bit to change
146 * @addr: Address to start counting from
147 *
148 * change_bit() is atomic and may not be reordered on x86. It may be
149 * reordered on architectures other than x86.
150 * Note that @nr may be almost arbitrarily large; this function is not
151 * restricted to acting on a single-word quantity.
152 */
153static inline void change_bit(int nr, volatile unsigned long * addr)
154{
155 __asm__ __volatile__( LOCK_PREFIX
156 "btcl %1,%0"
157 :"+m" (ADDR)
158 :"Ir" (nr));
159}
160
161/**
162 * test_and_set_bit - Set a bit and return its old value
163 * @nr: Bit to set
164 * @addr: Address to count from
165 *
166 * This operation is atomic and cannot be reordered on x86.
167 * It may be reordered on architectures other than x86.
168 * It also implies a memory barrier.
169 */
170static inline int test_and_set_bit(int nr, volatile unsigned long * addr)
171{
172 int oldbit;
173
174 __asm__ __volatile__( LOCK_PREFIX
175 "btsl %2,%1\n\tsbbl %0,%0"
176 :"=r" (oldbit),"+m" (ADDR)
177 :"Ir" (nr) : "memory");
178 return oldbit;
179}
180
181/**
182 * test_and_set_bit_lock - Set a bit and return its old value for lock
183 * @nr: Bit to set
184 * @addr: Address to count from
185 *
186 * This is the same as test_and_set_bit on x86.
187 */
188static inline int test_and_set_bit_lock(int nr, volatile unsigned long *addr)
189{
190 return test_and_set_bit(nr, addr);
191}
192
193/**
194 * __test_and_set_bit - Set a bit and return its old value
195 * @nr: Bit to set
196 * @addr: Address to count from
197 *
198 * This operation is non-atomic and can be reordered.
199 * If two examples of this operation race, one can appear to succeed
200 * but actually fail. You must protect multiple accesses with a lock.
201 */
202static inline int __test_and_set_bit(int nr, volatile unsigned long * addr)
203{
204 int oldbit;
205
206 __asm__(
207 "btsl %2,%1\n\tsbbl %0,%0"
208 :"=r" (oldbit),"+m" (ADDR)
209 :"Ir" (nr));
210 return oldbit;
211}
212
213/**
214 * test_and_clear_bit - Clear a bit and return its old value
215 * @nr: Bit to clear
216 * @addr: Address to count from
217 *
218 * This operation is atomic and cannot be reordered.
219 * It can be reordered on architectures other than x86.
220 * It also implies a memory barrier.
221 */
222static inline int test_and_clear_bit(int nr, volatile unsigned long * addr)
223{
224 int oldbit;
225
226 __asm__ __volatile__( LOCK_PREFIX
227 "btrl %2,%1\n\tsbbl %0,%0"
228 :"=r" (oldbit),"+m" (ADDR)
229 :"Ir" (nr) : "memory");
230 return oldbit;
231}
232
233/**
234 * __test_and_clear_bit - Clear a bit and return its old value
235 * @nr: Bit to clear
236 * @addr: Address to count from
237 *
238 * This operation is non-atomic and can be reordered.
239 * If two examples of this operation race, one can appear to succeed
240 * but actually fail. You must protect multiple accesses with a lock.
241 */
242static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
243{
244 int oldbit;
245
246 __asm__(
247 "btrl %2,%1\n\tsbbl %0,%0"
248 :"=r" (oldbit),"+m" (ADDR)
249 :"Ir" (nr));
250 return oldbit;
251}
252
253/* WARNING: non atomic and it can be reordered! */
254static inline int __test_and_change_bit(int nr, volatile unsigned long *addr)
255{
256 int oldbit;
257
258 __asm__ __volatile__(
259 "btcl %2,%1\n\tsbbl %0,%0"
260 :"=r" (oldbit),"+m" (ADDR)
261 :"Ir" (nr) : "memory");
262 return oldbit;
263}
264
265/**
266 * test_and_change_bit - Change a bit and return its old value
267 * @nr: Bit to change
268 * @addr: Address to count from
269 *
270 * This operation is atomic and cannot be reordered.
271 * It also implies a memory barrier.
272 */
273static inline int test_and_change_bit(int nr, volatile unsigned long* addr)
274{
275 int oldbit;
276
277 __asm__ __volatile__( LOCK_PREFIX
278 "btcl %2,%1\n\tsbbl %0,%0"
279 :"=r" (oldbit),"+m" (ADDR)
280 :"Ir" (nr) : "memory");
281 return oldbit;
282}
283
284#if 0 /* Fool kernel-doc since it doesn't do macros yet */
285/**
286 * test_bit - Determine whether a bit is set
287 * @nr: bit number to test
288 * @addr: Address to start counting from
289 */
290static int test_bit(int nr, const volatile void * addr);
291#endif
292
293static __always_inline int constant_test_bit(int nr, const volatile unsigned long *addr)
294{
295 return ((1UL << (nr & 31)) & (addr[nr >> 5])) != 0;
296}
297
298static inline int variable_test_bit(int nr, const volatile unsigned long * addr)
299{
300 int oldbit;
301
302 __asm__ __volatile__(
303 "btl %2,%1\n\tsbbl %0,%0"
304 :"=r" (oldbit)
305 :"m" (ADDR),"Ir" (nr));
306 return oldbit;
307}
308
309#define test_bit(nr,addr) \
310(__builtin_constant_p(nr) ? \
311 constant_test_bit((nr),(addr)) : \
312 variable_test_bit((nr),(addr)))
313
314#undef ADDR
315
316/** 8/**
317 * find_first_zero_bit - find the first zero bit in a memory region 9 * find_first_zero_bit - find the first zero bit in a memory region
318 * @addr: The address to start the search at 10 * @addr: The address to start the search at
319 * @size: The maximum size to search 11 * @size: The maximum size to search
320 * 12 *
321 * Returns the bit-number of the first zero bit, not the number of the byte 13 * Returns the bit number of the first zero bit, not the number of the byte
322 * containing a bit. 14 * containing a bit.
323 */ 15 */
324static inline int find_first_zero_bit(const unsigned long *addr, unsigned size) 16static inline int find_first_zero_bit(const unsigned long *addr, unsigned size)
@@ -348,7 +40,7 @@ static inline int find_first_zero_bit(const unsigned long *addr, unsigned size)
348/** 40/**
349 * find_next_zero_bit - find the first zero bit in a memory region 41 * find_next_zero_bit - find the first zero bit in a memory region
350 * @addr: The address to base the search on 42 * @addr: The address to base the search on
351 * @offset: The bitnumber to start searching at 43 * @offset: The bit number to start searching at
352 * @size: The maximum size to search 44 * @size: The maximum size to search
353 */ 45 */
354int find_next_zero_bit(const unsigned long *addr, int size, int offset); 46int find_next_zero_bit(const unsigned long *addr, int size, int offset);
@@ -372,7 +64,7 @@ static inline unsigned long __ffs(unsigned long word)
372 * @addr: The address to start the search at 64 * @addr: The address to start the search at
373 * @size: The maximum size to search 65 * @size: The maximum size to search
374 * 66 *
375 * Returns the bit-number of the first set bit, not the number of the byte 67 * Returns the bit number of the first set bit, not the number of the byte
376 * containing a bit. 68 * containing a bit.
377 */ 69 */
378static inline unsigned find_first_bit(const unsigned long *addr, unsigned size) 70static inline unsigned find_first_bit(const unsigned long *addr, unsigned size)
@@ -391,7 +83,7 @@ static inline unsigned find_first_bit(const unsigned long *addr, unsigned size)
391/** 83/**
392 * find_next_bit - find the first set bit in a memory region 84 * find_next_bit - find the first set bit in a memory region
393 * @addr: The address to base the search on 85 * @addr: The address to base the search on
394 * @offset: The bitnumber to start searching at 86 * @offset: The bit number to start searching at
395 * @size: The maximum size to search 87 * @size: The maximum size to search
396 */ 88 */
397int find_next_bit(const unsigned long *addr, int size, int offset); 89int find_next_bit(const unsigned long *addr, int size, int offset);
@@ -460,10 +152,10 @@ static inline int fls(int x)
460 152
461#include <asm-generic/bitops/ext2-non-atomic.h> 153#include <asm-generic/bitops/ext2-non-atomic.h>
462 154
463#define ext2_set_bit_atomic(lock,nr,addr) \ 155#define ext2_set_bit_atomic(lock, nr, addr) \
464 test_and_set_bit((nr),(unsigned long*)addr) 156 test_and_set_bit((nr), (unsigned long *)addr)
465#define ext2_clear_bit_atomic(lock,nr, addr) \ 157#define ext2_clear_bit_atomic(lock, nr, addr) \
466 test_and_clear_bit((nr),(unsigned long*)addr) 158 test_and_clear_bit((nr), (unsigned long *)addr)
467 159
468#include <asm-generic/bitops/minix.h> 160#include <asm-generic/bitops/minix.h>
469 161
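
The find_first_zero_bit()/find_next_zero_bit() helpers that remain in bitops_32.h return a bit index, which callers typically use to grab a free slot in a bitmap and then mark it busy. A minimal sketch, assuming <linux/bitops.h> and a caller that already holds whatever lock protects the map (otherwise the test_and_set_bit() form would be needed):

	#define SLOT_BITS	64

	static unsigned long slot_map[SLOT_BITS / (8 * sizeof(unsigned long))];

	/* Returns a free slot index, or -1 if the map is full (illustrative). */
	static int alloc_slot(void)
	{
		int slot = find_first_zero_bit(slot_map, SLOT_BITS);

		if (slot >= SLOT_BITS)
			return -1;
		__set_bit(slot, slot_map);
		return slot;
	}
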
diff --git a/include/asm-x86/bitops_64.h b/include/asm-x86/bitops_64.h
index 766bcc0470a6..48adbf56ca60 100644
--- a/include/asm-x86/bitops_64.h
+++ b/include/asm-x86/bitops_64.h
@@ -5,303 +5,6 @@
5 * Copyright 1992, Linus Torvalds. 5 * Copyright 1992, Linus Torvalds.
6 */ 6 */
7 7
8#ifndef _LINUX_BITOPS_H
9#error only <linux/bitops.h> can be included directly
10#endif
11
12#include <asm/alternative.h>
13
14#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1)
15/* Technically wrong, but this avoids compilation errors on some gcc
16 versions. */
17#define ADDR "=m" (*(volatile long *) addr)
18#else
19#define ADDR "+m" (*(volatile long *) addr)
20#endif
21
22/**
23 * set_bit - Atomically set a bit in memory
24 * @nr: the bit to set
25 * @addr: the address to start counting from
26 *
27 * This function is atomic and may not be reordered. See __set_bit()
28 * if you do not require the atomic guarantees.
29 * Note that @nr may be almost arbitrarily large; this function is not
30 * restricted to acting on a single-word quantity.
31 */
32static inline void set_bit(int nr, volatile void *addr)
33{
34 __asm__ __volatile__( LOCK_PREFIX
35 "btsl %1,%0"
36 :ADDR
37 :"dIr" (nr) : "memory");
38}
39
40/**
41 * __set_bit - Set a bit in memory
42 * @nr: the bit to set
43 * @addr: the address to start counting from
44 *
45 * Unlike set_bit(), this function is non-atomic and may be reordered.
46 * If it's called on the same region of memory simultaneously, the effect
47 * may be that only one operation succeeds.
48 */
49static inline void __set_bit(int nr, volatile void *addr)
50{
51 __asm__ volatile(
52 "btsl %1,%0"
53 :ADDR
54 :"dIr" (nr) : "memory");
55}
56
57/**
58 * clear_bit - Clears a bit in memory
59 * @nr: Bit to clear
60 * @addr: Address to start counting from
61 *
62 * clear_bit() is atomic and may not be reordered. However, it does
63 * not contain a memory barrier, so if it is used for locking purposes,
64 * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
65 * in order to ensure changes are visible on other processors.
66 */
67static inline void clear_bit(int nr, volatile void *addr)
68{
69 __asm__ __volatile__( LOCK_PREFIX
70 "btrl %1,%0"
71 :ADDR
72 :"dIr" (nr));
73}
74
75/*
76 * clear_bit_unlock - Clears a bit in memory
77 * @nr: Bit to clear
78 * @addr: Address to start counting from
79 *
80 * clear_bit() is atomic and implies release semantics before the memory
81 * operation. It can be used for an unlock.
82 */
83static inline void clear_bit_unlock(unsigned long nr, volatile unsigned long *addr)
84{
85 barrier();
86 clear_bit(nr, addr);
87}
88
89static inline void __clear_bit(int nr, volatile void *addr)
90{
91 __asm__ __volatile__(
92 "btrl %1,%0"
93 :ADDR
94 :"dIr" (nr));
95}
96
97/*
98 * __clear_bit_unlock - Clears a bit in memory
99 * @nr: Bit to clear
100 * @addr: Address to start counting from
101 *
102 * __clear_bit() is non-atomic and implies release semantics before the memory
103 * operation. It can be used for an unlock if no other CPUs can concurrently
104 * modify other bits in the word.
105 *
106 * No memory barrier is required here, because x86 cannot reorder stores past
107 * older loads. Same principle as spin_unlock.
108 */
109static inline void __clear_bit_unlock(unsigned long nr, volatile unsigned long *addr)
110{
111 barrier();
112 __clear_bit(nr, addr);
113}
114
115#define smp_mb__before_clear_bit() barrier()
116#define smp_mb__after_clear_bit() barrier()
117
118/**
119 * __change_bit - Toggle a bit in memory
120 * @nr: the bit to change
121 * @addr: the address to start counting from
122 *
123 * Unlike change_bit(), this function is non-atomic and may be reordered.
124 * If it's called on the same region of memory simultaneously, the effect
125 * may be that only one operation succeeds.
126 */
127static inline void __change_bit(int nr, volatile void *addr)
128{
129 __asm__ __volatile__(
130 "btcl %1,%0"
131 :ADDR
132 :"dIr" (nr));
133}
134
135/**
136 * change_bit - Toggle a bit in memory
137 * @nr: Bit to change
138 * @addr: Address to start counting from
139 *
140 * change_bit() is atomic and may not be reordered.
141 * Note that @nr may be almost arbitrarily large; this function is not
142 * restricted to acting on a single-word quantity.
143 */
144static inline void change_bit(int nr, volatile void *addr)
145{
146 __asm__ __volatile__( LOCK_PREFIX
147 "btcl %1,%0"
148 :ADDR
149 :"dIr" (nr));
150}
151
152/**
153 * test_and_set_bit - Set a bit and return its old value
154 * @nr: Bit to set
155 * @addr: Address to count from
156 *
157 * This operation is atomic and cannot be reordered.
158 * It also implies a memory barrier.
159 */
160static inline int test_and_set_bit(int nr, volatile void *addr)
161{
162 int oldbit;
163
164 __asm__ __volatile__( LOCK_PREFIX
165 "btsl %2,%1\n\tsbbl %0,%0"
166 :"=r" (oldbit),ADDR
167 :"dIr" (nr) : "memory");
168 return oldbit;
169}
170
171/**
172 * test_and_set_bit_lock - Set a bit and return its old value for lock
173 * @nr: Bit to set
174 * @addr: Address to count from
175 *
176 * This is the same as test_and_set_bit on x86.
177 */
178static inline int test_and_set_bit_lock(int nr, volatile void *addr)
179{
180 return test_and_set_bit(nr, addr);
181}
182
183/**
184 * __test_and_set_bit - Set a bit and return its old value
185 * @nr: Bit to set
186 * @addr: Address to count from
187 *
188 * This operation is non-atomic and can be reordered.
189 * If two examples of this operation race, one can appear to succeed
190 * but actually fail. You must protect multiple accesses with a lock.
191 */
192static inline int __test_and_set_bit(int nr, volatile void *addr)
193{
194 int oldbit;
195
196 __asm__(
197 "btsl %2,%1\n\tsbbl %0,%0"
198 :"=r" (oldbit),ADDR
199 :"dIr" (nr));
200 return oldbit;
201}
202
203/**
204 * test_and_clear_bit - Clear a bit and return its old value
205 * @nr: Bit to clear
206 * @addr: Address to count from
207 *
208 * This operation is atomic and cannot be reordered.
209 * It also implies a memory barrier.
210 */
211static inline int test_and_clear_bit(int nr, volatile void *addr)
212{
213 int oldbit;
214
215 __asm__ __volatile__( LOCK_PREFIX
216 "btrl %2,%1\n\tsbbl %0,%0"
217 :"=r" (oldbit),ADDR
218 :"dIr" (nr) : "memory");
219 return oldbit;
220}
221
222/**
223 * __test_and_clear_bit - Clear a bit and return its old value
224 * @nr: Bit to clear
225 * @addr: Address to count from
226 *
227 * This operation is non-atomic and can be reordered.
228 * If two examples of this operation race, one can appear to succeed
229 * but actually fail. You must protect multiple accesses with a lock.
230 */
231static inline int __test_and_clear_bit(int nr, volatile void *addr)
232{
233 int oldbit;
234
235 __asm__(
236 "btrl %2,%1\n\tsbbl %0,%0"
237 :"=r" (oldbit),ADDR
238 :"dIr" (nr));
239 return oldbit;
240}
241
242/* WARNING: non atomic and it can be reordered! */
243static inline int __test_and_change_bit(int nr, volatile void *addr)
244{
245 int oldbit;
246
247 __asm__ __volatile__(
248 "btcl %2,%1\n\tsbbl %0,%0"
249 :"=r" (oldbit),ADDR
250 :"dIr" (nr) : "memory");
251 return oldbit;
252}
253
254/**
255 * test_and_change_bit - Change a bit and return its old value
256 * @nr: Bit to change
257 * @addr: Address to count from
258 *
259 * This operation is atomic and cannot be reordered.
260 * It also implies a memory barrier.
261 */
262static inline int test_and_change_bit(int nr, volatile void *addr)
263{
264 int oldbit;
265
266 __asm__ __volatile__( LOCK_PREFIX
267 "btcl %2,%1\n\tsbbl %0,%0"
268 :"=r" (oldbit),ADDR
269 :"dIr" (nr) : "memory");
270 return oldbit;
271}
272
273#if 0 /* Fool kernel-doc since it doesn't do macros yet */
274/**
275 * test_bit - Determine whether a bit is set
276 * @nr: bit number to test
277 * @addr: Address to start counting from
278 */
279static int test_bit(int nr, const volatile void *addr);
280#endif
281
282static inline int constant_test_bit(int nr, const volatile void *addr)
283{
284 return ((1UL << (nr & 31)) & (((const volatile unsigned int *) addr)[nr >> 5])) != 0;
285}
286
287static inline int variable_test_bit(int nr, volatile const void *addr)
288{
289 int oldbit;
290
291 __asm__ __volatile__(
292 "btl %2,%1\n\tsbbl %0,%0"
293 :"=r" (oldbit)
294 :"m" (*(volatile long *)addr),"dIr" (nr));
295 return oldbit;
296}
297
298#define test_bit(nr,addr) \
299(__builtin_constant_p(nr) ? \
300 constant_test_bit((nr),(addr)) : \
301 variable_test_bit((nr),(addr)))
302
303#undef ADDR
304
305extern long find_first_zero_bit(const unsigned long *addr, unsigned long size); 8extern long find_first_zero_bit(const unsigned long *addr, unsigned long size);
306extern long find_next_zero_bit(const unsigned long *addr, long size, long offset); 9extern long find_next_zero_bit(const unsigned long *addr, long size, long offset);
307extern long find_first_bit(const unsigned long *addr, unsigned long size); 10extern long find_first_bit(const unsigned long *addr, unsigned long size);
diff --git a/include/asm-x86/bootparam.h b/include/asm-x86/bootparam.h
index 19f3ddf2df4b..51151356840f 100644
--- a/include/asm-x86/bootparam.h
+++ b/include/asm-x86/bootparam.h
@@ -54,13 +54,14 @@ struct sys_desc_table {
54}; 54};
55 55
56struct efi_info { 56struct efi_info {
57 __u32 _pad1; 57 __u32 efi_loader_signature;
58 __u32 efi_systab; 58 __u32 efi_systab;
59 __u32 efi_memdesc_size; 59 __u32 efi_memdesc_size;
60 __u32 efi_memdesc_version; 60 __u32 efi_memdesc_version;
61 __u32 efi_memmap; 61 __u32 efi_memmap;
62 __u32 efi_memmap_size; 62 __u32 efi_memmap_size;
63 __u32 _pad2[2]; 63 __u32 efi_systab_hi;
64 __u32 efi_memmap_hi;
64}; 65};
65 66
66/* The so-called "zeropage" */ 67/* The so-called "zeropage" */
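
The efi_info change replaces the old padding with explicit high halves, so a 64-bit kernel can reassemble the full EFI system table and memory map addresses from two 32-bit fields. A sketch of the reassembly (struct and helper names are illustrative; only the field layout follows the patch):

	#include <stdint.h>

	struct efi_info_sketch {
		uint32_t efi_loader_signature;
		uint32_t efi_systab;
		uint32_t efi_memdesc_size;
		uint32_t efi_memdesc_version;
		uint32_t efi_memmap;
		uint32_t efi_memmap_size;
		uint32_t efi_systab_hi;
		uint32_t efi_memmap_hi;
	};

	/* Rebuild the 64-bit physical address from the split lo/hi fields. */
	static inline uint64_t efi_systab_addr(const struct efi_info_sketch *e)
	{
		return ((uint64_t)e->efi_systab_hi << 32) | e->efi_systab;
	}
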
diff --git a/include/asm-x86/bug.h b/include/asm-x86/bug.h
index fd8bdc639c48..8d477a201392 100644
--- a/include/asm-x86/bug.h
+++ b/include/asm-x86/bug.h
@@ -33,9 +33,6 @@
33 } while(0) 33 } while(0)
34#endif 34#endif
35 35
36void out_of_line_bug(void);
37#else /* CONFIG_BUG */
38static inline void out_of_line_bug(void) { }
39#endif /* !CONFIG_BUG */ 36#endif /* !CONFIG_BUG */
40 37
41#include <asm-generic/bug.h> 38#include <asm-generic/bug.h>
diff --git a/include/asm-x86/bugs.h b/include/asm-x86/bugs.h
index aac8317420af..3fcc30dc0731 100644
--- a/include/asm-x86/bugs.h
+++ b/include/asm-x86/bugs.h
@@ -1,6 +1,7 @@
1#ifndef _ASM_X86_BUGS_H 1#ifndef _ASM_X86_BUGS_H
2#define _ASM_X86_BUGS_H 2#define _ASM_X86_BUGS_H
3 3
4void check_bugs(void); 4extern void check_bugs(void);
5extern int ppro_with_ram_bug(void);
5 6
6#endif /* _ASM_X86_BUGS_H */ 7#endif /* _ASM_X86_BUGS_H */
diff --git a/include/asm-x86/cacheflush.h b/include/asm-x86/cacheflush.h
index 9411a2d3f19c..8dd8c5e3cc7f 100644
--- a/include/asm-x86/cacheflush.h
+++ b/include/asm-x86/cacheflush.h
@@ -24,18 +24,35 @@
24#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ 24#define copy_from_user_page(vma, page, vaddr, dst, src, len) \
25 memcpy(dst, src, len) 25 memcpy(dst, src, len)
26 26
27void global_flush_tlb(void); 27int __deprecated_for_modules change_page_attr(struct page *page, int numpages,
28int change_page_attr(struct page *page, int numpages, pgprot_t prot); 28 pgprot_t prot);
29int change_page_attr_addr(unsigned long addr, int numpages, pgprot_t prot); 29
30void clflush_cache_range(void *addr, int size); 30int set_pages_uc(struct page *page, int numpages);
31 31int set_pages_wb(struct page *page, int numpages);
32#ifdef CONFIG_DEBUG_PAGEALLOC 32int set_pages_x(struct page *page, int numpages);
33/* internal debugging function */ 33int set_pages_nx(struct page *page, int numpages);
34void kernel_map_pages(struct page *page, int numpages, int enable); 34int set_pages_ro(struct page *page, int numpages);
35#endif 35int set_pages_rw(struct page *page, int numpages);
36
37int set_memory_uc(unsigned long addr, int numpages);
38int set_memory_wb(unsigned long addr, int numpages);
39int set_memory_x(unsigned long addr, int numpages);
40int set_memory_nx(unsigned long addr, int numpages);
41int set_memory_ro(unsigned long addr, int numpages);
42int set_memory_rw(unsigned long addr, int numpages);
43int set_memory_np(unsigned long addr, int numpages);
44
45void clflush_cache_range(void *addr, unsigned int size);
36 46
37#ifdef CONFIG_DEBUG_RODATA 47#ifdef CONFIG_DEBUG_RODATA
38void mark_rodata_ro(void); 48void mark_rodata_ro(void);
39#endif 49#endif
50#ifdef CONFIG_DEBUG_RODATA_TEST
51void rodata_test(void);
52#else
53static inline void rodata_test(void)
54{
55}
56#endif
40 57
41#endif 58#endif
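
The new set_memory_*() calls take a kernel virtual address plus a page count and replace the old change_page_attr() interface with one helper per attribute. A hedged usage sketch (the table and its callers are hypothetical; addr is assumed page-aligned and inside the direct mapping):

	/* Write-protect one page of a table once it has been filled in,
	 * and make it writable again before the next update. */
	static void protect_table(unsigned long addr)
	{
		set_memory_ro(addr, 1);
	}

	static void unprotect_table(unsigned long addr)
	{
		set_memory_rw(addr, 1);
	}
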
diff --git a/include/asm-x86/calling.h b/include/asm-x86/calling.h
index 6f4f63af96e1..f13e62e2cb3e 100644
--- a/include/asm-x86/calling.h
+++ b/include/asm-x86/calling.h
@@ -1,162 +1,168 @@
1/* 1/*
2 * Some macros to handle stack frames in assembly. 2 * Some macros to handle stack frames in assembly.
3 */ 3 */
4 4
5#define R15 0
6#define R14 8
7#define R13 16
8#define R12 24
9#define RBP 32
10#define RBX 40
5 11
6#define R15 0
7#define R14 8
8#define R13 16
9#define R12 24
10#define RBP 32
11#define RBX 40
12/* arguments: interrupts/non-tracing syscalls only save up to here*/ 12/* arguments: interrupts/non-tracing syscalls only save up to here*/
13#define R11 48 13#define R11 48
14#define R10 56 14#define R10 56
15#define R9 64 15#define R9 64
16#define R8 72 16#define R8 72
17#define RAX 80 17#define RAX 80
18#define RCX 88 18#define RCX 88
19#define RDX 96 19#define RDX 96
20#define RSI 104 20#define RSI 104
21#define RDI 112 21#define RDI 112
22#define ORIG_RAX 120 /* + error_code */ 22#define ORIG_RAX 120 /* + error_code */
23/* end of arguments */ 23/* end of arguments */
24
24/* cpu exception frame or undefined in case of fast syscall. */ 25/* cpu exception frame or undefined in case of fast syscall. */
25#define RIP 128 26#define RIP 128
26#define CS 136 27#define CS 136
27#define EFLAGS 144 28#define EFLAGS 144
28#define RSP 152 29#define RSP 152
29#define SS 160 30#define SS 160
30#define ARGOFFSET R11 31
31#define SWFRAME ORIG_RAX 32#define ARGOFFSET R11
33#define SWFRAME ORIG_RAX
32 34
33 .macro SAVE_ARGS addskip=0,norcx=0,nor891011=0 35 .macro SAVE_ARGS addskip=0, norcx=0, nor891011=0
34 subq $9*8+\addskip,%rsp 36 subq $9*8+\addskip, %rsp
35 CFI_ADJUST_CFA_OFFSET 9*8+\addskip 37 CFI_ADJUST_CFA_OFFSET 9*8+\addskip
36 movq %rdi,8*8(%rsp) 38 movq %rdi, 8*8(%rsp)
37 CFI_REL_OFFSET rdi,8*8 39 CFI_REL_OFFSET rdi, 8*8
38 movq %rsi,7*8(%rsp) 40 movq %rsi, 7*8(%rsp)
39 CFI_REL_OFFSET rsi,7*8 41 CFI_REL_OFFSET rsi, 7*8
40 movq %rdx,6*8(%rsp) 42 movq %rdx, 6*8(%rsp)
41 CFI_REL_OFFSET rdx,6*8 43 CFI_REL_OFFSET rdx, 6*8
42 .if \norcx 44 .if \norcx
43 .else 45 .else
44 movq %rcx,5*8(%rsp) 46 movq %rcx, 5*8(%rsp)
45 CFI_REL_OFFSET rcx,5*8 47 CFI_REL_OFFSET rcx, 5*8
46 .endif 48 .endif
47 movq %rax,4*8(%rsp) 49 movq %rax, 4*8(%rsp)
48 CFI_REL_OFFSET rax,4*8 50 CFI_REL_OFFSET rax, 4*8
49 .if \nor891011 51 .if \nor891011
50 .else 52 .else
51 movq %r8,3*8(%rsp) 53 movq %r8, 3*8(%rsp)
52 CFI_REL_OFFSET r8,3*8 54 CFI_REL_OFFSET r8, 3*8
53 movq %r9,2*8(%rsp) 55 movq %r9, 2*8(%rsp)
54 CFI_REL_OFFSET r9,2*8 56 CFI_REL_OFFSET r9, 2*8
55 movq %r10,1*8(%rsp) 57 movq %r10, 1*8(%rsp)
56 CFI_REL_OFFSET r10,1*8 58 CFI_REL_OFFSET r10, 1*8
57 movq %r11,(%rsp) 59 movq %r11, (%rsp)
58 CFI_REL_OFFSET r11,0*8 60 CFI_REL_OFFSET r11, 0*8
59 .endif 61 .endif
60 .endm 62 .endm
61 63
62#define ARG_SKIP 9*8 64#define ARG_SKIP 9*8
63 .macro RESTORE_ARGS skiprax=0,addskip=0,skiprcx=0,skipr11=0,skipr8910=0,skiprdx=0 65
66 .macro RESTORE_ARGS skiprax=0, addskip=0, skiprcx=0, skipr11=0, \
67 skipr8910=0, skiprdx=0
64 .if \skipr11 68 .if \skipr11
65 .else 69 .else
66 movq (%rsp),%r11 70 movq (%rsp), %r11
67 CFI_RESTORE r11 71 CFI_RESTORE r11
68 .endif 72 .endif
69 .if \skipr8910 73 .if \skipr8910
70 .else 74 .else
71 movq 1*8(%rsp),%r10 75 movq 1*8(%rsp), %r10
72 CFI_RESTORE r10 76 CFI_RESTORE r10
73 movq 2*8(%rsp),%r9 77 movq 2*8(%rsp), %r9
74 CFI_RESTORE r9 78 CFI_RESTORE r9
75 movq 3*8(%rsp),%r8 79 movq 3*8(%rsp), %r8
76 CFI_RESTORE r8 80 CFI_RESTORE r8
77 .endif 81 .endif
78 .if \skiprax 82 .if \skiprax
79 .else 83 .else
80 movq 4*8(%rsp),%rax 84 movq 4*8(%rsp), %rax
81 CFI_RESTORE rax 85 CFI_RESTORE rax
82 .endif 86 .endif
83 .if \skiprcx 87 .if \skiprcx
84 .else 88 .else
85 movq 5*8(%rsp),%rcx 89 movq 5*8(%rsp), %rcx
86 CFI_RESTORE rcx 90 CFI_RESTORE rcx
87 .endif 91 .endif
88 .if \skiprdx 92 .if \skiprdx
89 .else 93 .else
90 movq 6*8(%rsp),%rdx 94 movq 6*8(%rsp), %rdx
91 CFI_RESTORE rdx 95 CFI_RESTORE rdx
92 .endif 96 .endif
93 movq 7*8(%rsp),%rsi 97 movq 7*8(%rsp), %rsi
94 CFI_RESTORE rsi 98 CFI_RESTORE rsi
95 movq 8*8(%rsp),%rdi 99 movq 8*8(%rsp), %rdi
96 CFI_RESTORE rdi 100 CFI_RESTORE rdi
97 .if ARG_SKIP+\addskip > 0 101 .if ARG_SKIP+\addskip > 0
98 addq $ARG_SKIP+\addskip,%rsp 102 addq $ARG_SKIP+\addskip, %rsp
99 CFI_ADJUST_CFA_OFFSET -(ARG_SKIP+\addskip) 103 CFI_ADJUST_CFA_OFFSET -(ARG_SKIP+\addskip)
100 .endif 104 .endif
101 .endm 105 .endm
102 106
103 .macro LOAD_ARGS offset 107 .macro LOAD_ARGS offset
104 movq \offset(%rsp),%r11 108 movq \offset(%rsp), %r11
105 movq \offset+8(%rsp),%r10 109 movq \offset+8(%rsp), %r10
106 movq \offset+16(%rsp),%r9 110 movq \offset+16(%rsp), %r9
107 movq \offset+24(%rsp),%r8 111 movq \offset+24(%rsp), %r8
108 movq \offset+40(%rsp),%rcx 112 movq \offset+40(%rsp), %rcx
109 movq \offset+48(%rsp),%rdx 113 movq \offset+48(%rsp), %rdx
110 movq \offset+56(%rsp),%rsi 114 movq \offset+56(%rsp), %rsi
111 movq \offset+64(%rsp),%rdi 115 movq \offset+64(%rsp), %rdi
112 movq \offset+72(%rsp),%rax 116 movq \offset+72(%rsp), %rax
113 .endm 117 .endm
114 118
115#define REST_SKIP 6*8 119#define REST_SKIP 6*8
120
116 .macro SAVE_REST 121 .macro SAVE_REST
117 subq $REST_SKIP,%rsp 122 subq $REST_SKIP, %rsp
118 CFI_ADJUST_CFA_OFFSET REST_SKIP 123 CFI_ADJUST_CFA_OFFSET REST_SKIP
119 movq %rbx,5*8(%rsp) 124 movq %rbx, 5*8(%rsp)
120 CFI_REL_OFFSET rbx,5*8 125 CFI_REL_OFFSET rbx, 5*8
121 movq %rbp,4*8(%rsp) 126 movq %rbp, 4*8(%rsp)
122 CFI_REL_OFFSET rbp,4*8 127 CFI_REL_OFFSET rbp, 4*8
123 movq %r12,3*8(%rsp) 128 movq %r12, 3*8(%rsp)
124 CFI_REL_OFFSET r12,3*8 129 CFI_REL_OFFSET r12, 3*8
125 movq %r13,2*8(%rsp) 130 movq %r13, 2*8(%rsp)
126 CFI_REL_OFFSET r13,2*8 131 CFI_REL_OFFSET r13, 2*8
127 movq %r14,1*8(%rsp) 132 movq %r14, 1*8(%rsp)
128 CFI_REL_OFFSET r14,1*8 133 CFI_REL_OFFSET r14, 1*8
129 movq %r15,(%rsp) 134 movq %r15, (%rsp)
130 CFI_REL_OFFSET r15,0*8 135 CFI_REL_OFFSET r15, 0*8
131 .endm 136 .endm
132 137
133 .macro RESTORE_REST 138 .macro RESTORE_REST
134 movq (%rsp),%r15 139 movq (%rsp), %r15
135 CFI_RESTORE r15 140 CFI_RESTORE r15
136 movq 1*8(%rsp),%r14 141 movq 1*8(%rsp), %r14
137 CFI_RESTORE r14 142 CFI_RESTORE r14
138 movq 2*8(%rsp),%r13 143 movq 2*8(%rsp), %r13
139 CFI_RESTORE r13 144 CFI_RESTORE r13
140 movq 3*8(%rsp),%r12 145 movq 3*8(%rsp), %r12
141 CFI_RESTORE r12 146 CFI_RESTORE r12
142 movq 4*8(%rsp),%rbp 147 movq 4*8(%rsp), %rbp
143 CFI_RESTORE rbp 148 CFI_RESTORE rbp
144 movq 5*8(%rsp),%rbx 149 movq 5*8(%rsp), %rbx
145 CFI_RESTORE rbx 150 CFI_RESTORE rbx
146 addq $REST_SKIP,%rsp 151 addq $REST_SKIP, %rsp
147 CFI_ADJUST_CFA_OFFSET -(REST_SKIP) 152 CFI_ADJUST_CFA_OFFSET -(REST_SKIP)
148 .endm 153 .endm
149 154
150 .macro SAVE_ALL 155 .macro SAVE_ALL
151 SAVE_ARGS 156 SAVE_ARGS
152 SAVE_REST 157 SAVE_REST
153 .endm 158 .endm
154 159
155 .macro RESTORE_ALL addskip=0 160 .macro RESTORE_ALL addskip=0
156 RESTORE_REST 161 RESTORE_REST
157 RESTORE_ARGS 0,\addskip 162 RESTORE_ARGS 0, \addskip
158 .endm 163 .endm
159 164
160 .macro icebp 165 .macro icebp
161 .byte 0xf1 166 .byte 0xf1
162 .endm 167 .endm
168
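
The R15..SS constants are byte offsets into the register save area that SAVE_REST plus SAVE_ARGS build on the stack: SAVE_ARGS reserves 9*8 bytes and stores %rdi at 8*8(%rsp), and once SAVE_REST has pushed another REST_SKIP (6*8) bytes below it, %rdi sits at 64 + 48 = 112, matching RDI above. A C sketch that checks the arithmetic with an illustrative mirror of the layout (not a kernel structure):

	#include <stddef.h>

	struct save_area_sketch {
		unsigned long r15, r14, r13, r12, rbp, rbx;	/* SAVE_REST */
		unsigned long r11, r10, r9, r8, rax, rcx;	/* SAVE_ARGS */
		unsigned long rdx, rsi, rdi, orig_rax;
		unsigned long rip, cs, eflags, rsp, ss;		/* hardware frame */
	};

	/* RDI == 112: 8*8 from SAVE_ARGS shifted down by REST_SKIP (6*8). */
	_Static_assert(offsetof(struct save_area_sketch, rdi) == 112,
		       "offset mismatch");
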
diff --git a/include/asm-x86/checksum_64.h b/include/asm-x86/checksum_64.h
index 419fe88a0342..e5f79997decc 100644
--- a/include/asm-x86/checksum_64.h
+++ b/include/asm-x86/checksum_64.h
@@ -4,7 +4,7 @@
4/* 4/*
5 * Checksums for x86-64 5 * Checksums for x86-64
6 * Copyright 2002 by Andi Kleen, SuSE Labs 6 * Copyright 2002 by Andi Kleen, SuSE Labs
7 * with some code from asm-i386/checksum.h 7 * with some code from asm-x86/checksum.h
8 */ 8 */
9 9
10#include <linux/compiler.h> 10#include <linux/compiler.h>
diff --git a/include/asm-x86/cmpxchg_32.h b/include/asm-x86/cmpxchg_32.h
index f86ede28f6dc..cea1dae288a7 100644
--- a/include/asm-x86/cmpxchg_32.h
+++ b/include/asm-x86/cmpxchg_32.h
@@ -105,15 +105,24 @@ static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int siz
105 105
106#ifdef CONFIG_X86_CMPXCHG 106#ifdef CONFIG_X86_CMPXCHG
107#define __HAVE_ARCH_CMPXCHG 1 107#define __HAVE_ARCH_CMPXCHG 1
108#define cmpxchg(ptr,o,n)\ 108#define cmpxchg(ptr, o, n) \
109 ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\ 109 ((__typeof__(*(ptr)))__cmpxchg((ptr), (unsigned long)(o), \
110 (unsigned long)(n),sizeof(*(ptr)))) 110 (unsigned long)(n), sizeof(*(ptr))))
111#define sync_cmpxchg(ptr,o,n)\ 111#define sync_cmpxchg(ptr, o, n) \
112 ((__typeof__(*(ptr)))__sync_cmpxchg((ptr),(unsigned long)(o),\ 112 ((__typeof__(*(ptr)))__sync_cmpxchg((ptr), (unsigned long)(o), \
113 (unsigned long)(n),sizeof(*(ptr)))) 113 (unsigned long)(n), sizeof(*(ptr))))
114#define cmpxchg_local(ptr,o,n)\ 114#define cmpxchg_local(ptr, o, n) \
115 ((__typeof__(*(ptr)))__cmpxchg_local((ptr),(unsigned long)(o),\ 115 ((__typeof__(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(o), \
116 (unsigned long)(n),sizeof(*(ptr)))) 116 (unsigned long)(n), sizeof(*(ptr))))
117#endif
118
119#ifdef CONFIG_X86_CMPXCHG64
120#define cmpxchg64(ptr, o, n) \
121 ((__typeof__(*(ptr)))__cmpxchg64((ptr), (unsigned long long)(o), \
122 (unsigned long long)(n)))
123#define cmpxchg64_local(ptr, o, n) \
124 ((__typeof__(*(ptr)))__cmpxchg64_local((ptr), (unsigned long long)(o),\
125 (unsigned long long)(n)))
117#endif 126#endif
118 127
119static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, 128static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
@@ -203,6 +212,34 @@ static inline unsigned long __cmpxchg_local(volatile void *ptr,
203 return old; 212 return old;
204} 213}
205 214
215static inline unsigned long long __cmpxchg64(volatile void *ptr,
216 unsigned long long old, unsigned long long new)
217{
218 unsigned long long prev;
219 __asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3"
220 : "=A"(prev)
221 : "b"((unsigned long)new),
222 "c"((unsigned long)(new >> 32)),
223 "m"(*__xg(ptr)),
224 "0"(old)
225 : "memory");
226 return prev;
227}
228
229static inline unsigned long long __cmpxchg64_local(volatile void *ptr,
230 unsigned long long old, unsigned long long new)
231{
232 unsigned long long prev;
233 __asm__ __volatile__("cmpxchg8b %3"
234 : "=A"(prev)
235 : "b"((unsigned long)new),
236 "c"((unsigned long)(new >> 32)),
237 "m"(*__xg(ptr)),
238 "0"(old)
239 : "memory");
240 return prev;
241}
242
206#ifndef CONFIG_X86_CMPXCHG 243#ifndef CONFIG_X86_CMPXCHG
207/* 244/*
208 * Building a kernel capable of running on 80386. It may be necessary to 245 * Building a kernel capable of running on 80386. It may be necessary to
@@ -228,7 +265,7 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
228 return old; 265 return old;
229} 266}
230 267
231#define cmpxchg(ptr,o,n) \ 268#define cmpxchg(ptr, o, n) \
232({ \ 269({ \
233 __typeof__(*(ptr)) __ret; \ 270 __typeof__(*(ptr)) __ret; \
234 if (likely(boot_cpu_data.x86 > 3)) \ 271 if (likely(boot_cpu_data.x86 > 3)) \
@@ -239,7 +276,7 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
239 (unsigned long)(n), sizeof(*(ptr))); \ 276 (unsigned long)(n), sizeof(*(ptr))); \
240 __ret; \ 277 __ret; \
241}) 278})
242#define cmpxchg_local(ptr,o,n) \ 279#define cmpxchg_local(ptr, o, n) \
243({ \ 280({ \
244 __typeof__(*(ptr)) __ret; \ 281 __typeof__(*(ptr)) __ret; \
245 if (likely(boot_cpu_data.x86 > 3)) \ 282 if (likely(boot_cpu_data.x86 > 3)) \
@@ -252,38 +289,37 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
252}) 289})
253#endif 290#endif
254 291
255static inline unsigned long long __cmpxchg64(volatile void *ptr, unsigned long long old, 292#ifndef CONFIG_X86_CMPXCHG64
256 unsigned long long new) 293/*
257{ 294 * Building a kernel capable of running on 80386 and 80486. It may be necessary
258 unsigned long long prev; 295 * to simulate the cmpxchg8b on the 80386 and 80486 CPU.
259 __asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3" 296 */
260 : "=A"(prev)
261 : "b"((unsigned long)new),
262 "c"((unsigned long)(new >> 32)),
263 "m"(*__xg(ptr)),
264 "0"(old)
265 : "memory");
266 return prev;
267}
268 297
269static inline unsigned long long __cmpxchg64_local(volatile void *ptr, 298extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64);
270 unsigned long long old, unsigned long long new) 299
271{ 300#define cmpxchg64(ptr, o, n) \
272 unsigned long long prev; 301({ \
273 __asm__ __volatile__("cmpxchg8b %3" 302 __typeof__(*(ptr)) __ret; \
274 : "=A"(prev) 303 if (likely(boot_cpu_data.x86 > 4)) \
275 : "b"((unsigned long)new), 304 __ret = __cmpxchg64((ptr), (unsigned long long)(o), \
276 "c"((unsigned long)(new >> 32)), 305 (unsigned long long)(n)); \
277 "m"(*__xg(ptr)), 306 else \
278 "0"(old) 307 __ret = cmpxchg_486_u64((ptr), (unsigned long long)(o), \
279 : "memory"); 308 (unsigned long long)(n)); \
280 return prev; 309 __ret; \
281} 310})
311#define cmpxchg64_local(ptr, o, n) \
312({ \
313 __typeof__(*(ptr)) __ret; \
314 if (likely(boot_cpu_data.x86 > 4)) \
315 __ret = __cmpxchg64_local((ptr), (unsigned long long)(o), \
316 (unsigned long long)(n)); \
317 else \
318 __ret = cmpxchg_486_u64((ptr), (unsigned long long)(o), \
319 (unsigned long long)(n)); \
320 __ret; \
321})
322
323#endif
282 324
283#define cmpxchg64(ptr,o,n)\
284 ((__typeof__(*(ptr)))__cmpxchg64((ptr),(unsigned long long)(o),\
285 (unsigned long long)(n)))
286#define cmpxchg64_local(ptr,o,n)\
287 ((__typeof__(*(ptr)))__cmpxchg64_local((ptr),(unsigned long long)(o),\
288 (unsigned long long)(n)))
289#endif 325#endif
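
cmpxchg()/cmpxchg64() compare the value at ptr with o and install n only if they still match, returning the old value, so updates are usually written as a retry loop. A hedged sketch of that pattern using the cmpxchg64() form added above (the counter and its callers are hypothetical; on a 386/486 the cmpxchg_486_u64() fallback is used transparently):

	/* Add "delta" to a shared 64-bit counter; retried until no other CPU
	 * changed the value between the read and the compare-and-swap. */
	static void counter_add(volatile unsigned long long *counter,
				unsigned long long delta)
	{
		unsigned long long old, new;

		do {
			old = *counter;
			new = old + delta;
		} while (cmpxchg64(counter, old, new) != old);
	}
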
diff --git a/include/asm-x86/compat.h b/include/asm-x86/compat.h
index 66ba7987184a..b270ee04959e 100644
--- a/include/asm-x86/compat.h
+++ b/include/asm-x86/compat.h
@@ -207,7 +207,7 @@ static inline compat_uptr_t ptr_to_compat(void __user *uptr)
207static __inline__ void __user *compat_alloc_user_space(long len) 207static __inline__ void __user *compat_alloc_user_space(long len)
208{ 208{
209 struct pt_regs *regs = task_pt_regs(current); 209 struct pt_regs *regs = task_pt_regs(current);
210 return (void __user *)regs->rsp - len; 210 return (void __user *)regs->sp - len;
211} 211}
212 212
213static inline int is_compat_task(void) 213static inline int is_compat_task(void)
diff --git a/include/asm-x86/cpu.h b/include/asm-x86/cpu.h
index b1bc7b1b64b0..85ece5f10e9e 100644
--- a/include/asm-x86/cpu.h
+++ b/include/asm-x86/cpu.h
@@ -7,7 +7,7 @@
7#include <linux/nodemask.h> 7#include <linux/nodemask.h>
8#include <linux/percpu.h> 8#include <linux/percpu.h>
9 9
10struct i386_cpu { 10struct x86_cpu {
11 struct cpu cpu; 11 struct cpu cpu;
12}; 12};
13extern int arch_register_cpu(int num); 13extern int arch_register_cpu(int num);
diff --git a/include/asm-x86/cpufeature.h b/include/asm-x86/cpufeature.h
index b7160a4598d7..3fb7dfa7fc91 100644
--- a/include/asm-x86/cpufeature.h
+++ b/include/asm-x86/cpufeature.h
@@ -1,5 +1,207 @@
1#ifdef CONFIG_X86_32 1/*
2# include "cpufeature_32.h" 2 * Defines x86 CPU feature bits
3 */
4#ifndef _ASM_X86_CPUFEATURE_H
5#define _ASM_X86_CPUFEATURE_H
6
7#ifndef __ASSEMBLY__
8#include <linux/bitops.h>
9#endif
10#include <asm/required-features.h>
11
12#define NCAPINTS 8 /* N 32-bit words worth of info */
13
14/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */
15#define X86_FEATURE_FPU (0*32+ 0) /* Onboard FPU */
16#define X86_FEATURE_VME (0*32+ 1) /* Virtual Mode Extensions */
17#define X86_FEATURE_DE (0*32+ 2) /* Debugging Extensions */
18#define X86_FEATURE_PSE (0*32+ 3) /* Page Size Extensions */
19#define X86_FEATURE_TSC (0*32+ 4) /* Time Stamp Counter */
20#define X86_FEATURE_MSR (0*32+ 5) /* Model-Specific Registers, RDMSR, WRMSR */
21#define X86_FEATURE_PAE (0*32+ 6) /* Physical Address Extensions */
22#define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Architecture */
23#define X86_FEATURE_CX8 (0*32+ 8) /* CMPXCHG8 instruction */
24#define X86_FEATURE_APIC (0*32+ 9) /* Onboard APIC */
25#define X86_FEATURE_SEP (0*32+11) /* SYSENTER/SYSEXIT */
26#define X86_FEATURE_MTRR (0*32+12) /* Memory Type Range Registers */
27#define X86_FEATURE_PGE (0*32+13) /* Page Global Enable */
28#define X86_FEATURE_MCA (0*32+14) /* Machine Check Architecture */
29#define X86_FEATURE_CMOV (0*32+15) /* CMOV instruction (FCMOVCC and FCOMI too if FPU present) */
30#define X86_FEATURE_PAT (0*32+16) /* Page Attribute Table */
31#define X86_FEATURE_PSE36 (0*32+17) /* 36-bit PSEs */
32#define X86_FEATURE_PN (0*32+18) /* Processor serial number */
33#define X86_FEATURE_CLFLSH (0*32+19) /* Supports the CLFLUSH instruction */
34#define X86_FEATURE_DS (0*32+21) /* Debug Store */
35#define X86_FEATURE_ACPI (0*32+22) /* ACPI via MSR */
36#define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */
37#define X86_FEATURE_FXSR (0*32+24) /* FXSAVE and FXRSTOR instructions (fast save and restore */
38 /* of FPU context), and CR4.OSFXSR available */
39#define X86_FEATURE_XMM (0*32+25) /* Streaming SIMD Extensions */
40#define X86_FEATURE_XMM2 (0*32+26) /* Streaming SIMD Extensions-2 */
41#define X86_FEATURE_SELFSNOOP (0*32+27) /* CPU self snoop */
42#define X86_FEATURE_HT (0*32+28) /* Hyper-Threading */
43#define X86_FEATURE_ACC (0*32+29) /* Automatic clock control */
44#define X86_FEATURE_IA64 (0*32+30) /* IA-64 processor */
45
46/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */
47/* Don't duplicate feature flags which are redundant with Intel! */
48#define X86_FEATURE_SYSCALL (1*32+11) /* SYSCALL/SYSRET */
49#define X86_FEATURE_MP (1*32+19) /* MP Capable. */
50#define X86_FEATURE_NX (1*32+20) /* Execute Disable */
51#define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */
52#define X86_FEATURE_RDTSCP (1*32+27) /* RDTSCP */
53#define X86_FEATURE_LM (1*32+29) /* Long Mode (x86-64) */
54#define X86_FEATURE_3DNOWEXT (1*32+30) /* AMD 3DNow! extensions */
55#define X86_FEATURE_3DNOW (1*32+31) /* 3DNow! */
56
57/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */
58#define X86_FEATURE_RECOVERY (2*32+ 0) /* CPU in recovery mode */
59#define X86_FEATURE_LONGRUN (2*32+ 1) /* Longrun power control */
60#define X86_FEATURE_LRTI (2*32+ 3) /* LongRun table interface */
61
62/* Other features, Linux-defined mapping, word 3 */
63/* This range is used for feature bits which conflict or are synthesized */
64#define X86_FEATURE_CXMMX (3*32+ 0) /* Cyrix MMX extensions */
65#define X86_FEATURE_K6_MTRR (3*32+ 1) /* AMD K6 nonstandard MTRRs */
66#define X86_FEATURE_CYRIX_ARR (3*32+ 2) /* Cyrix ARRs (= MTRRs) */
67#define X86_FEATURE_CENTAUR_MCR (3*32+ 3) /* Centaur MCRs (= MTRRs) */
68/* cpu types for specific tunings: */
69#define X86_FEATURE_K8 (3*32+ 4) /* Opteron, Athlon64 */
70#define X86_FEATURE_K7 (3*32+ 5) /* Athlon */
71#define X86_FEATURE_P3 (3*32+ 6) /* P3 */
72#define X86_FEATURE_P4 (3*32+ 7) /* P4 */
73#define X86_FEATURE_CONSTANT_TSC (3*32+ 8) /* TSC ticks at a constant rate */
74#define X86_FEATURE_UP (3*32+ 9) /* smp kernel running on up */
75#define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* FXSAVE leaks FOP/FIP/FDP */
76#define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */
77#define X86_FEATURE_PEBS (3*32+12) /* Precise-Event Based Sampling */
78#define X86_FEATURE_BTS (3*32+13) /* Branch Trace Store */
79/* 14 free */
80/* 15 free */
81#define X86_FEATURE_REP_GOOD (3*32+16) /* rep microcode works well on this CPU */
82#define X86_FEATURE_MFENCE_RDTSC (3*32+17) /* Mfence synchronizes RDTSC */
83#define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* Lfence synchronizes RDTSC */
84
85/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
86#define X86_FEATURE_XMM3 (4*32+ 0) /* Streaming SIMD Extensions-3 */
87#define X86_FEATURE_MWAIT (4*32+ 3) /* Monitor/Mwait support */
88#define X86_FEATURE_DSCPL (4*32+ 4) /* CPL Qualified Debug Store */
89#define X86_FEATURE_EST (4*32+ 7) /* Enhanced SpeedStep */
90#define X86_FEATURE_TM2 (4*32+ 8) /* Thermal Monitor 2 */
91#define X86_FEATURE_CID (4*32+10) /* Context ID */
92#define X86_FEATURE_CX16 (4*32+13) /* CMPXCHG16B */
93#define X86_FEATURE_XTPR (4*32+14) /* Send Task Priority Messages */
94#define X86_FEATURE_DCA (4*32+18) /* Direct Cache Access */
95
96/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
97#define X86_FEATURE_XSTORE (5*32+ 2) /* on-CPU RNG present (xstore insn) */
98#define X86_FEATURE_XSTORE_EN (5*32+ 3) /* on-CPU RNG enabled */
99#define X86_FEATURE_XCRYPT (5*32+ 6) /* on-CPU crypto (xcrypt insn) */
100#define X86_FEATURE_XCRYPT_EN (5*32+ 7) /* on-CPU crypto enabled */
101#define X86_FEATURE_ACE2 (5*32+ 8) /* Advanced Cryptography Engine v2 */
102#define X86_FEATURE_ACE2_EN (5*32+ 9) /* ACE v2 enabled */
103#define X86_FEATURE_PHE (5*32+ 10) /* PadLock Hash Engine */
104#define X86_FEATURE_PHE_EN (5*32+ 11) /* PHE enabled */
105#define X86_FEATURE_PMM (5*32+ 12) /* PadLock Montgomery Multiplier */
106#define X86_FEATURE_PMM_EN (5*32+ 13) /* PMM enabled */
107
108/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */
109#define X86_FEATURE_LAHF_LM (6*32+ 0) /* LAHF/SAHF in long mode */
110#define X86_FEATURE_CMP_LEGACY (6*32+ 1) /* If set, HyperThreading is not valid */
111
112/*
113 * Auxiliary flags: Linux defined - For features scattered in various
114 * CPUID levels like 0x6, 0xA etc
115 */
116#define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */
117
118#define cpu_has(c, bit) \
119 (__builtin_constant_p(bit) && \
120 ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) || \
121 (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) || \
122 (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) || \
123 (((bit)>>5)==3 && (1UL<<((bit)&31) & REQUIRED_MASK3)) || \
124 (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4)) || \
125 (((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5)) || \
126 (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \
127 (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) ) \
128 ? 1 : \
129 test_bit(bit, (unsigned long *)((c)->x86_capability)))
130#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit)
131
132#define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability))
133#define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability))
134#define setup_clear_cpu_cap(bit) do { \
135 clear_cpu_cap(&boot_cpu_data, bit); \
136 set_bit(bit, cleared_cpu_caps); \
137} while (0)
138#define setup_force_cpu_cap(bit) do { \
139 set_cpu_cap(&boot_cpu_data, bit); \
140 clear_bit(bit, cleared_cpu_caps); \
141} while (0)
142
143#define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU)
144#define cpu_has_vme boot_cpu_has(X86_FEATURE_VME)
145#define cpu_has_de boot_cpu_has(X86_FEATURE_DE)
146#define cpu_has_pse boot_cpu_has(X86_FEATURE_PSE)
147#define cpu_has_tsc boot_cpu_has(X86_FEATURE_TSC)
148#define cpu_has_pae boot_cpu_has(X86_FEATURE_PAE)
149#define cpu_has_pge boot_cpu_has(X86_FEATURE_PGE)
150#define cpu_has_apic boot_cpu_has(X86_FEATURE_APIC)
151#define cpu_has_sep boot_cpu_has(X86_FEATURE_SEP)
152#define cpu_has_mtrr boot_cpu_has(X86_FEATURE_MTRR)
153#define cpu_has_mmx boot_cpu_has(X86_FEATURE_MMX)
154#define cpu_has_fxsr boot_cpu_has(X86_FEATURE_FXSR)
155#define cpu_has_xmm boot_cpu_has(X86_FEATURE_XMM)
156#define cpu_has_xmm2 boot_cpu_has(X86_FEATURE_XMM2)
157#define cpu_has_xmm3 boot_cpu_has(X86_FEATURE_XMM3)
158#define cpu_has_ht boot_cpu_has(X86_FEATURE_HT)
159#define cpu_has_mp boot_cpu_has(X86_FEATURE_MP)
160#define cpu_has_nx boot_cpu_has(X86_FEATURE_NX)
161#define cpu_has_k6_mtrr boot_cpu_has(X86_FEATURE_K6_MTRR)
162#define cpu_has_cyrix_arr boot_cpu_has(X86_FEATURE_CYRIX_ARR)
163#define cpu_has_centaur_mcr boot_cpu_has(X86_FEATURE_CENTAUR_MCR)
164#define cpu_has_xstore boot_cpu_has(X86_FEATURE_XSTORE)
165#define cpu_has_xstore_enabled boot_cpu_has(X86_FEATURE_XSTORE_EN)
166#define cpu_has_xcrypt boot_cpu_has(X86_FEATURE_XCRYPT)
167#define cpu_has_xcrypt_enabled boot_cpu_has(X86_FEATURE_XCRYPT_EN)
168#define cpu_has_ace2 boot_cpu_has(X86_FEATURE_ACE2)
169#define cpu_has_ace2_enabled boot_cpu_has(X86_FEATURE_ACE2_EN)
170#define cpu_has_phe boot_cpu_has(X86_FEATURE_PHE)
171#define cpu_has_phe_enabled boot_cpu_has(X86_FEATURE_PHE_EN)
172#define cpu_has_pmm boot_cpu_has(X86_FEATURE_PMM)
173#define cpu_has_pmm_enabled boot_cpu_has(X86_FEATURE_PMM_EN)
174#define cpu_has_ds boot_cpu_has(X86_FEATURE_DS)
175#define cpu_has_pebs boot_cpu_has(X86_FEATURE_PEBS)
176#define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLSH)
177#define cpu_has_bts boot_cpu_has(X86_FEATURE_BTS)
178
179#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
180# define cpu_has_invlpg 1
3#else 181#else
4# include "cpufeature_64.h" 182# define cpu_has_invlpg (boot_cpu_data.x86 > 3)
5#endif 183#endif
184
185#ifdef CONFIG_X86_64
186
187#undef cpu_has_vme
188#define cpu_has_vme 0
189
190#undef cpu_has_pae
191#define cpu_has_pae ___BUG___
192
193#undef cpu_has_mp
194#define cpu_has_mp 1
195
196#undef cpu_has_k6_mtrr
197#define cpu_has_k6_mtrr 0
198
199#undef cpu_has_cyrix_arr
200#define cpu_has_cyrix_arr 0
201
202#undef cpu_has_centaur_mcr
203#define cpu_has_centaur_mcr 0
204
205#endif /* CONFIG_X86_64 */
206
207#endif /* _ASM_X86_CPUFEATURE_H */
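A brief illustrative aside on the cpu_has() construction kept by the unified header: because the bit arguments are compile-time constants, any feature present in the REQUIRED_MASK* words from <asm/required-features.h> collapses to a constant 1 and the runtime test_bit() path is never taken. The self-contained userspace C sketch below is illustrative only, not kernel code; every fake_/MY_ identifier is invented for the example, my_test_bit() stands in for the kernel's test_bit(), and only the shape of the macro mirrors the header.

/*
 * Standalone sketch (not kernel code) of the constant-folding trick used by
 * cpu_has(): when the queried bit is part of the build-time required mask,
 * the check reduces to the constant 1; otherwise it falls back to a bitmap
 * lookup in the per-CPU capability words.
 */
#include <stdio.h>

#define MY_REQUIRED_MASK0  ((1u << 0) | (1u << 4))   /* pretend FPU and TSC are required */

struct fake_cpuinfo {
	unsigned int x86_capability[8];               /* NCAPINTS words */
};

static int my_test_bit(int bit, const unsigned int *words)
{
	return (words[bit >> 5] >> (bit & 31)) & 1;
}

#define fake_cpu_has(c, bit)						\
	(__builtin_constant_p(bit) &&					\
	 ((bit) >> 5) == 0 &&						\
	 (1u << ((bit) & 31) & MY_REQUIRED_MASK0)			\
		? 1							\
		: my_test_bit(bit, (c)->x86_capability))

int main(void)
{
	struct fake_cpuinfo c = { .x86_capability = { 1u << 25 } }; /* only bit 25 (XMM) set */

	printf("FPU: %d\n", fake_cpu_has(&c, 0));         /* 1: required feature, folds to a constant */
	printf("XMM: %d\n", fake_cpu_has(&c, 25));        /* 1: found via the capability bitmap       */
	printf("NX : %d\n", fake_cpu_has(&c, 1*32 + 20)); /* 0: runtime lookup in word 1              */
	return 0;
}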
diff --git a/include/asm-x86/cpufeature_32.h b/include/asm-x86/cpufeature_32.h
deleted file mode 100644
index f17e688dfb05..000000000000
--- a/include/asm-x86/cpufeature_32.h
+++ /dev/null
@@ -1,176 +0,0 @@
1/*
2 * cpufeature.h
3 *
4 * Defines x86 CPU feature bits
5 */
6
7#ifndef __ASM_I386_CPUFEATURE_H
8#define __ASM_I386_CPUFEATURE_H
9
10#ifndef __ASSEMBLY__
11#include <linux/bitops.h>
12#endif
13#include <asm/required-features.h>
14
15#define NCAPINTS 8 /* N 32-bit words worth of info */
16
17/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */
18#define X86_FEATURE_FPU (0*32+ 0) /* Onboard FPU */
19#define X86_FEATURE_VME (0*32+ 1) /* Virtual Mode Extensions */
20#define X86_FEATURE_DE (0*32+ 2) /* Debugging Extensions */
21#define X86_FEATURE_PSE (0*32+ 3) /* Page Size Extensions */
22#define X86_FEATURE_TSC (0*32+ 4) /* Time Stamp Counter */
23#define X86_FEATURE_MSR (0*32+ 5) /* Model-Specific Registers, RDMSR, WRMSR */
24#define X86_FEATURE_PAE (0*32+ 6) /* Physical Address Extensions */
25#define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Architecture */
26#define X86_FEATURE_CX8 (0*32+ 8) /* CMPXCHG8 instruction */
27#define X86_FEATURE_APIC (0*32+ 9) /* Onboard APIC */
28#define X86_FEATURE_SEP (0*32+11) /* SYSENTER/SYSEXIT */
29#define X86_FEATURE_MTRR (0*32+12) /* Memory Type Range Registers */
30#define X86_FEATURE_PGE (0*32+13) /* Page Global Enable */
31#define X86_FEATURE_MCA (0*32+14) /* Machine Check Architecture */
32#define X86_FEATURE_CMOV (0*32+15) /* CMOV instruction (FCMOVCC and FCOMI too if FPU present) */
33#define X86_FEATURE_PAT (0*32+16) /* Page Attribute Table */
34#define X86_FEATURE_PSE36 (0*32+17) /* 36-bit PSEs */
35#define X86_FEATURE_PN (0*32+18) /* Processor serial number */
36#define X86_FEATURE_CLFLSH (0*32+19) /* Supports the CLFLUSH instruction */
37#define X86_FEATURE_DS (0*32+21) /* Debug Store */
38#define X86_FEATURE_ACPI (0*32+22) /* ACPI via MSR */
39#define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */
40#define X86_FEATURE_FXSR (0*32+24) /* FXSAVE and FXRSTOR instructions (fast save and restore */
41 /* of FPU context), and CR4.OSFXSR available */
42#define X86_FEATURE_XMM (0*32+25) /* Streaming SIMD Extensions */
43#define X86_FEATURE_XMM2 (0*32+26) /* Streaming SIMD Extensions-2 */
44#define X86_FEATURE_SELFSNOOP (0*32+27) /* CPU self snoop */
45#define X86_FEATURE_HT (0*32+28) /* Hyper-Threading */
46#define X86_FEATURE_ACC (0*32+29) /* Automatic clock control */
47#define X86_FEATURE_IA64 (0*32+30) /* IA-64 processor */
48
49/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */
50/* Don't duplicate feature flags which are redundant with Intel! */
51#define X86_FEATURE_SYSCALL (1*32+11) /* SYSCALL/SYSRET */
52#define X86_FEATURE_MP (1*32+19) /* MP Capable. */
53#define X86_FEATURE_NX (1*32+20) /* Execute Disable */
54#define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */
55#define X86_FEATURE_RDTSCP (1*32+27) /* RDTSCP */
56#define X86_FEATURE_LM (1*32+29) /* Long Mode (x86-64) */
57#define X86_FEATURE_3DNOWEXT (1*32+30) /* AMD 3DNow! extensions */
58#define X86_FEATURE_3DNOW (1*32+31) /* 3DNow! */
59
60/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */
61#define X86_FEATURE_RECOVERY (2*32+ 0) /* CPU in recovery mode */
62#define X86_FEATURE_LONGRUN (2*32+ 1) /* Longrun power control */
63#define X86_FEATURE_LRTI (2*32+ 3) /* LongRun table interface */
64
65/* Other features, Linux-defined mapping, word 3 */
66/* This range is used for feature bits which conflict or are synthesized */
67#define X86_FEATURE_CXMMX (3*32+ 0) /* Cyrix MMX extensions */
68#define X86_FEATURE_K6_MTRR (3*32+ 1) /* AMD K6 nonstandard MTRRs */
69#define X86_FEATURE_CYRIX_ARR (3*32+ 2) /* Cyrix ARRs (= MTRRs) */
70#define X86_FEATURE_CENTAUR_MCR (3*32+ 3) /* Centaur MCRs (= MTRRs) */
71/* cpu types for specific tunings: */
72#define X86_FEATURE_K8 (3*32+ 4) /* Opteron, Athlon64 */
73#define X86_FEATURE_K7 (3*32+ 5) /* Athlon */
74#define X86_FEATURE_P3 (3*32+ 6) /* P3 */
75#define X86_FEATURE_P4 (3*32+ 7) /* P4 */
76#define X86_FEATURE_CONSTANT_TSC (3*32+ 8) /* TSC ticks at a constant rate */
77#define X86_FEATURE_UP (3*32+ 9) /* smp kernel running on up */
78#define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* FXSAVE leaks FOP/FIP/FOP */
79#define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */
80#define X86_FEATURE_PEBS (3*32+12) /* Precise-Event Based Sampling */
81#define X86_FEATURE_BTS (3*32+13) /* Branch Trace Store */
82/* 14 free */
83#define X86_FEATURE_SYNC_RDTSC (3*32+15) /* RDTSC synchronizes the CPU */
84#define X86_FEATURE_REP_GOOD (3*32+16) /* rep microcode works well on this CPU */
85
86/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
87#define X86_FEATURE_XMM3 (4*32+ 0) /* Streaming SIMD Extensions-3 */
88#define X86_FEATURE_MWAIT (4*32+ 3) /* Monitor/Mwait support */
89#define X86_FEATURE_DSCPL (4*32+ 4) /* CPL Qualified Debug Store */
90#define X86_FEATURE_EST (4*32+ 7) /* Enhanced SpeedStep */
91#define X86_FEATURE_TM2 (4*32+ 8) /* Thermal Monitor 2 */
92#define X86_FEATURE_CID (4*32+10) /* Context ID */
93#define X86_FEATURE_CX16 (4*32+13) /* CMPXCHG16B */
94#define X86_FEATURE_XTPR (4*32+14) /* Send Task Priority Messages */
95#define X86_FEATURE_DCA (4*32+18) /* Direct Cache Access */
96
97/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
98#define X86_FEATURE_XSTORE (5*32+ 2) /* on-CPU RNG present (xstore insn) */
99#define X86_FEATURE_XSTORE_EN (5*32+ 3) /* on-CPU RNG enabled */
100#define X86_FEATURE_XCRYPT (5*32+ 6) /* on-CPU crypto (xcrypt insn) */
101#define X86_FEATURE_XCRYPT_EN (5*32+ 7) /* on-CPU crypto enabled */
102#define X86_FEATURE_ACE2 (5*32+ 8) /* Advanced Cryptography Engine v2 */
103#define X86_FEATURE_ACE2_EN (5*32+ 9) /* ACE v2 enabled */
104#define X86_FEATURE_PHE (5*32+ 10) /* PadLock Hash Engine */
105#define X86_FEATURE_PHE_EN (5*32+ 11) /* PHE enabled */
106#define X86_FEATURE_PMM (5*32+ 12) /* PadLock Montgomery Multiplier */
107#define X86_FEATURE_PMM_EN (5*32+ 13) /* PMM enabled */
108
109/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */
110#define X86_FEATURE_LAHF_LM (6*32+ 0) /* LAHF/SAHF in long mode */
111#define X86_FEATURE_CMP_LEGACY (6*32+ 1) /* If yes HyperThreading not valid */
112
113/*
114 * Auxiliary flags: Linux defined - For features scattered in various
115 * CPUID levels like 0x6, 0xA etc
116 */
117#define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */
118
119#define cpu_has(c, bit) \
120 (__builtin_constant_p(bit) && \
121 ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) || \
122 (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) || \
123 (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) || \
124 (((bit)>>5)==3 && (1UL<<((bit)&31) & REQUIRED_MASK3)) || \
125 (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4)) || \
126 (((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5)) || \
127 (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \
128 (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) ) \
129 ? 1 : \
130 test_bit(bit, (c)->x86_capability))
131#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit)
132
133#define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU)
134#define cpu_has_vme boot_cpu_has(X86_FEATURE_VME)
135#define cpu_has_de boot_cpu_has(X86_FEATURE_DE)
136#define cpu_has_pse boot_cpu_has(X86_FEATURE_PSE)
137#define cpu_has_tsc boot_cpu_has(X86_FEATURE_TSC)
138#define cpu_has_pae boot_cpu_has(X86_FEATURE_PAE)
139#define cpu_has_pge boot_cpu_has(X86_FEATURE_PGE)
140#define cpu_has_apic boot_cpu_has(X86_FEATURE_APIC)
141#define cpu_has_sep boot_cpu_has(X86_FEATURE_SEP)
142#define cpu_has_mtrr boot_cpu_has(X86_FEATURE_MTRR)
143#define cpu_has_mmx boot_cpu_has(X86_FEATURE_MMX)
144#define cpu_has_fxsr boot_cpu_has(X86_FEATURE_FXSR)
145#define cpu_has_xmm boot_cpu_has(X86_FEATURE_XMM)
146#define cpu_has_xmm2 boot_cpu_has(X86_FEATURE_XMM2)
147#define cpu_has_xmm3 boot_cpu_has(X86_FEATURE_XMM3)
148#define cpu_has_ht boot_cpu_has(X86_FEATURE_HT)
149#define cpu_has_mp boot_cpu_has(X86_FEATURE_MP)
150#define cpu_has_nx boot_cpu_has(X86_FEATURE_NX)
151#define cpu_has_k6_mtrr boot_cpu_has(X86_FEATURE_K6_MTRR)
152#define cpu_has_cyrix_arr boot_cpu_has(X86_FEATURE_CYRIX_ARR)
153#define cpu_has_centaur_mcr boot_cpu_has(X86_FEATURE_CENTAUR_MCR)
154#define cpu_has_xstore boot_cpu_has(X86_FEATURE_XSTORE)
155#define cpu_has_xstore_enabled boot_cpu_has(X86_FEATURE_XSTORE_EN)
156#define cpu_has_xcrypt boot_cpu_has(X86_FEATURE_XCRYPT)
157#define cpu_has_xcrypt_enabled boot_cpu_has(X86_FEATURE_XCRYPT_EN)
158#define cpu_has_ace2 boot_cpu_has(X86_FEATURE_ACE2)
159#define cpu_has_ace2_enabled boot_cpu_has(X86_FEATURE_ACE2_EN)
160#define cpu_has_phe boot_cpu_has(X86_FEATURE_PHE)
161#define cpu_has_phe_enabled boot_cpu_has(X86_FEATURE_PHE_EN)
162#define cpu_has_pmm boot_cpu_has(X86_FEATURE_PMM)
163#define cpu_has_pmm_enabled boot_cpu_has(X86_FEATURE_PMM_EN)
164#define cpu_has_ds boot_cpu_has(X86_FEATURE_DS)
165#define cpu_has_pebs boot_cpu_has(X86_FEATURE_PEBS)
166#define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLSH)
167#define cpu_has_bts boot_cpu_has(X86_FEATURE_BTS)
168
169#endif /* __ASM_I386_CPUFEATURE_H */
170
171/*
172 * Local Variables:
173 * mode:c
174 * comment-column:42
175 * End:
176 */
diff --git a/include/asm-x86/cpufeature_64.h b/include/asm-x86/cpufeature_64.h
deleted file mode 100644
index e18496b7b850..000000000000
--- a/include/asm-x86/cpufeature_64.h
+++ /dev/null
@@ -1,30 +0,0 @@
1/*
2 * cpufeature_32.h
3 *
4 * Defines x86 CPU feature bits
5 */
6
7#ifndef __ASM_X8664_CPUFEATURE_H
8#define __ASM_X8664_CPUFEATURE_H
9
10#include "cpufeature_32.h"
11
12#undef cpu_has_vme
13#define cpu_has_vme 0
14
15#undef cpu_has_pae
16#define cpu_has_pae ___BUG___
17
18#undef cpu_has_mp
19#define cpu_has_mp 1 /* XXX */
20
21#undef cpu_has_k6_mtrr
22#define cpu_has_k6_mtrr 0
23
24#undef cpu_has_cyrix_arr
25#define cpu_has_cyrix_arr 0
26
27#undef cpu_has_centaur_mcr
28#define cpu_has_centaur_mcr 0
29
30#endif /* __ASM_X8664_CPUFEATURE_H */
diff --git a/include/asm-x86/desc.h b/include/asm-x86/desc.h
index 6065c5092265..5b6a05d3a771 100644
--- a/include/asm-x86/desc.h
+++ b/include/asm-x86/desc.h
@@ -1,5 +1,381 @@
1#ifndef _ASM_DESC_H_
2#define _ASM_DESC_H_
3
4#ifndef __ASSEMBLY__
5#include <asm/desc_defs.h>
6#include <asm/ldt.h>
7#include <asm/mmu.h>
8#include <linux/smp.h>
9
10static inline void fill_ldt(struct desc_struct *desc,
11 const struct user_desc *info)
12{
13 desc->limit0 = info->limit & 0x0ffff;
14 desc->base0 = info->base_addr & 0x0000ffff;
15
16 desc->base1 = (info->base_addr & 0x00ff0000) >> 16;
17 desc->type = (info->read_exec_only ^ 1) << 1;
18 desc->type |= info->contents << 2;
19 desc->s = 1;
20 desc->dpl = 0x3;
21 desc->p = info->seg_not_present ^ 1;
22 desc->limit = (info->limit & 0xf0000) >> 16;
23 desc->avl = info->useable;
24 desc->d = info->seg_32bit;
25 desc->g = info->limit_in_pages;
26 desc->base2 = (info->base_addr & 0xff000000) >> 24;
27}
28
29extern struct desc_ptr idt_descr;
30extern gate_desc idt_table[];
31
32#ifdef CONFIG_X86_64
33extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
34extern struct desc_ptr cpu_gdt_descr[];
35/* the cpu gdt accessor */
36#define get_cpu_gdt_table(x) ((struct desc_struct *)cpu_gdt_descr[x].address)
37
38static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
39 unsigned dpl, unsigned ist, unsigned seg)
40{
41 gate->offset_low = PTR_LOW(func);
42 gate->segment = __KERNEL_CS;
43 gate->ist = ist;
44 gate->p = 1;
45 gate->dpl = dpl;
46 gate->zero0 = 0;
47 gate->zero1 = 0;
48 gate->type = type;
49 gate->offset_middle = PTR_MIDDLE(func);
50 gate->offset_high = PTR_HIGH(func);
51}
52
53#else
54struct gdt_page {
55 struct desc_struct gdt[GDT_ENTRIES];
56} __attribute__((aligned(PAGE_SIZE)));
57DECLARE_PER_CPU(struct gdt_page, gdt_page);
58
59static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
60{
61 return per_cpu(gdt_page, cpu).gdt;
62}
63
64static inline void pack_gate(gate_desc *gate, unsigned char type,
65 unsigned long base, unsigned dpl, unsigned flags, unsigned short seg)
66
67{
68 gate->a = (seg << 16) | (base & 0xffff);
69 gate->b = (base & 0xffff0000) |
70 (((0x80 | type | (dpl << 5)) & 0xff) << 8);
71}
72
73#endif
74
75static inline int desc_empty(const void *ptr)
76{
77 const u32 *desc = ptr;
78 return !(desc[0] | desc[1]);
79}
80
81#ifdef CONFIG_PARAVIRT
82#include <asm/paravirt.h>
83#else
84#define load_TR_desc() native_load_tr_desc()
85#define load_gdt(dtr) native_load_gdt(dtr)
86#define load_idt(dtr) native_load_idt(dtr)
87#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
88#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
89
90#define store_gdt(dtr) native_store_gdt(dtr)
91#define store_idt(dtr) native_store_idt(dtr)
92#define store_tr(tr) (tr = native_store_tr())
93#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
94
95#define load_TLS(t, cpu) native_load_tls(t, cpu)
96#define set_ldt native_set_ldt
97
98#define write_ldt_entry(dt, entry, desc) \
99 native_write_ldt_entry(dt, entry, desc)
100#define write_gdt_entry(dt, entry, desc, type) \
101 native_write_gdt_entry(dt, entry, desc, type)
102#define write_idt_entry(dt, entry, g) native_write_idt_entry(dt, entry, g)
103#endif
104
105static inline void native_write_idt_entry(gate_desc *idt, int entry,
106 const gate_desc *gate)
107{
108 memcpy(&idt[entry], gate, sizeof(*gate));
109}
110
111static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry,
112 const void *desc)
113{
114 memcpy(&ldt[entry], desc, 8);
115}
116
117static inline void native_write_gdt_entry(struct desc_struct *gdt, int entry,
118 const void *desc, int type)
119{
120 unsigned int size;
121 switch (type) {
122 case DESC_TSS:
123 size = sizeof(tss_desc);
124 break;
125 case DESC_LDT:
126 size = sizeof(ldt_desc);
127 break;
128 default:
129 size = sizeof(struct desc_struct);
130 break;
131 }
132 memcpy(&gdt[entry], desc, size);
133}
134
135static inline void pack_descriptor(struct desc_struct *desc, unsigned long base,
136 unsigned long limit, unsigned char type,
137 unsigned char flags)
138{
139 desc->a = ((base & 0xffff) << 16) | (limit & 0xffff);
140 desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
141 (limit & 0x000f0000) | ((type & 0xff) << 8) |
142 ((flags & 0xf) << 20);
143 desc->p = 1;
144}
145
146
147static inline void set_tssldt_descriptor(void *d, unsigned long addr,
148 unsigned type, unsigned size)
149{
150#ifdef CONFIG_X86_64
151 struct ldttss_desc64 *desc = d;
152 memset(desc, 0, sizeof(*desc));
153 desc->limit0 = size & 0xFFFF;
154 desc->base0 = PTR_LOW(addr);
155 desc->base1 = PTR_MIDDLE(addr) & 0xFF;
156 desc->type = type;
157 desc->p = 1;
158 desc->limit1 = (size >> 16) & 0xF;
159 desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
160 desc->base3 = PTR_HIGH(addr);
161#else
162
163 pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
164#endif
165}
166
167static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
168{
169 struct desc_struct *d = get_cpu_gdt_table(cpu);
170 tss_desc tss;
171
172 /*
173 * sizeof(unsigned long) coming from an extra "long" at the end
174 * of the iobitmap. See tss_struct definition in processor.h
175 *
176 * -1? seg base+limit should be pointing to the address of the
177 * last valid byte
178 */
179 set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
180 IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
181 write_gdt_entry(d, entry, &tss, DESC_TSS);
182}
183
184#define set_tss_desc(cpu, addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
185
186static inline void native_set_ldt(const void *addr, unsigned int entries)
187{
188 if (likely(entries == 0))
189 __asm__ __volatile__("lldt %w0"::"q" (0));
190 else {
191 unsigned cpu = smp_processor_id();
192 ldt_desc ldt;
193
194 set_tssldt_descriptor(&ldt, (unsigned long)addr,
195 DESC_LDT, entries * sizeof(ldt) - 1);
196 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT,
197 &ldt, DESC_LDT);
198 __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
199 }
200}
201
202static inline void native_load_tr_desc(void)
203{
204 asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
205}
206
207static inline void native_load_gdt(const struct desc_ptr *dtr)
208{
209 asm volatile("lgdt %0"::"m" (*dtr));
210}
211
212static inline void native_load_idt(const struct desc_ptr *dtr)
213{
214 asm volatile("lidt %0"::"m" (*dtr));
215}
216
217static inline void native_store_gdt(struct desc_ptr *dtr)
218{
219 asm volatile("sgdt %0":"=m" (*dtr));
220}
221
222static inline void native_store_idt(struct desc_ptr *dtr)
223{
224 asm volatile("sidt %0":"=m" (*dtr));
225}
226
227static inline unsigned long native_store_tr(void)
228{
229 unsigned long tr;
230 asm volatile("str %0":"=r" (tr));
231 return tr;
232}
233
234static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
235{
236 unsigned int i;
237 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
238
239 for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
240 gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
241}
242
243#define _LDT_empty(info) (\
244 (info)->base_addr == 0 && \
245 (info)->limit == 0 && \
246 (info)->contents == 0 && \
247 (info)->read_exec_only == 1 && \
248 (info)->seg_32bit == 0 && \
249 (info)->limit_in_pages == 0 && \
250 (info)->seg_not_present == 1 && \
251 (info)->useable == 0)
252
253#ifdef CONFIG_X86_64
254#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0))
255#else
256#define LDT_empty(info) (_LDT_empty(info))
257#endif
258
259static inline void clear_LDT(void)
260{
261 set_ldt(NULL, 0);
262}
263
264/*
265 * load one particular LDT into the current CPU
266 */
267static inline void load_LDT_nolock(mm_context_t *pc)
268{
269 set_ldt(pc->ldt, pc->size);
270}
271
272static inline void load_LDT(mm_context_t *pc)
273{
274 preempt_disable();
275 load_LDT_nolock(pc);
276 preempt_enable();
277}
278
279static inline unsigned long get_desc_base(const struct desc_struct *desc)
280{
281 return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24);
282}
283
284static inline unsigned long get_desc_limit(const struct desc_struct *desc)
285{
286 return desc->limit0 | (desc->limit << 16);
287}
288
289static inline void _set_gate(int gate, unsigned type, void *addr,
290 unsigned dpl, unsigned ist, unsigned seg)
291{
292 gate_desc s;
293 pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
294 /*
295 * does not need to be atomic because it is only done once at
296 * setup time
297 */
298 write_idt_entry(idt_table, gate, &s);
299}
300
301/*
302 * This needs to use 'idt_table' rather than 'idt', and
303 * thus use the _nonmapped_ version of the IDT, as the
304 * Pentium F0 0F bugfix can have resulted in the mapped
305 * IDT being write-protected.
306 */
307static inline void set_intr_gate(unsigned int n, void *addr)
308{
309 BUG_ON((unsigned)n > 0xFF);
310 _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
311}
312
313/*
314 * This routine sets up an interrupt gate at descriptor privilege level 3.

315 */
316static inline void set_system_intr_gate(unsigned int n, void *addr)
317{
318 BUG_ON((unsigned)n > 0xFF);
319 _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
320}
321
322static inline void set_trap_gate(unsigned int n, void *addr)
323{
324 BUG_ON((unsigned)n > 0xFF);
325 _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS);
326}
327
328static inline void set_system_gate(unsigned int n, void *addr)
329{
330 BUG_ON((unsigned)n > 0xFF);
1#ifdef CONFIG_X86_32 331#ifdef CONFIG_X86_32
2# include "desc_32.h" 332 _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS);
333#else
334 _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
335#endif
336}
337
338static inline void set_task_gate(unsigned int n, unsigned int gdt_entry)
339{
340 BUG_ON((unsigned)n > 0xFF);
341 _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3));
342}
343
344static inline void set_intr_gate_ist(int n, void *addr, unsigned ist)
345{
346 BUG_ON((unsigned)n > 0xFF);
347 _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS);
348}
349
350static inline void set_system_gate_ist(int n, void *addr, unsigned ist)
351{
352 BUG_ON((unsigned)n > 0xFF);
353 _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS);
354}
355
3#else 356#else
4# include "desc_64.h" 357/*
358 * GET_DESC_BASE reads the descriptor base of the specified segment.
359 *
360 * Args:
361 * idx - descriptor index
362 * gdt - GDT pointer
363 * base - 32bit register to which the base will be written
364 * lo_w - lo word of the "base" register
365 * lo_b - lo byte of the "base" register
366 * hi_b - hi byte of the low word of the "base" register
367 *
368 * Example:
369 * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
370 * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
371 */
372#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
373 movb idx*8+4(gdt), lo_b; \
374 movb idx*8+7(gdt), hi_b; \
375 shll $16, base; \
376 movw idx*8+2(gdt), lo_w;
377
378
379#endif /* __ASSEMBLY__ */
380
5#endif 381#endif
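As an aside on the unified descriptor helpers: pack_descriptor() fills the legacy a/b words directly, while get_desc_base() and get_desc_limit() read the same storage back through the bitfields, so the two views of desc_struct have to stay layout-compatible. The standalone sketch below checks that round trip; it is not kernel code, the sketch_ names and the sample base/limit values are invented, and it assumes GCC's little-endian bitfield layout as the kernel does on x86.

/*
 * Hedged sketch: mirror the unified desc_struct union and verify that
 * pack_descriptor()-style packing of the a/b words and the bitfield
 * accessors agree on base and limit.
 */
#include <stdio.h>
#include <assert.h>

struct sketch_desc {
	union {
		struct { unsigned int a, b; };
		struct {
			unsigned short limit0;
			unsigned short base0;
			unsigned base1: 8, type: 4, s: 1, dpl: 2, p: 1;
			unsigned limit: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8;
		};
	};
} __attribute__((packed));

static void sketch_pack(struct sketch_desc *desc, unsigned long base,
			unsigned long limit, unsigned char type,
			unsigned char flags)
{
	desc->a = ((base & 0xffff) << 16) | (limit & 0xffff);
	desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
		  (limit & 0x000f0000) | ((type & 0xff) << 8) |
		  ((flags & 0xf) << 20);
	desc->p = 1;
}

int main(void)
{
	struct sketch_desc d;

	/* A flat 32-bit code-segment-like descriptor at a made-up base. */
	sketch_pack(&d, 0x12345678, 0xfffff, 0x9a, 0xc);

	unsigned long rebuilt_base  = d.base0 | (d.base1 << 16) | (d.base2 << 24);
	unsigned long rebuilt_limit = d.limit0 | (d.limit << 16);

	assert(rebuilt_base == 0x12345678);
	assert(rebuilt_limit == 0xfffff);
	printf("base=0x%lx limit=0x%lx p=%u dpl=%u\n",
	       rebuilt_base, rebuilt_limit, d.p, d.dpl);
	return 0;
}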
diff --git a/include/asm-x86/desc_32.h b/include/asm-x86/desc_32.h
deleted file mode 100644
index c547403f341d..000000000000
--- a/include/asm-x86/desc_32.h
+++ /dev/null
@@ -1,244 +0,0 @@
1#ifndef __ARCH_DESC_H
2#define __ARCH_DESC_H
3
4#include <asm/ldt.h>
5#include <asm/segment.h>
6
7#ifndef __ASSEMBLY__
8
9#include <linux/preempt.h>
10#include <linux/smp.h>
11#include <linux/percpu.h>
12
13#include <asm/mmu.h>
14
15struct Xgt_desc_struct {
16 unsigned short size;
17 unsigned long address __attribute__((packed));
18 unsigned short pad;
19} __attribute__ ((packed));
20
21struct gdt_page
22{
23 struct desc_struct gdt[GDT_ENTRIES];
24} __attribute__((aligned(PAGE_SIZE)));
25DECLARE_PER_CPU(struct gdt_page, gdt_page);
26
27static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
28{
29 return per_cpu(gdt_page, cpu).gdt;
30}
31
32extern struct Xgt_desc_struct idt_descr;
33extern struct desc_struct idt_table[];
34extern void set_intr_gate(unsigned int irq, void * addr);
35
36static inline void pack_descriptor(__u32 *a, __u32 *b,
37 unsigned long base, unsigned long limit, unsigned char type, unsigned char flags)
38{
39 *a = ((base & 0xffff) << 16) | (limit & 0xffff);
40 *b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
41 (limit & 0x000f0000) | ((type & 0xff) << 8) | ((flags & 0xf) << 20);
42}
43
44static inline void pack_gate(__u32 *a, __u32 *b,
45 unsigned long base, unsigned short seg, unsigned char type, unsigned char flags)
46{
47 *a = (seg << 16) | (base & 0xffff);
48 *b = (base & 0xffff0000) | ((type & 0xff) << 8) | (flags & 0xff);
49}
50
51#define DESCTYPE_LDT 0x82 /* present, system, DPL-0, LDT */
52#define DESCTYPE_TSS 0x89 /* present, system, DPL-0, 32-bit TSS */
53#define DESCTYPE_TASK 0x85 /* present, system, DPL-0, task gate */
54#define DESCTYPE_INT 0x8e /* present, system, DPL-0, interrupt gate */
55#define DESCTYPE_TRAP 0x8f /* present, system, DPL-0, trap gate */
56#define DESCTYPE_DPL3 0x60 /* DPL-3 */
57#define DESCTYPE_S 0x10 /* !system */
58
59#ifdef CONFIG_PARAVIRT
60#include <asm/paravirt.h>
61#else
62#define load_TR_desc() native_load_tr_desc()
63#define load_gdt(dtr) native_load_gdt(dtr)
64#define load_idt(dtr) native_load_idt(dtr)
65#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
66#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
67
68#define store_gdt(dtr) native_store_gdt(dtr)
69#define store_idt(dtr) native_store_idt(dtr)
70#define store_tr(tr) (tr = native_store_tr())
71#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
72
73#define load_TLS(t, cpu) native_load_tls(t, cpu)
74#define set_ldt native_set_ldt
75
76#define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
77#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
78#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
79#endif
80
81static inline void write_dt_entry(struct desc_struct *dt,
82 int entry, u32 entry_low, u32 entry_high)
83{
84 dt[entry].a = entry_low;
85 dt[entry].b = entry_high;
86}
87
88static inline void native_set_ldt(const void *addr, unsigned int entries)
89{
90 if (likely(entries == 0))
91 __asm__ __volatile__("lldt %w0"::"q" (0));
92 else {
93 unsigned cpu = smp_processor_id();
94 __u32 a, b;
95
96 pack_descriptor(&a, &b, (unsigned long)addr,
97 entries * sizeof(struct desc_struct) - 1,
98 DESCTYPE_LDT, 0);
99 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b);
100 __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
101 }
102}
103
104
105static inline void native_load_tr_desc(void)
106{
107 asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
108}
109
110static inline void native_load_gdt(const struct Xgt_desc_struct *dtr)
111{
112 asm volatile("lgdt %0"::"m" (*dtr));
113}
114
115static inline void native_load_idt(const struct Xgt_desc_struct *dtr)
116{
117 asm volatile("lidt %0"::"m" (*dtr));
118}
119
120static inline void native_store_gdt(struct Xgt_desc_struct *dtr)
121{
122 asm ("sgdt %0":"=m" (*dtr));
123}
124
125static inline void native_store_idt(struct Xgt_desc_struct *dtr)
126{
127 asm ("sidt %0":"=m" (*dtr));
128}
129
130static inline unsigned long native_store_tr(void)
131{
132 unsigned long tr;
133 asm ("str %0":"=r" (tr));
134 return tr;
135}
136
137static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
138{
139 unsigned int i;
140 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
141
142 for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
143 gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
144}
145
146static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg)
147{
148 __u32 a, b;
149 pack_gate(&a, &b, (unsigned long)addr, seg, type, 0);
150 write_idt_entry(idt_table, gate, a, b);
151}
152
153static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const void *addr)
154{
155 __u32 a, b;
156 pack_descriptor(&a, &b, (unsigned long)addr,
157 offsetof(struct tss_struct, __cacheline_filler) - 1,
158 DESCTYPE_TSS, 0);
159 write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b);
160}
161
162
163#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
164
165#define LDT_entry_a(info) \
166 ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
167
168#define LDT_entry_b(info) \
169 (((info)->base_addr & 0xff000000) | \
170 (((info)->base_addr & 0x00ff0000) >> 16) | \
171 ((info)->limit & 0xf0000) | \
172 (((info)->read_exec_only ^ 1) << 9) | \
173 ((info)->contents << 10) | \
174 (((info)->seg_not_present ^ 1) << 15) | \
175 ((info)->seg_32bit << 22) | \
176 ((info)->limit_in_pages << 23) | \
177 ((info)->useable << 20) | \
178 0x7000)
179
180#define LDT_empty(info) (\
181 (info)->base_addr == 0 && \
182 (info)->limit == 0 && \
183 (info)->contents == 0 && \
184 (info)->read_exec_only == 1 && \
185 (info)->seg_32bit == 0 && \
186 (info)->limit_in_pages == 0 && \
187 (info)->seg_not_present == 1 && \
188 (info)->useable == 0 )
189
190static inline void clear_LDT(void)
191{
192 set_ldt(NULL, 0);
193}
194
195/*
196 * load one particular LDT into the current CPU
197 */
198static inline void load_LDT_nolock(mm_context_t *pc)
199{
200 set_ldt(pc->ldt, pc->size);
201}
202
203static inline void load_LDT(mm_context_t *pc)
204{
205 preempt_disable();
206 load_LDT_nolock(pc);
207 preempt_enable();
208}
209
210static inline unsigned long get_desc_base(unsigned long *desc)
211{
212 unsigned long base;
213 base = ((desc[0] >> 16) & 0x0000ffff) |
214 ((desc[1] << 16) & 0x00ff0000) |
215 (desc[1] & 0xff000000);
216 return base;
217}
218
219#else /* __ASSEMBLY__ */
220
221/*
222 * GET_DESC_BASE reads the descriptor base of the specified segment.
223 *
224 * Args:
225 * idx - descriptor index
226 * gdt - GDT pointer
227 * base - 32bit register to which the base will be written
228 * lo_w - lo word of the "base" register
229 * lo_b - lo byte of the "base" register
230 * hi_b - hi byte of the low word of the "base" register
231 *
232 * Example:
233 * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
234 * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
235 */
236#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
237 movb idx*8+4(gdt), lo_b; \
238 movb idx*8+7(gdt), hi_b; \
239 shll $16, base; \
240 movw idx*8+2(gdt), lo_w;
241
242#endif /* !__ASSEMBLY__ */
243
244#endif
diff --git a/include/asm-x86/desc_64.h b/include/asm-x86/desc_64.h
index 7d9c938e69fd..8b137891791f 100644
--- a/include/asm-x86/desc_64.h
+++ b/include/asm-x86/desc_64.h
@@ -1,204 +1 @@
1/* Written 2000 by Andi Kleen */
2#ifndef __ARCH_DESC_H
3#define __ARCH_DESC_H
4
5#include <linux/threads.h>
6#include <asm/ldt.h>
7
8#ifndef __ASSEMBLY__
9
10#include <linux/string.h>
11#include <linux/smp.h>
12#include <asm/desc_defs.h>
13
14#include <asm/segment.h>
15#include <asm/mmu.h>
16
17extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
18
19#define load_TR_desc() asm volatile("ltr %w0"::"r" (GDT_ENTRY_TSS*8))
20#define load_LDT_desc() asm volatile("lldt %w0"::"r" (GDT_ENTRY_LDT*8))
21#define clear_LDT() asm volatile("lldt %w0"::"r" (0))
22
23static inline unsigned long __store_tr(void)
24{
25 unsigned long tr;
26
27 asm volatile ("str %w0":"=r" (tr));
28 return tr;
29}
30
31#define store_tr(tr) (tr) = __store_tr()
32
33/*
34 * This is the ldt that every process will get unless we need
35 * something other than this.
36 */
37extern struct desc_struct default_ldt[];
38extern struct gate_struct idt_table[];
39extern struct desc_ptr cpu_gdt_descr[];
40
41/* the cpu gdt accessor */
42#define cpu_gdt(_cpu) ((struct desc_struct *)cpu_gdt_descr[_cpu].address)
43
44static inline void load_gdt(const struct desc_ptr *ptr)
45{
46 asm volatile("lgdt %w0"::"m" (*ptr));
47}
48
49static inline void store_gdt(struct desc_ptr *ptr)
50{
51 asm("sgdt %w0":"=m" (*ptr));
52}
53
54static inline void _set_gate(void *adr, unsigned type, unsigned long func, unsigned dpl, unsigned ist)
55{
56 struct gate_struct s;
57 s.offset_low = PTR_LOW(func);
58 s.segment = __KERNEL_CS;
59 s.ist = ist;
60 s.p = 1;
61 s.dpl = dpl;
62 s.zero0 = 0;
63 s.zero1 = 0;
64 s.type = type;
65 s.offset_middle = PTR_MIDDLE(func);
66 s.offset_high = PTR_HIGH(func);
67 /* does not need to be atomic because it is only done once at setup time */
68 memcpy(adr, &s, 16);
69}
70
71static inline void set_intr_gate(int nr, void *func)
72{
73 BUG_ON((unsigned)nr > 0xFF);
74 _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, 0);
75}
76
77static inline void set_intr_gate_ist(int nr, void *func, unsigned ist)
78{
79 BUG_ON((unsigned)nr > 0xFF);
80 _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, ist);
81}
82
83static inline void set_system_gate(int nr, void *func)
84{
85 BUG_ON((unsigned)nr > 0xFF);
86 _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0);
87}
88
89static inline void set_system_gate_ist(int nr, void *func, unsigned ist)
90{
91 _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, ist);
92}
93
94static inline void load_idt(const struct desc_ptr *ptr)
95{
96 asm volatile("lidt %w0"::"m" (*ptr));
97}
98
99static inline void store_idt(struct desc_ptr *dtr)
100{
101 asm("sidt %w0":"=m" (*dtr));
102}
103
104static inline void set_tssldt_descriptor(void *ptr, unsigned long tss, unsigned type,
105 unsigned size)
106{
107 struct ldttss_desc d;
108 memset(&d,0,sizeof(d));
109 d.limit0 = size & 0xFFFF;
110 d.base0 = PTR_LOW(tss);
111 d.base1 = PTR_MIDDLE(tss) & 0xFF;
112 d.type = type;
113 d.p = 1;
114 d.limit1 = (size >> 16) & 0xF;
115 d.base2 = (PTR_MIDDLE(tss) >> 8) & 0xFF;
116 d.base3 = PTR_HIGH(tss);
117 memcpy(ptr, &d, 16);
118}
119
120static inline void set_tss_desc(unsigned cpu, void *addr)
121{
122 /*
123 * sizeof(unsigned long) coming from an extra "long" at the end
124 * of the iobitmap. See tss_struct definition in processor.h
125 *
126 * -1? seg base+limit should be pointing to the address of the
127 * last valid byte
128 */
129 set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_TSS],
130 (unsigned long)addr, DESC_TSS,
131 IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
132}
133
134static inline void set_ldt_desc(unsigned cpu, void *addr, int size)
135{
136 set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_LDT], (unsigned long)addr,
137 DESC_LDT, size * 8 - 1);
138}
139
140#define LDT_entry_a(info) \
141 ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
142/* Don't allow setting of the lm bit. It is useless anyways because
143 64bit system calls require __USER_CS. */
144#define LDT_entry_b(info) \
145 (((info)->base_addr & 0xff000000) | \
146 (((info)->base_addr & 0x00ff0000) >> 16) | \
147 ((info)->limit & 0xf0000) | \
148 (((info)->read_exec_only ^ 1) << 9) | \
149 ((info)->contents << 10) | \
150 (((info)->seg_not_present ^ 1) << 15) | \
151 ((info)->seg_32bit << 22) | \
152 ((info)->limit_in_pages << 23) | \
153 ((info)->useable << 20) | \
154 /* ((info)->lm << 21) | */ \
155 0x7000)
156
157#define LDT_empty(info) (\
158 (info)->base_addr == 0 && \
159 (info)->limit == 0 && \
160 (info)->contents == 0 && \
161 (info)->read_exec_only == 1 && \
162 (info)->seg_32bit == 0 && \
163 (info)->limit_in_pages == 0 && \
164 (info)->seg_not_present == 1 && \
165 (info)->useable == 0 && \
166 (info)->lm == 0)
167
168static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
169{
170 unsigned int i;
171 u64 *gdt = (u64 *)(cpu_gdt(cpu) + GDT_ENTRY_TLS_MIN);
172
173 for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
174 gdt[i] = t->tls_array[i];
175}
176
177/*
178 * load one particular LDT into the current CPU
179 */
180static inline void load_LDT_nolock (mm_context_t *pc, int cpu)
181{
182 int count = pc->size;
183
184 if (likely(!count)) {
185 clear_LDT();
186 return;
187 }
188
189 set_ldt_desc(cpu, pc->ldt, count);
190 load_LDT_desc();
191}
192
193static inline void load_LDT(mm_context_t *pc)
194{
195 int cpu = get_cpu();
196 load_LDT_nolock(pc, cpu);
197 put_cpu();
198}
199
200extern struct desc_ptr idt_descr;
201
202#endif /* !__ASSEMBLY__ */
203
204#endif
diff --git a/include/asm-x86/desc_defs.h b/include/asm-x86/desc_defs.h
index 089004070099..e33f078b3e54 100644
--- a/include/asm-x86/desc_defs.h
+++ b/include/asm-x86/desc_defs.h
@@ -11,26 +11,36 @@
11 11
12#include <linux/types.h> 12#include <linux/types.h>
13 13
14/*
15 * FIXME: Accessing the desc_struct through its fields is more elegant,
16 * and should be the one valid thing to do. However, a lot of open code
17 * still touches the a and b accessors, and doing this allows us to do it
18 * incrementally. We keep the signature as a struct, rather than a union,
19 * so we can get rid of it transparently in the future -- glommer
20 */
14// 8 byte segment descriptor 21// 8 byte segment descriptor
15struct desc_struct { 22struct desc_struct {
16 u16 limit0; 23 union {
17 u16 base0; 24 struct { unsigned int a, b; };
18 unsigned base1 : 8, type : 4, s : 1, dpl : 2, p : 1; 25 struct {
19 unsigned limit : 4, avl : 1, l : 1, d : 1, g : 1, base2 : 8; 26 u16 limit0;
20} __attribute__((packed)); 27 u16 base0;
28 unsigned base1: 8, type: 4, s: 1, dpl: 2, p: 1;
29 unsigned limit: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8;
30 };
21 31
22struct n_desc_struct { 32 };
23 unsigned int a,b; 33} __attribute__((packed));
24};
25 34
26enum { 35enum {
27 GATE_INTERRUPT = 0xE, 36 GATE_INTERRUPT = 0xE,
28 GATE_TRAP = 0xF, 37 GATE_TRAP = 0xF,
29 GATE_CALL = 0xC, 38 GATE_CALL = 0xC,
39 GATE_TASK = 0x5,
30}; 40};
31 41
32// 16byte gate 42// 16byte gate
33struct gate_struct { 43struct gate_struct64 {
34 u16 offset_low; 44 u16 offset_low;
35 u16 segment; 45 u16 segment;
36 unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1; 46 unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1;
@@ -39,17 +49,18 @@ struct gate_struct {
39 u32 zero1; 49 u32 zero1;
40} __attribute__((packed)); 50} __attribute__((packed));
41 51
42#define PTR_LOW(x) ((unsigned long)(x) & 0xFFFF) 52#define PTR_LOW(x) ((unsigned long long)(x) & 0xFFFF)
43#define PTR_MIDDLE(x) (((unsigned long)(x) >> 16) & 0xFFFF) 53#define PTR_MIDDLE(x) (((unsigned long long)(x) >> 16) & 0xFFFF)
44#define PTR_HIGH(x) ((unsigned long)(x) >> 32) 54#define PTR_HIGH(x) ((unsigned long long)(x) >> 32)
45 55
46enum { 56enum {
47 DESC_TSS = 0x9, 57 DESC_TSS = 0x9,
48 DESC_LDT = 0x2, 58 DESC_LDT = 0x2,
59 DESCTYPE_S = 0x10, /* !system */
49}; 60};
50 61
51// LDT or TSS descriptor in the GDT. 16 bytes. 62// LDT or TSS descriptor in the GDT. 16 bytes.
52struct ldttss_desc { 63struct ldttss_desc64 {
53 u16 limit0; 64 u16 limit0;
54 u16 base0; 65 u16 base0;
55 unsigned base1 : 8, type : 5, dpl : 2, p : 1; 66 unsigned base1 : 8, type : 5, dpl : 2, p : 1;
@@ -58,6 +69,16 @@ struct ldttss_desc {
58 u32 zero1; 69 u32 zero1;
59} __attribute__((packed)); 70} __attribute__((packed));
60 71
72#ifdef CONFIG_X86_64
73typedef struct gate_struct64 gate_desc;
74typedef struct ldttss_desc64 ldt_desc;
75typedef struct ldttss_desc64 tss_desc;
76#else
77typedef struct desc_struct gate_desc;
78typedef struct desc_struct ldt_desc;
79typedef struct desc_struct tss_desc;
80#endif
81
61struct desc_ptr { 82struct desc_ptr {
62 unsigned short size; 83 unsigned short size;
63 unsigned long address; 84 unsigned long address;
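One small point worth illustrating from this hunk: PTR_LOW/PTR_MIDDLE/PTR_HIGH now cast through unsigned long long rather than unsigned long, presumably so the split gives the same result whether unsigned long is 32 or 64 bits wide. The userspace sketch below is not kernel code and the handler address is made up; it just shows the three macros carving a 64-bit address into the gate's offset_low/offset_middle/offset_high fields.

/*
 * Hedged sketch of the pointer-splitting macros with the widened cast.
 */
#include <stdio.h>
#include <stdint.h>

#define PTR_LOW(x)    ((unsigned long long)(x) & 0xFFFF)
#define PTR_MIDDLE(x) (((unsigned long long)(x) >> 16) & 0xFFFF)
#define PTR_HIGH(x)   ((unsigned long long)(x) >> 32)

int main(void)
{
	uint64_t handler = 0xffffffff81234567ULL;   /* made-up kernel text address */

	printf("offset_low    = 0x%04llx\n", PTR_LOW(handler));     /* 0x4567     */
	printf("offset_middle = 0x%04llx\n", PTR_MIDDLE(handler));  /* 0x8123     */
	printf("offset_high   = 0x%08llx\n", PTR_HIGH(handler));    /* 0xffffffff */
	return 0;
}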
diff --git a/include/asm-x86/dma.h b/include/asm-x86/dma.h
index 9f936c61a4e5..e9733ce89880 100644
--- a/include/asm-x86/dma.h
+++ b/include/asm-x86/dma.h
@@ -1,5 +1,319 @@
1/*
2 * linux/include/asm/dma.h: Defines for using and allocating dma channels.
3 * Written by Hennus Bergman, 1992.
4 * High DMA channel support & info by Hannu Savolainen
5 * and John Boyd, Nov. 1992.
6 */
7
8#ifndef _ASM_X86_DMA_H
9#define _ASM_X86_DMA_H
10
11#include <linux/spinlock.h> /* And spinlocks */
12#include <asm/io.h> /* need byte IO */
13#include <linux/delay.h>
14
15
16#ifdef HAVE_REALLY_SLOW_DMA_CONTROLLER
17#define dma_outb outb_p
18#else
19#define dma_outb outb
20#endif
21
22#define dma_inb inb
23
24/*
25 * NOTES about DMA transfers:
26 *
27 * controller 1: channels 0-3, byte operations, ports 00-1F
28 * controller 2: channels 4-7, word operations, ports C0-DF
29 *
30 * - ALL registers are 8 bits only, regardless of transfer size
31 * - channel 4 is not used - cascades 1 into 2.
32 * - channels 0-3 are byte - addresses/counts are for physical bytes
33 * - channels 5-7 are word - addresses/counts are for physical words
34 * - transfers must not cross physical 64K (0-3) or 128K (5-7) boundaries
35 * - transfer count loaded to registers is 1 less than actual count
36 * - controller 2 offsets are all even (2x offsets for controller 1)
37 * - page registers for 5-7 don't use data bit 0, represent 128K pages
38 * - page registers for 0-3 use bit 0, represent 64K pages
39 *
40 * DMA transfers are limited to the lower 16MB of _physical_ memory.
41 * Note that addresses loaded into registers must be _physical_ addresses,
42 * not logical addresses (which may differ if paging is active).
43 *
44 * Address mapping for channels 0-3:
45 *
46 * A23 ... A16 A15 ... A8 A7 ... A0 (Physical addresses)
47 * | ... | | ... | | ... |
48 * | ... | | ... | | ... |
49 * | ... | | ... | | ... |
50 * P7 ... P0 A7 ... A0 A7 ... A0
51 * | Page | Addr MSB | Addr LSB | (DMA registers)
52 *
53 * Address mapping for channels 5-7:
54 *
55 * A23 ... A17 A16 A15 ... A9 A8 A7 ... A1 A0 (Physical addresses)
56 * | ... | \ \ ... \ \ \ ... \ \
57 * | ... | \ \ ... \ \ \ ... \ (not used)
58 * | ... | \ \ ... \ \ \ ... \
59 * P7 ... P1 (0) A7 A6 ... A0 A7 A6 ... A0
60 * | Page | Addr MSB | Addr LSB | (DMA registers)
61 *
62 * Again, channels 5-7 transfer _physical_ words (16 bits), so addresses
63 * and counts _must_ be word-aligned (the lowest address bit is _ignored_ at
64 * the hardware level, so odd-byte transfers aren't possible).
65 *
66 * Transfer count (_not # bytes_) is limited to 64K, represented as actual
67 * count - 1 : 64K => 0xFFFF, 1 => 0x0000. Thus, count is always 1 or more,
68 * and up to 128K bytes may be transferred on channels 5-7 in one operation.
69 *
70 */
71
72#define MAX_DMA_CHANNELS 8
73
1#ifdef CONFIG_X86_32 74#ifdef CONFIG_X86_32
2# include "dma_32.h" 75
76/* The maximum address that we can perform a DMA transfer to on this platform */
77#define MAX_DMA_ADDRESS (PAGE_OFFSET+0x1000000)
78
79#else
80
81/* 16MB ISA DMA zone */
82#define MAX_DMA_PFN ((16*1024*1024) >> PAGE_SHIFT)
83
84/* 4GB broken PCI/AGP hardware bus master zone */
85#define MAX_DMA32_PFN ((4UL*1024*1024*1024) >> PAGE_SHIFT)
86
87/* Compat define for old dma zone */
88#define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT))
89
90#endif
91
92/* 8237 DMA controllers */
93#define IO_DMA1_BASE 0x00 /* 8 bit slave DMA, channels 0..3 */
94#define IO_DMA2_BASE 0xC0 /* 16 bit master DMA, ch 4(=slave input)..7 */
95
96/* DMA controller registers */
97#define DMA1_CMD_REG 0x08 /* command register (w) */
98#define DMA1_STAT_REG 0x08 /* status register (r) */
99#define DMA1_REQ_REG 0x09 /* request register (w) */
100#define DMA1_MASK_REG 0x0A /* single-channel mask (w) */
101#define DMA1_MODE_REG 0x0B /* mode register (w) */
102#define DMA1_CLEAR_FF_REG 0x0C /* clear pointer flip-flop (w) */
103#define DMA1_TEMP_REG 0x0D /* Temporary Register (r) */
104#define DMA1_RESET_REG 0x0D /* Master Clear (w) */
105#define DMA1_CLR_MASK_REG 0x0E /* Clear Mask */
106#define DMA1_MASK_ALL_REG 0x0F /* all-channels mask (w) */
107
108#define DMA2_CMD_REG 0xD0 /* command register (w) */
109#define DMA2_STAT_REG 0xD0 /* status register (r) */
110#define DMA2_REQ_REG 0xD2 /* request register (w) */
111#define DMA2_MASK_REG 0xD4 /* single-channel mask (w) */
112#define DMA2_MODE_REG 0xD6 /* mode register (w) */
113#define DMA2_CLEAR_FF_REG 0xD8 /* clear pointer flip-flop (w) */
114#define DMA2_TEMP_REG 0xDA /* Temporary Register (r) */
115#define DMA2_RESET_REG 0xDA /* Master Clear (w) */
116#define DMA2_CLR_MASK_REG 0xDC /* Clear Mask */
117#define DMA2_MASK_ALL_REG 0xDE /* all-channels mask (w) */
118
119#define DMA_ADDR_0 0x00 /* DMA address registers */
120#define DMA_ADDR_1 0x02
121#define DMA_ADDR_2 0x04
122#define DMA_ADDR_3 0x06
123#define DMA_ADDR_4 0xC0
124#define DMA_ADDR_5 0xC4
125#define DMA_ADDR_6 0xC8
126#define DMA_ADDR_7 0xCC
127
128#define DMA_CNT_0 0x01 /* DMA count registers */
129#define DMA_CNT_1 0x03
130#define DMA_CNT_2 0x05
131#define DMA_CNT_3 0x07
132#define DMA_CNT_4 0xC2
133#define DMA_CNT_5 0xC6
134#define DMA_CNT_6 0xCA
135#define DMA_CNT_7 0xCE
136
137#define DMA_PAGE_0 0x87 /* DMA page registers */
138#define DMA_PAGE_1 0x83
139#define DMA_PAGE_2 0x81
140#define DMA_PAGE_3 0x82
141#define DMA_PAGE_5 0x8B
142#define DMA_PAGE_6 0x89
143#define DMA_PAGE_7 0x8A
144
145/* I/O to memory, no autoinit, increment, single mode */
146#define DMA_MODE_READ 0x44
147/* memory to I/O, no autoinit, increment, single mode */
148#define DMA_MODE_WRITE 0x48
149/* pass thru DREQ->HRQ, DACK<-HLDA only */
150#define DMA_MODE_CASCADE 0xC0
151
152#define DMA_AUTOINIT 0x10
153
154
155extern spinlock_t dma_spin_lock;
156
157static __inline__ unsigned long claim_dma_lock(void)
158{
159 unsigned long flags;
160 spin_lock_irqsave(&dma_spin_lock, flags);
161 return flags;
162}
163
164static __inline__ void release_dma_lock(unsigned long flags)
165{
166 spin_unlock_irqrestore(&dma_spin_lock, flags);
167}
168
169/* enable/disable a specific DMA channel */
170static __inline__ void enable_dma(unsigned int dmanr)
171{
172 if (dmanr <= 3)
173 dma_outb(dmanr, DMA1_MASK_REG);
174 else
175 dma_outb(dmanr & 3, DMA2_MASK_REG);
176}
177
178static __inline__ void disable_dma(unsigned int dmanr)
179{
180 if (dmanr <= 3)
181 dma_outb(dmanr | 4, DMA1_MASK_REG);
182 else
183 dma_outb((dmanr & 3) | 4, DMA2_MASK_REG);
184}
185
186/* Clear the 'DMA Pointer Flip Flop'.
187 * Write 0 for LSB/MSB, 1 for MSB/LSB access.
188 * Use this once to initialize the FF to a known state.
189 * After that, keep track of it. :-)
190 * --- In order to do that, the DMA routines below should ---
191 * --- only be used while holding the DMA lock ! ---
192 */
193static __inline__ void clear_dma_ff(unsigned int dmanr)
194{
195 if (dmanr <= 3)
196 dma_outb(0, DMA1_CLEAR_FF_REG);
197 else
198 dma_outb(0, DMA2_CLEAR_FF_REG);
199}
200
201/* set mode (above) for a specific DMA channel */
202static __inline__ void set_dma_mode(unsigned int dmanr, char mode)
203{
204 if (dmanr <= 3)
205 dma_outb(mode | dmanr, DMA1_MODE_REG);
206 else
207 dma_outb(mode | (dmanr & 3), DMA2_MODE_REG);
208}
209
210/* Set only the page register bits of the transfer address.
211 * This is used for successive transfers when we know the contents of
212 * the lower 16 bits of the DMA current address register, but a 64k boundary
213 * may have been crossed.
214 */
215static __inline__ void set_dma_page(unsigned int dmanr, char pagenr)
216{
217 switch (dmanr) {
218 case 0:
219 dma_outb(pagenr, DMA_PAGE_0);
220 break;
221 case 1:
222 dma_outb(pagenr, DMA_PAGE_1);
223 break;
224 case 2:
225 dma_outb(pagenr, DMA_PAGE_2);
226 break;
227 case 3:
228 dma_outb(pagenr, DMA_PAGE_3);
229 break;
230 case 5:
231 dma_outb(pagenr & 0xfe, DMA_PAGE_5);
232 break;
233 case 6:
234 dma_outb(pagenr & 0xfe, DMA_PAGE_6);
235 break;
236 case 7:
237 dma_outb(pagenr & 0xfe, DMA_PAGE_7);
238 break;
239 }
240}
241
242
243/* Set transfer address & page bits for specific DMA channel.
244 * Assumes dma flipflop is clear.
245 */
246static __inline__ void set_dma_addr(unsigned int dmanr, unsigned int a)
247{
248 set_dma_page(dmanr, a>>16);
249 if (dmanr <= 3) {
250 dma_outb(a & 0xff, ((dmanr & 3) << 1) + IO_DMA1_BASE);
251 dma_outb((a >> 8) & 0xff, ((dmanr & 3) << 1) + IO_DMA1_BASE);
252 } else {
253 dma_outb((a >> 1) & 0xff, ((dmanr & 3) << 2) + IO_DMA2_BASE);
254 dma_outb((a >> 9) & 0xff, ((dmanr & 3) << 2) + IO_DMA2_BASE);
255 }
256}
257
258
259/* Set transfer size (max 64k for DMA0..3, 128k for DMA5..7) for
260 * a specific DMA channel.
261 * You must ensure the parameters are valid.
262 * NOTE: from a manual: "the number of transfers is one more
263 * than the initial word count"! This is taken into account.
264 * Assumes dma flip-flop is clear.
265 * NOTE 2: "count" represents _bytes_ and must be even for channels 5-7.
266 */
267static __inline__ void set_dma_count(unsigned int dmanr, unsigned int count)
268{
269 count--;
270 if (dmanr <= 3) {
271 dma_outb(count & 0xff, ((dmanr & 3) << 1) + 1 + IO_DMA1_BASE);
272 dma_outb((count >> 8) & 0xff,
273 ((dmanr & 3) << 1) + 1 + IO_DMA1_BASE);
274 } else {
275 dma_outb((count >> 1) & 0xff,
276 ((dmanr & 3) << 2) + 2 + IO_DMA2_BASE);
277 dma_outb((count >> 9) & 0xff,
278 ((dmanr & 3) << 2) + 2 + IO_DMA2_BASE);
279 }
280}
281
282
283/* Get DMA residue count. After a DMA transfer, this
284 * should return zero. Reading this while a DMA transfer is
285 * still in progress will return unpredictable results.
286 * If called before the channel has been used, it may return 1.
287 * Otherwise, it returns the number of _bytes_ left to transfer.
288 *
289 * Assumes DMA flip-flop is clear.
290 */
291static __inline__ int get_dma_residue(unsigned int dmanr)
292{
293 unsigned int io_port;
294 /* using short to get 16-bit wrap around */
295 unsigned short count;
296
297 io_port = (dmanr <= 3) ? ((dmanr & 3) << 1) + 1 + IO_DMA1_BASE
298 : ((dmanr & 3) << 2) + 2 + IO_DMA2_BASE;
299
300 count = 1 + dma_inb(io_port);
301 count += dma_inb(io_port) << 8;
302
303 return (dmanr <= 3) ? count : (count << 1);
304}
305
306
307/* These are in kernel/dma.c: */
308extern int request_dma(unsigned int dmanr, const char *device_id);
309extern void free_dma(unsigned int dmanr);
310
311/* From PCI */
312
313#ifdef CONFIG_PCI
314extern int isa_dma_bridge_buggy;
3#else 315#else
4# include "dma_64.h" 316#define isa_dma_bridge_buggy (0)
5#endif 317#endif
318
319#endif /* _ASM_X86_DMA_H */
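For readers unfamiliar with the 8237 helpers gathered above, the sketch below shows the conventional driver-side sequence for starting a transfer: reserve the channel, then program the flip-flop, mode, address and count while holding the DMA spinlock, and only then unmask the channel. This is a hedged example that compiles only in a kernel context; the channel number, buffer address, length and the "example-driver" name are all invented for illustration.

/*
 * Hedged sketch of programming an ISA DMA channel with the helpers above.
 * phys_buf must be a physical address below the 16MB ISA DMA limit, and
 * len is in bytes (kept even for the word channels 5-7).  The channel
 * should later be released with free_dma() once the transfer completes.
 */
#include <asm/dma.h>
#include <linux/errno.h>

static int example_start_isa_dma(unsigned int chan, unsigned int phys_buf,
				 unsigned int len)
{
	unsigned long flags;
	int err;

	err = request_dma(chan, "example-driver");       /* reserve the channel      */
	if (err)
		return err;

	flags = claim_dma_lock();
	disable_dma(chan);                               /* mask while reprogramming */
	clear_dma_ff(chan);                              /* reset LSB/MSB flip-flop  */
	set_dma_mode(chan, DMA_MODE_READ);               /* device -> memory         */
	set_dma_addr(chan, phys_buf);                    /* address + page register  */
	set_dma_count(chan, len);                        /* byte count               */
	enable_dma(chan);
	release_dma_lock(flags);

	return 0;
}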
diff --git a/include/asm-x86/dma_32.h b/include/asm-x86/dma_32.h
deleted file mode 100644
index d23aac8e1a50..000000000000
--- a/include/asm-x86/dma_32.h
+++ /dev/null
@@ -1,297 +0,0 @@
1/* $Id: dma.h,v 1.7 1992/12/14 00:29:34 root Exp root $
2 * linux/include/asm/dma.h: Defines for using and allocating dma channels.
3 * Written by Hennus Bergman, 1992.
4 * High DMA channel support & info by Hannu Savolainen
5 * and John Boyd, Nov. 1992.
6 */
7
8#ifndef _ASM_DMA_H
9#define _ASM_DMA_H
10
11#include <linux/spinlock.h> /* And spinlocks */
12#include <asm/io.h> /* need byte IO */
13#include <linux/delay.h>
14
15
16#ifdef HAVE_REALLY_SLOW_DMA_CONTROLLER
17#define dma_outb outb_p
18#else
19#define dma_outb outb
20#endif
21
22#define dma_inb inb
23
24/*
25 * NOTES about DMA transfers:
26 *
27 * controller 1: channels 0-3, byte operations, ports 00-1F
28 * controller 2: channels 4-7, word operations, ports C0-DF
29 *
30 * - ALL registers are 8 bits only, regardless of transfer size
31 * - channel 4 is not used - cascades 1 into 2.
32 * - channels 0-3 are byte - addresses/counts are for physical bytes
33 * - channels 5-7 are word - addresses/counts are for physical words
34 * - transfers must not cross physical 64K (0-3) or 128K (5-7) boundaries
35 * - transfer count loaded to registers is 1 less than actual count
36 * - controller 2 offsets are all even (2x offsets for controller 1)
37 * - page registers for 5-7 don't use data bit 0, represent 128K pages
38 * - page registers for 0-3 use bit 0, represent 64K pages
39 *
40 * DMA transfers are limited to the lower 16MB of _physical_ memory.
41 * Note that addresses loaded into registers must be _physical_ addresses,
42 * not logical addresses (which may differ if paging is active).
43 *
44 * Address mapping for channels 0-3:
45 *
46 * A23 ... A16 A15 ... A8 A7 ... A0 (Physical addresses)
47 * | ... | | ... | | ... |
48 * | ... | | ... | | ... |
49 * | ... | | ... | | ... |
50 * P7 ... P0 A7 ... A0 A7 ... A0
51 * | Page | Addr MSB | Addr LSB | (DMA registers)
52 *
53 * Address mapping for channels 5-7:
54 *
55 * A23 ... A17 A16 A15 ... A9 A8 A7 ... A1 A0 (Physical addresses)
56 * | ... | \ \ ... \ \ \ ... \ \
57 * | ... | \ \ ... \ \ \ ... \ (not used)
58 * | ... | \ \ ... \ \ \ ... \
59 * P7 ... P1 (0) A7 A6 ... A0 A7 A6 ... A0
60 * | Page | Addr MSB | Addr LSB | (DMA registers)
61 *
62 * Again, channels 5-7 transfer _physical_ words (16 bits), so addresses
63 * and counts _must_ be word-aligned (the lowest address bit is _ignored_ at
64 * the hardware level, so odd-byte transfers aren't possible).
65 *
66 * Transfer count (_not # bytes_) is limited to 64K, represented as actual
67 * count - 1 : 64K => 0xFFFF, 1 => 0x0000. Thus, count is always 1 or more,
68 * and up to 128K bytes may be transferred on channels 5-7 in one operation.
69 *
70 */
71
72#define MAX_DMA_CHANNELS 8
73
74/* The maximum address that we can perform a DMA transfer to on this platform */
75#define MAX_DMA_ADDRESS (PAGE_OFFSET+0x1000000)
76
77/* 8237 DMA controllers */
78#define IO_DMA1_BASE 0x00 /* 8 bit slave DMA, channels 0..3 */
79#define IO_DMA2_BASE 0xC0 /* 16 bit master DMA, ch 4(=slave input)..7 */
80
81/* DMA controller registers */
82#define DMA1_CMD_REG 0x08 /* command register (w) */
83#define DMA1_STAT_REG 0x08 /* status register (r) */
84#define DMA1_REQ_REG 0x09 /* request register (w) */
85#define DMA1_MASK_REG 0x0A /* single-channel mask (w) */
86#define DMA1_MODE_REG 0x0B /* mode register (w) */
87#define DMA1_CLEAR_FF_REG 0x0C /* clear pointer flip-flop (w) */
88#define DMA1_TEMP_REG 0x0D /* Temporary Register (r) */
89#define DMA1_RESET_REG 0x0D /* Master Clear (w) */
90#define DMA1_CLR_MASK_REG 0x0E /* Clear Mask */
91#define DMA1_MASK_ALL_REG 0x0F /* all-channels mask (w) */
92
93#define DMA2_CMD_REG 0xD0 /* command register (w) */
94#define DMA2_STAT_REG 0xD0 /* status register (r) */
95#define DMA2_REQ_REG 0xD2 /* request register (w) */
96#define DMA2_MASK_REG 0xD4 /* single-channel mask (w) */
97#define DMA2_MODE_REG 0xD6 /* mode register (w) */
98#define DMA2_CLEAR_FF_REG 0xD8 /* clear pointer flip-flop (w) */
99#define DMA2_TEMP_REG 0xDA /* Temporary Register (r) */
100#define DMA2_RESET_REG 0xDA /* Master Clear (w) */
101#define DMA2_CLR_MASK_REG 0xDC /* Clear Mask */
102#define DMA2_MASK_ALL_REG 0xDE /* all-channels mask (w) */
103
104#define DMA_ADDR_0 0x00 /* DMA address registers */
105#define DMA_ADDR_1 0x02
106#define DMA_ADDR_2 0x04
107#define DMA_ADDR_3 0x06
108#define DMA_ADDR_4 0xC0
109#define DMA_ADDR_5 0xC4
110#define DMA_ADDR_6 0xC8
111#define DMA_ADDR_7 0xCC
112
113#define DMA_CNT_0 0x01 /* DMA count registers */
114#define DMA_CNT_1 0x03
115#define DMA_CNT_2 0x05
116#define DMA_CNT_3 0x07
117#define DMA_CNT_4 0xC2
118#define DMA_CNT_5 0xC6
119#define DMA_CNT_6 0xCA
120#define DMA_CNT_7 0xCE
121
122#define DMA_PAGE_0 0x87 /* DMA page registers */
123#define DMA_PAGE_1 0x83
124#define DMA_PAGE_2 0x81
125#define DMA_PAGE_3 0x82
126#define DMA_PAGE_5 0x8B
127#define DMA_PAGE_6 0x89
128#define DMA_PAGE_7 0x8A
129
130#define DMA_MODE_READ 0x44 /* I/O to memory, no autoinit, increment, single mode */
131#define DMA_MODE_WRITE 0x48 /* memory to I/O, no autoinit, increment, single mode */
132#define DMA_MODE_CASCADE 0xC0 /* pass thru DREQ->HRQ, DACK<-HLDA only */
133
134#define DMA_AUTOINIT 0x10
135
136
137extern spinlock_t dma_spin_lock;
138
139static __inline__ unsigned long claim_dma_lock(void)
140{
141 unsigned long flags;
142 spin_lock_irqsave(&dma_spin_lock, flags);
143 return flags;
144}
145
146static __inline__ void release_dma_lock(unsigned long flags)
147{
148 spin_unlock_irqrestore(&dma_spin_lock, flags);
149}
150
151/* enable/disable a specific DMA channel */
152static __inline__ void enable_dma(unsigned int dmanr)
153{
154 if (dmanr<=3)
155 dma_outb(dmanr, DMA1_MASK_REG);
156 else
157 dma_outb(dmanr & 3, DMA2_MASK_REG);
158}
159
160static __inline__ void disable_dma(unsigned int dmanr)
161{
162 if (dmanr<=3)
163 dma_outb(dmanr | 4, DMA1_MASK_REG);
164 else
165 dma_outb((dmanr & 3) | 4, DMA2_MASK_REG);
166}
167
168/* Clear the 'DMA Pointer Flip Flop'.
169 * Write 0 for LSB/MSB, 1 for MSB/LSB access.
170 * Use this once to initialize the FF to a known state.
171 * After that, keep track of it. :-)
172 * --- In order to do that, the DMA routines below should ---
173 * --- only be used while holding the DMA lock ! ---
174 */
175static __inline__ void clear_dma_ff(unsigned int dmanr)
176{
177 if (dmanr<=3)
178 dma_outb(0, DMA1_CLEAR_FF_REG);
179 else
180 dma_outb(0, DMA2_CLEAR_FF_REG);
181}
182
183/* set mode (above) for a specific DMA channel */
184static __inline__ void set_dma_mode(unsigned int dmanr, char mode)
185{
186 if (dmanr<=3)
187 dma_outb(mode | dmanr, DMA1_MODE_REG);
188 else
189 dma_outb(mode | (dmanr&3), DMA2_MODE_REG);
190}
191
192/* Set only the page register bits of the transfer address.
193 * This is used for successive transfers when we know the contents of
194 * the lower 16 bits of the DMA current address register, but a 64k boundary
195 * may have been crossed.
196 */
197static __inline__ void set_dma_page(unsigned int dmanr, char pagenr)
198{
199 switch(dmanr) {
200 case 0:
201 dma_outb(pagenr, DMA_PAGE_0);
202 break;
203 case 1:
204 dma_outb(pagenr, DMA_PAGE_1);
205 break;
206 case 2:
207 dma_outb(pagenr, DMA_PAGE_2);
208 break;
209 case 3:
210 dma_outb(pagenr, DMA_PAGE_3);
211 break;
212 case 5:
213 dma_outb(pagenr & 0xfe, DMA_PAGE_5);
214 break;
215 case 6:
216 dma_outb(pagenr & 0xfe, DMA_PAGE_6);
217 break;
218 case 7:
219 dma_outb(pagenr & 0xfe, DMA_PAGE_7);
220 break;
221 }
222}
223
224
225/* Set transfer address & page bits for specific DMA channel.
226 * Assumes dma flipflop is clear.
227 */
228static __inline__ void set_dma_addr(unsigned int dmanr, unsigned int a)
229{
230 set_dma_page(dmanr, a>>16);
231 if (dmanr <= 3) {
232 dma_outb( a & 0xff, ((dmanr&3)<<1) + IO_DMA1_BASE );
233 dma_outb( (a>>8) & 0xff, ((dmanr&3)<<1) + IO_DMA1_BASE );
234 } else {
235 dma_outb( (a>>1) & 0xff, ((dmanr&3)<<2) + IO_DMA2_BASE );
236 dma_outb( (a>>9) & 0xff, ((dmanr&3)<<2) + IO_DMA2_BASE );
237 }
238}
239
240
241/* Set transfer size (max 64k for DMA0..3, 128k for DMA5..7) for
242 * a specific DMA channel.
243 * You must ensure the parameters are valid.
244 * NOTE: from a manual: "the number of transfers is one more
245 * than the initial word count"! This is taken into account.
246 * Assumes dma flip-flop is clear.
247 * NOTE 2: "count" represents _bytes_ and must be even for channels 5-7.
248 */
249static __inline__ void set_dma_count(unsigned int dmanr, unsigned int count)
250{
251 count--;
252 if (dmanr <= 3) {
253 dma_outb( count & 0xff, ((dmanr&3)<<1) + 1 + IO_DMA1_BASE );
254 dma_outb( (count>>8) & 0xff, ((dmanr&3)<<1) + 1 + IO_DMA1_BASE );
255 } else {
256 dma_outb( (count>>1) & 0xff, ((dmanr&3)<<2) + 2 + IO_DMA2_BASE );
257 dma_outb( (count>>9) & 0xff, ((dmanr&3)<<2) + 2 + IO_DMA2_BASE );
258 }
259}
260
261
262/* Get DMA residue count. After a DMA transfer, this
263 * should return zero. Reading this while a DMA transfer is
264 * still in progress will return unpredictable results.
265 * If called before the channel has been used, it may return 1.
266 * Otherwise, it returns the number of _bytes_ left to transfer.
267 *
268 * Assumes DMA flip-flop is clear.
269 */
270static __inline__ int get_dma_residue(unsigned int dmanr)
271{
272 unsigned int io_port = (dmanr<=3)? ((dmanr&3)<<1) + 1 + IO_DMA1_BASE
273 : ((dmanr&3)<<2) + 2 + IO_DMA2_BASE;
274
275 /* using short to get 16-bit wrap around */
276 unsigned short count;
277
278 count = 1 + dma_inb(io_port);
279 count += dma_inb(io_port) << 8;
280
281 return (dmanr<=3)? count : (count<<1);
282}
283
284
285/* These are in kernel/dma.c: */
286extern int request_dma(unsigned int dmanr, const char * device_id); /* reserve a DMA channel */
287extern void free_dma(unsigned int dmanr); /* release it again */
288
289/* From PCI */
290
291#ifdef CONFIG_PCI
292extern int isa_dma_bridge_buggy;
293#else
294#define isa_dma_bridge_buggy (0)
295#endif
296
297#endif /* _ASM_DMA_H */
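
Taken together, the helpers in this header (and in the unified dma.h that replaces it) are meant to be used as one sequence under the DMA spinlock. A minimal sketch of programming a channel for a single device-to-memory transfer; the channel number, buffer address and length are made-up values, and request_dma()/free_dma() bookkeeping as well as error handling are omitted:

#include <asm/dma.h>		/* the header shown above */

/* Sketch only: set up one ISA DMA channel for a single transfer. */
static void isa_dma_setup_sketch(void)
{
	unsigned long flags;
	unsigned int chan = 3;			/* hypothetical 8-bit channel */
	unsigned int buf_phys = 0x100000;	/* hypothetical, below 16MB */
	unsigned int len = 4096;		/* bytes */

	flags = claim_dma_lock();
	disable_dma(chan);
	clear_dma_ff(chan);			/* known flip-flop state */
	set_dma_mode(chan, DMA_MODE_READ);	/* I/O to memory */
	set_dma_addr(chan, buf_phys);
	set_dma_count(chan, len);
	enable_dma(chan);
	release_dma_lock(flags);
}
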
diff --git a/include/asm-x86/dma_64.h b/include/asm-x86/dma_64.h
deleted file mode 100644
index a37c16f06289..000000000000
--- a/include/asm-x86/dma_64.h
+++ /dev/null
@@ -1,304 +0,0 @@
1/*
2 * linux/include/asm/dma.h: Defines for using and allocating dma channels.
3 * Written by Hennus Bergman, 1992.
4 * High DMA channel support & info by Hannu Savolainen
5 * and John Boyd, Nov. 1992.
6 */
7
8#ifndef _ASM_DMA_H
9#define _ASM_DMA_H
10
11#include <linux/spinlock.h> /* And spinlocks */
12#include <asm/io.h> /* need byte IO */
13#include <linux/delay.h>
14
15
16#ifdef HAVE_REALLY_SLOW_DMA_CONTROLLER
17#define dma_outb outb_p
18#else
19#define dma_outb outb
20#endif
21
22#define dma_inb inb
23
24/*
25 * NOTES about DMA transfers:
26 *
27 * controller 1: channels 0-3, byte operations, ports 00-1F
28 * controller 2: channels 4-7, word operations, ports C0-DF
29 *
30 * - ALL registers are 8 bits only, regardless of transfer size
31 * - channel 4 is not used - cascades 1 into 2.
32 * - channels 0-3 are byte - addresses/counts are for physical bytes
33 * - channels 5-7 are word - addresses/counts are for physical words
34 * - transfers must not cross physical 64K (0-3) or 128K (5-7) boundaries
35 * - transfer count loaded to registers is 1 less than actual count
36 * - controller 2 offsets are all even (2x offsets for controller 1)
37 * - page registers for 5-7 don't use data bit 0, represent 128K pages
38 * - page registers for 0-3 use bit 0, represent 64K pages
39 *
40 * DMA transfers are limited to the lower 16MB of _physical_ memory.
41 * Note that addresses loaded into registers must be _physical_ addresses,
42 * not logical addresses (which may differ if paging is active).
43 *
44 * Address mapping for channels 0-3:
45 *
46 * A23 ... A16 A15 ... A8 A7 ... A0 (Physical addresses)
47 * | ... | | ... | | ... |
48 * | ... | | ... | | ... |
49 * | ... | | ... | | ... |
50 * P7 ... P0 A7 ... A0 A7 ... A0
51 * | Page | Addr MSB | Addr LSB | (DMA registers)
52 *
53 * Address mapping for channels 5-7:
54 *
55 * A23 ... A17 A16 A15 ... A9 A8 A7 ... A1 A0 (Physical addresses)
56 * | ... | \ \ ... \ \ \ ... \ \
57 * | ... | \ \ ... \ \ \ ... \ (not used)
58 * | ... | \ \ ... \ \ \ ... \
59 * P7 ... P1 (0) A7 A6 ... A0 A7 A6 ... A0
60 * | Page | Addr MSB | Addr LSB | (DMA registers)
61 *
62 * Again, channels 5-7 transfer _physical_ words (16 bits), so addresses
63 * and counts _must_ be word-aligned (the lowest address bit is _ignored_ at
64 * the hardware level, so odd-byte transfers aren't possible).
65 *
66 * Transfer count (_not # bytes_) is limited to 64K, represented as actual
67 * count - 1 : 64K => 0xFFFF, 1 => 0x0000. Thus, count is always 1 or more,
68 * and up to 128K bytes may be transferred on channels 5-7 in one operation.
69 *
70 */
71
72#define MAX_DMA_CHANNELS 8
73
74
75/* 16MB ISA DMA zone */
76#define MAX_DMA_PFN ((16*1024*1024) >> PAGE_SHIFT)
77
78/* 4GB broken PCI/AGP hardware bus master zone */
79#define MAX_DMA32_PFN ((4UL*1024*1024*1024) >> PAGE_SHIFT)
80
81/* Compat define for old dma zone */
82#define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT))
83
84/* 8237 DMA controllers */
85#define IO_DMA1_BASE 0x00 /* 8 bit slave DMA, channels 0..3 */
86#define IO_DMA2_BASE 0xC0 /* 16 bit master DMA, ch 4(=slave input)..7 */
87
88/* DMA controller registers */
89#define DMA1_CMD_REG 0x08 /* command register (w) */
90#define DMA1_STAT_REG 0x08 /* status register (r) */
91#define DMA1_REQ_REG 0x09 /* request register (w) */
92#define DMA1_MASK_REG 0x0A /* single-channel mask (w) */
93#define DMA1_MODE_REG 0x0B /* mode register (w) */
94#define DMA1_CLEAR_FF_REG 0x0C /* clear pointer flip-flop (w) */
95#define DMA1_TEMP_REG 0x0D /* Temporary Register (r) */
96#define DMA1_RESET_REG 0x0D /* Master Clear (w) */
97#define DMA1_CLR_MASK_REG 0x0E /* Clear Mask */
98#define DMA1_MASK_ALL_REG 0x0F /* all-channels mask (w) */
99
100#define DMA2_CMD_REG 0xD0 /* command register (w) */
101#define DMA2_STAT_REG 0xD0 /* status register (r) */
102#define DMA2_REQ_REG 0xD2 /* request register (w) */
103#define DMA2_MASK_REG 0xD4 /* single-channel mask (w) */
104#define DMA2_MODE_REG 0xD6 /* mode register (w) */
105#define DMA2_CLEAR_FF_REG 0xD8 /* clear pointer flip-flop (w) */
106#define DMA2_TEMP_REG 0xDA /* Temporary Register (r) */
107#define DMA2_RESET_REG 0xDA /* Master Clear (w) */
108#define DMA2_CLR_MASK_REG 0xDC /* Clear Mask */
109#define DMA2_MASK_ALL_REG 0xDE /* all-channels mask (w) */
110
111#define DMA_ADDR_0 0x00 /* DMA address registers */
112#define DMA_ADDR_1 0x02
113#define DMA_ADDR_2 0x04
114#define DMA_ADDR_3 0x06
115#define DMA_ADDR_4 0xC0
116#define DMA_ADDR_5 0xC4
117#define DMA_ADDR_6 0xC8
118#define DMA_ADDR_7 0xCC
119
120#define DMA_CNT_0 0x01 /* DMA count registers */
121#define DMA_CNT_1 0x03
122#define DMA_CNT_2 0x05
123#define DMA_CNT_3 0x07
124#define DMA_CNT_4 0xC2
125#define DMA_CNT_5 0xC6
126#define DMA_CNT_6 0xCA
127#define DMA_CNT_7 0xCE
128
129#define DMA_PAGE_0 0x87 /* DMA page registers */
130#define DMA_PAGE_1 0x83
131#define DMA_PAGE_2 0x81
132#define DMA_PAGE_3 0x82
133#define DMA_PAGE_5 0x8B
134#define DMA_PAGE_6 0x89
135#define DMA_PAGE_7 0x8A
136
137#define DMA_MODE_READ 0x44 /* I/O to memory, no autoinit, increment, single mode */
138#define DMA_MODE_WRITE 0x48 /* memory to I/O, no autoinit, increment, single mode */
139#define DMA_MODE_CASCADE 0xC0 /* pass thru DREQ->HRQ, DACK<-HLDA only */
140
141#define DMA_AUTOINIT 0x10
142
143
144extern spinlock_t dma_spin_lock;
145
146static __inline__ unsigned long claim_dma_lock(void)
147{
148 unsigned long flags;
149 spin_lock_irqsave(&dma_spin_lock, flags);
150 return flags;
151}
152
153static __inline__ void release_dma_lock(unsigned long flags)
154{
155 spin_unlock_irqrestore(&dma_spin_lock, flags);
156}
157
158/* enable/disable a specific DMA channel */
159static __inline__ void enable_dma(unsigned int dmanr)
160{
161 if (dmanr<=3)
162 dma_outb(dmanr, DMA1_MASK_REG);
163 else
164 dma_outb(dmanr & 3, DMA2_MASK_REG);
165}
166
167static __inline__ void disable_dma(unsigned int dmanr)
168{
169 if (dmanr<=3)
170 dma_outb(dmanr | 4, DMA1_MASK_REG);
171 else
172 dma_outb((dmanr & 3) | 4, DMA2_MASK_REG);
173}
174
175/* Clear the 'DMA Pointer Flip Flop'.
176 * Write 0 for LSB/MSB, 1 for MSB/LSB access.
177 * Use this once to initialize the FF to a known state.
178 * After that, keep track of it. :-)
179 * --- In order to do that, the DMA routines below should ---
180 * --- only be used while holding the DMA lock ! ---
181 */
182static __inline__ void clear_dma_ff(unsigned int dmanr)
183{
184 if (dmanr<=3)
185 dma_outb(0, DMA1_CLEAR_FF_REG);
186 else
187 dma_outb(0, DMA2_CLEAR_FF_REG);
188}
189
190/* set mode (above) for a specific DMA channel */
191static __inline__ void set_dma_mode(unsigned int dmanr, char mode)
192{
193 if (dmanr<=3)
194 dma_outb(mode | dmanr, DMA1_MODE_REG);
195 else
196 dma_outb(mode | (dmanr&3), DMA2_MODE_REG);
197}
198
199/* Set only the page register bits of the transfer address.
200 * This is used for successive transfers when we know the contents of
201 * the lower 16 bits of the DMA current address register, but a 64k boundary
202 * may have been crossed.
203 */
204static __inline__ void set_dma_page(unsigned int dmanr, char pagenr)
205{
206 switch(dmanr) {
207 case 0:
208 dma_outb(pagenr, DMA_PAGE_0);
209 break;
210 case 1:
211 dma_outb(pagenr, DMA_PAGE_1);
212 break;
213 case 2:
214 dma_outb(pagenr, DMA_PAGE_2);
215 break;
216 case 3:
217 dma_outb(pagenr, DMA_PAGE_3);
218 break;
219 case 5:
220 dma_outb(pagenr & 0xfe, DMA_PAGE_5);
221 break;
222 case 6:
223 dma_outb(pagenr & 0xfe, DMA_PAGE_6);
224 break;
225 case 7:
226 dma_outb(pagenr & 0xfe, DMA_PAGE_7);
227 break;
228 }
229}
230
231
232/* Set transfer address & page bits for specific DMA channel.
233 * Assumes dma flipflop is clear.
234 */
235static __inline__ void set_dma_addr(unsigned int dmanr, unsigned int a)
236{
237 set_dma_page(dmanr, a>>16);
238 if (dmanr <= 3) {
239 dma_outb( a & 0xff, ((dmanr&3)<<1) + IO_DMA1_BASE );
240 dma_outb( (a>>8) & 0xff, ((dmanr&3)<<1) + IO_DMA1_BASE );
241 } else {
242 dma_outb( (a>>1) & 0xff, ((dmanr&3)<<2) + IO_DMA2_BASE );
243 dma_outb( (a>>9) & 0xff, ((dmanr&3)<<2) + IO_DMA2_BASE );
244 }
245}
246
247
248/* Set transfer size (max 64k for DMA1..3, 128k for DMA5..7) for
249 * a specific DMA channel.
250 * You must ensure the parameters are valid.
251 * NOTE: from a manual: "the number of transfers is one more
252 * than the initial word count"! This is taken into account.
253 * Assumes dma flip-flop is clear.
254 * NOTE 2: "count" represents _bytes_ and must be even for channels 5-7.
255 */
256static __inline__ void set_dma_count(unsigned int dmanr, unsigned int count)
257{
258 count--;
259 if (dmanr <= 3) {
260 dma_outb( count & 0xff, ((dmanr&3)<<1) + 1 + IO_DMA1_BASE );
261 dma_outb( (count>>8) & 0xff, ((dmanr&3)<<1) + 1 + IO_DMA1_BASE );
262 } else {
263 dma_outb( (count>>1) & 0xff, ((dmanr&3)<<2) + 2 + IO_DMA2_BASE );
264 dma_outb( (count>>9) & 0xff, ((dmanr&3)<<2) + 2 + IO_DMA2_BASE );
265 }
266}
267
268
269/* Get DMA residue count. After a DMA transfer, this
270 * should return zero. Reading this while a DMA transfer is
271 * still in progress will return unpredictable results.
272 * If called before the channel has been used, it may return 1.
273 * Otherwise, it returns the number of _bytes_ left to transfer.
274 *
275 * Assumes DMA flip-flop is clear.
276 */
277static __inline__ int get_dma_residue(unsigned int dmanr)
278{
279 unsigned int io_port = (dmanr<=3)? ((dmanr&3)<<1) + 1 + IO_DMA1_BASE
280 : ((dmanr&3)<<2) + 2 + IO_DMA2_BASE;
281
282 /* using short to get 16-bit wrap around */
283 unsigned short count;
284
285 count = 1 + dma_inb(io_port);
286 count += dma_inb(io_port) << 8;
287
288 return (dmanr<=3)? count : (count<<1);
289}
290
291
292/* These are in kernel/dma.c: */
293extern int request_dma(unsigned int dmanr, const char * device_id); /* reserve a DMA channel */
294extern void free_dma(unsigned int dmanr); /* release it again */
295
296/* From PCI */
297
298#ifdef CONFIG_PCI
299extern int isa_dma_bridge_buggy;
300#else
301#define isa_dma_bridge_buggy (0)
302#endif
303
304#endif /* _ASM_DMA_H */
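
The set_dma_addr() arithmetic for the word channels follows the mapping diagram in the NOTES block above: bit 0 of the physical address is dropped, A8..A1 and A16..A9 go to the two 8-bit address registers, and A23..A17 land in the page register with its bit 0 masked off. A worked example with an arbitrary address:

#include <stdio.h>

/* Worked example of the channel 5-7 address split performed by
 * set_dma_page()/set_dma_addr(); the address is arbitrary.
 */
int main(void)
{
	unsigned int a = 0x123456;		/* hypothetical physical address */
	unsigned char page = (a >> 16) & 0xfe;	/* P7..P1, bit 0 unused */
	unsigned char msb  = (a >> 9) & 0xff;	/* A16..A9 */
	unsigned char lsb  = (a >> 1) & 0xff;	/* A8..A1 */

	/* prints page=0x12 msb=0x1a lsb=0x2b */
	printf("page=0x%02x msb=0x%02x lsb=0x%02x\n", page, msb, lsb);
	return 0;
}
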
diff --git a/include/asm-x86/dmi.h b/include/asm-x86/dmi.h
index 8e2b0e6aa8e7..1241e6ad1935 100644
--- a/include/asm-x86/dmi.h
+++ b/include/asm-x86/dmi.h
@@ -5,9 +5,6 @@
5 5
6#ifdef CONFIG_X86_32 6#ifdef CONFIG_X86_32
7 7
8/* Use early IO mappings for DMI because it's initialized early */
9#define dmi_ioremap bt_ioremap
10#define dmi_iounmap bt_iounmap
11#define dmi_alloc alloc_bootmem 8#define dmi_alloc alloc_bootmem
12 9
13#else /* CONFIG_X86_32 */ 10#else /* CONFIG_X86_32 */
@@ -22,14 +19,15 @@ extern char dmi_alloc_data[DMI_MAX_DATA];
22static inline void *dmi_alloc(unsigned len) 19static inline void *dmi_alloc(unsigned len)
23{ 20{
24 int idx = dmi_alloc_index; 21 int idx = dmi_alloc_index;
25 if ((dmi_alloc_index += len) > DMI_MAX_DATA) 22 if ((dmi_alloc_index + len) > DMI_MAX_DATA)
26 return NULL; 23 return NULL;
24 dmi_alloc_index += len;
27 return dmi_alloc_data + idx; 25 return dmi_alloc_data + idx;
28} 26}
29 27
28#endif
29
30#define dmi_ioremap early_ioremap 30#define dmi_ioremap early_ioremap
31#define dmi_iounmap early_iounmap 31#define dmi_iounmap early_iounmap
32 32
33#endif 33#endif
34
35#endif
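
The dmi_alloc() change above is behavioural, not cosmetic: the old code bumped dmi_alloc_index as a side effect of the bounds check, so a failed allocation still consumed part of the DMI_MAX_DATA budget. A hypothetical trace (both numbers made up) shows the difference; assume DMI_MAX_DATA is 2048 and dmi_alloc_index is 2000:

	dmi_alloc(100)  old: index += 100 -> 2100 > 2048, returns NULL,
	                     but 100 entries of the budget are gone for good
	                new: 2000 + 100 > 2048, returns NULL, index stays 2000
	dmi_alloc(16)   old: 2100 + 16 > 2048, fails although space was free
	                new: 2000 + 16 <= 2048, succeeds
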
diff --git a/include/asm-x86/ds.h b/include/asm-x86/ds.h
new file mode 100644
index 000000000000..7881368142fa
--- /dev/null
+++ b/include/asm-x86/ds.h
@@ -0,0 +1,72 @@
1/*
2 * Debug Store (DS) support
3 *
4 * This provides a low-level interface to the hardware's Debug Store
5 * feature that is used for last branch recording (LBR) and
6 * precise-event based sampling (PEBS).
7 *
8 * Different architectures use a different DS layout/pointer size.
9 * The below functions therefore work on a void*.
10 *
11 *
12 * Since there is no user for PEBS, yet, only LBR (or branch
13 * trace store, BTS) is supported.
14 *
15 *
16 * Copyright (C) 2007 Intel Corporation.
17 * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
18 */
19
20#ifndef _ASM_X86_DS_H
21#define _ASM_X86_DS_H
22
23#include <linux/types.h>
24#include <linux/init.h>
25
26struct cpuinfo_x86;
27
28
29/* a branch trace record entry
30 *
31 * In order to unify the interface between various processor versions,
32 * we use the below data structure for all processors.
33 */
34enum bts_qualifier {
35 BTS_INVALID = 0,
36 BTS_BRANCH,
37 BTS_TASK_ARRIVES,
38 BTS_TASK_DEPARTS
39};
40
41struct bts_struct {
42 u64 qualifier;
43 union {
44 /* BTS_BRANCH */
45 struct {
46 u64 from_ip;
47 u64 to_ip;
48 } lbr;
49 /* BTS_TASK_ARRIVES or
50 BTS_TASK_DEPARTS */
51 u64 jiffies;
52 } variant;
53};
54
55/* Overflow handling mechanisms */
56#define DS_O_SIGNAL 1 /* send overflow signal */
57#define DS_O_WRAP 2 /* wrap around */
58
59extern int ds_allocate(void **, size_t);
60extern int ds_free(void **);
61extern int ds_get_bts_size(void *);
62extern int ds_get_bts_end(void *);
63extern int ds_get_bts_index(void *);
64extern int ds_set_overflow(void *, int);
65extern int ds_get_overflow(void *);
66extern int ds_clear(void *);
67extern int ds_read_bts(void *, int, struct bts_struct *);
68extern int ds_write_bts(void *, const struct bts_struct *);
69extern unsigned long ds_debugctl_mask(void);
70extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *c);
71
72#endif /* _ASM_X86_DS_H */
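
The new header only declares the interface; the following is a minimal sketch of how a caller might drive it, using nothing beyond the functions and types declared above. The buffer size and overflow policy are arbitrary choices, the function name is hypothetical, and error handling is reduced to early exits:

#include <linux/kernel.h>
#include <asm/ds.h>

/* Sketch: allocate a BTS buffer, let it wrap on overflow, then
 * read back whatever branch records were captured.
 */
static void bts_dump_sketch(void)
{
	void *ds = NULL;
	struct bts_struct rec;
	int i, n;

	if (ds_allocate(&ds, 4096))		/* hypothetical size */
		return;
	if (ds_set_overflow(ds, DS_O_WRAP))	/* overwrite oldest records */
		goto out;

	/* ... tracing runs here ... */

	n = ds_get_bts_index(ds);		/* records currently buffered */
	for (i = 0; i < n; i++) {
		if (ds_read_bts(ds, i, &rec))
			break;
		if (rec.qualifier == BTS_BRANCH)
			printk(KERN_DEBUG "branch %llx -> %llx\n",
			       (unsigned long long)rec.variant.lbr.from_ip,
			       (unsigned long long)rec.variant.lbr.to_ip);
	}
out:
	ds_free(&ds);
}
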
diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h
index 3e214f39fad3..7004251fc66b 100644
--- a/include/asm-x86/e820.h
+++ b/include/asm-x86/e820.h
@@ -22,6 +22,12 @@ struct e820map {
22}; 22};
23#endif /* __ASSEMBLY__ */ 23#endif /* __ASSEMBLY__ */
24 24
25#define ISA_START_ADDRESS 0xa0000
26#define ISA_END_ADDRESS 0x100000
27
28#define BIOS_BEGIN 0x000a0000
29#define BIOS_END 0x00100000
30
25#ifdef __KERNEL__ 31#ifdef __KERNEL__
26#ifdef CONFIG_X86_32 32#ifdef CONFIG_X86_32
27# include "e820_32.h" 33# include "e820_32.h"
diff --git a/include/asm-x86/e820_32.h b/include/asm-x86/e820_32.h
index 03f60c690c8a..f1da7ebd1905 100644
--- a/include/asm-x86/e820_32.h
+++ b/include/asm-x86/e820_32.h
@@ -12,20 +12,28 @@
12#ifndef __E820_HEADER 12#ifndef __E820_HEADER
13#define __E820_HEADER 13#define __E820_HEADER
14 14
15#include <linux/ioport.h>
16
15#define HIGH_MEMORY (1024*1024) 17#define HIGH_MEMORY (1024*1024)
16 18
17#ifndef __ASSEMBLY__ 19#ifndef __ASSEMBLY__
18 20
19extern struct e820map e820; 21extern struct e820map e820;
22extern void update_e820(void);
20 23
21extern int e820_all_mapped(unsigned long start, unsigned long end, 24extern int e820_all_mapped(unsigned long start, unsigned long end,
22 unsigned type); 25 unsigned type);
23extern int e820_any_mapped(u64 start, u64 end, unsigned type); 26extern int e820_any_mapped(u64 start, u64 end, unsigned type);
24extern void find_max_pfn(void); 27extern void find_max_pfn(void);
25extern void register_bootmem_low_pages(unsigned long max_low_pfn); 28extern void register_bootmem_low_pages(unsigned long max_low_pfn);
29extern void add_memory_region(unsigned long long start,
30 unsigned long long size, int type);
26extern void e820_register_memory(void); 31extern void e820_register_memory(void);
27extern void limit_regions(unsigned long long size); 32extern void limit_regions(unsigned long long size);
28extern void print_memory_map(char *who); 33extern void print_memory_map(char *who);
34extern void init_iomem_resources(struct resource *code_resource,
35 struct resource *data_resource,
36 struct resource *bss_resource);
29 37
30#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION) 38#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
31extern void e820_mark_nosave_regions(void); 39extern void e820_mark_nosave_regions(void);
@@ -35,5 +43,6 @@ static inline void e820_mark_nosave_regions(void)
35} 43}
36#endif 44#endif
37 45
46
38#endif/*!__ASSEMBLY__*/ 47#endif/*!__ASSEMBLY__*/
39#endif/*__E820_HEADER*/ 48#endif/*__E820_HEADER*/
diff --git a/include/asm-x86/e820_64.h b/include/asm-x86/e820_64.h
index 0bd4787a5d57..51e4170f9ca5 100644
--- a/include/asm-x86/e820_64.h
+++ b/include/asm-x86/e820_64.h
@@ -11,6 +11,8 @@
11#ifndef __E820_HEADER 11#ifndef __E820_HEADER
12#define __E820_HEADER 12#define __E820_HEADER
13 13
14#include <linux/ioport.h>
15
14#ifndef __ASSEMBLY__ 16#ifndef __ASSEMBLY__
15extern unsigned long find_e820_area(unsigned long start, unsigned long end, 17extern unsigned long find_e820_area(unsigned long start, unsigned long end,
16 unsigned size); 18 unsigned size);
@@ -19,11 +21,15 @@ extern void add_memory_region(unsigned long start, unsigned long size,
19extern void setup_memory_region(void); 21extern void setup_memory_region(void);
20extern void contig_e820_setup(void); 22extern void contig_e820_setup(void);
21extern unsigned long e820_end_of_ram(void); 23extern unsigned long e820_end_of_ram(void);
22extern void e820_reserve_resources(void); 24extern void e820_reserve_resources(struct resource *code_resource,
25 struct resource *data_resource, struct resource *bss_resource);
23extern void e820_mark_nosave_regions(void); 26extern void e820_mark_nosave_regions(void);
24extern void e820_print_map(char *who);
25extern int e820_any_mapped(unsigned long start, unsigned long end, unsigned type); 27extern int e820_any_mapped(unsigned long start, unsigned long end, unsigned type);
26extern int e820_all_mapped(unsigned long start, unsigned long end, unsigned type); 28extern int e820_all_mapped(unsigned long start, unsigned long end, unsigned type);
29extern int e820_any_non_reserved(unsigned long start, unsigned long end);
30extern int is_memory_any_valid(unsigned long start, unsigned long end);
31extern int e820_all_non_reserved(unsigned long start, unsigned long end);
32extern int is_memory_all_valid(unsigned long start, unsigned long end);
27extern unsigned long e820_hole_size(unsigned long start, unsigned long end); 33extern unsigned long e820_hole_size(unsigned long start, unsigned long end);
28 34
29extern void e820_setup_gap(void); 35extern void e820_setup_gap(void);
@@ -33,9 +39,11 @@ extern void e820_register_active_regions(int nid,
33extern void finish_e820_parsing(void); 39extern void finish_e820_parsing(void);
34 40
35extern struct e820map e820; 41extern struct e820map e820;
42extern void update_e820(void);
43
44extern void reserve_early(unsigned long start, unsigned long end);
45extern void early_res_to_bootmem(void);
36 46
37extern unsigned ebda_addr, ebda_size;
38extern unsigned long nodemap_addr, nodemap_size;
39#endif/*!__ASSEMBLY__*/ 47#endif/*!__ASSEMBLY__*/
40 48
41#endif/*__E820_HEADER*/ 49#endif/*__E820_HEADER*/
diff --git a/include/asm-x86/efi.h b/include/asm-x86/efi.h
new file mode 100644
index 000000000000..9c68a1f098d8
--- /dev/null
+++ b/include/asm-x86/efi.h
@@ -0,0 +1,97 @@
1#ifndef _ASM_X86_EFI_H
2#define _ASM_X86_EFI_H
3
4#ifdef CONFIG_X86_32
5
6extern unsigned long asmlinkage efi_call_phys(void *, ...);
7
8#define efi_call_phys0(f) efi_call_phys(f)
9#define efi_call_phys1(f, a1) efi_call_phys(f, a1)
10#define efi_call_phys2(f, a1, a2) efi_call_phys(f, a1, a2)
11#define efi_call_phys3(f, a1, a2, a3) efi_call_phys(f, a1, a2, a3)
12#define efi_call_phys4(f, a1, a2, a3, a4) \
13 efi_call_phys(f, a1, a2, a3, a4)
14#define efi_call_phys5(f, a1, a2, a3, a4, a5) \
15 efi_call_phys(f, a1, a2, a3, a4, a5)
16#define efi_call_phys6(f, a1, a2, a3, a4, a5, a6) \
17 efi_call_phys(f, a1, a2, a3, a4, a5, a6)
18/*
19 * Wrap all the virtual calls in a way that forces the parameters on the stack.
20 */
21
22#define efi_call_virt(f, args...) \
23 ((efi_##f##_t __attribute__((regparm(0)))*)efi.systab->runtime->f)(args)
24
25#define efi_call_virt0(f) efi_call_virt(f)
26#define efi_call_virt1(f, a1) efi_call_virt(f, a1)
27#define efi_call_virt2(f, a1, a2) efi_call_virt(f, a1, a2)
28#define efi_call_virt3(f, a1, a2, a3) efi_call_virt(f, a1, a2, a3)
29#define efi_call_virt4(f, a1, a2, a3, a4) \
30 efi_call_virt(f, a1, a2, a3, a4)
31#define efi_call_virt5(f, a1, a2, a3, a4, a5) \
32 efi_call_virt(f, a1, a2, a3, a4, a5)
33#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \
34 efi_call_virt(f, a1, a2, a3, a4, a5, a6)
35
36#define efi_ioremap(addr, size) ioremap(addr, size)
37
38#else /* !CONFIG_X86_32 */
39
40#define MAX_EFI_IO_PAGES 100
41
42extern u64 efi_call0(void *fp);
43extern u64 efi_call1(void *fp, u64 arg1);
44extern u64 efi_call2(void *fp, u64 arg1, u64 arg2);
45extern u64 efi_call3(void *fp, u64 arg1, u64 arg2, u64 arg3);
46extern u64 efi_call4(void *fp, u64 arg1, u64 arg2, u64 arg3, u64 arg4);
47extern u64 efi_call5(void *fp, u64 arg1, u64 arg2, u64 arg3,
48 u64 arg4, u64 arg5);
49extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3,
50 u64 arg4, u64 arg5, u64 arg6);
51
52#define efi_call_phys0(f) \
53 efi_call0((void *)(f))
54#define efi_call_phys1(f, a1) \
55 efi_call1((void *)(f), (u64)(a1))
56#define efi_call_phys2(f, a1, a2) \
57 efi_call2((void *)(f), (u64)(a1), (u64)(a2))
58#define efi_call_phys3(f, a1, a2, a3) \
59 efi_call3((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3))
60#define efi_call_phys4(f, a1, a2, a3, a4) \
61 efi_call4((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3), \
62 (u64)(a4))
63#define efi_call_phys5(f, a1, a2, a3, a4, a5) \
64 efi_call5((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3), \
65 (u64)(a4), (u64)(a5))
66#define efi_call_phys6(f, a1, a2, a3, a4, a5, a6) \
67 efi_call6((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3), \
68 (u64)(a4), (u64)(a5), (u64)(a6))
69
70#define efi_call_virt0(f) \
71 efi_call0((void *)(efi.systab->runtime->f))
72#define efi_call_virt1(f, a1) \
73 efi_call1((void *)(efi.systab->runtime->f), (u64)(a1))
74#define efi_call_virt2(f, a1, a2) \
75 efi_call2((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2))
76#define efi_call_virt3(f, a1, a2, a3) \
77 efi_call3((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
78 (u64)(a3))
79#define efi_call_virt4(f, a1, a2, a3, a4) \
80 efi_call4((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
81 (u64)(a3), (u64)(a4))
82#define efi_call_virt5(f, a1, a2, a3, a4, a5) \
83 efi_call5((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
84 (u64)(a3), (u64)(a4), (u64)(a5))
85#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \
86 efi_call6((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
87 (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6))
88
89extern void *efi_ioremap(unsigned long offset, unsigned long size);
90
91#endif /* CONFIG_X86_32 */
92
93extern void efi_reserve_bootmem(void);
94extern void efi_call_phys_prelog(void);
95extern void efi_call_phys_epilog(void);
96
97#endif
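
Both halves of the header give callers the same surface: efi_call_virtN(f, ...) resolves f through efi.systab->runtime and either forces stack-based (regparm(0)) parameter passing on 32-bit or funnels every argument through the efi_callN() thunks as u64 on 64-bit. As an illustration, a two-argument runtime-service call could look like the sketch below; get_time and its argument types are the usual linux/efi.h definitions and appear here only as an example:

#include <linux/efi.h>
#include <asm/efi.h>

/* Sketch only: a two-argument EFI runtime call through the wrappers above. */
static efi_status_t efi_get_time_sketch(efi_time_t *tm, efi_time_cap_t *tc)
{
	/* 32-bit: a regparm(0) call through efi.systab->runtime->get_time
	 * 64-bit: efi_call2((void *)(efi.systab->runtime->get_time),
	 *                   (u64)tm, (u64)tc)
	 */
	return efi_call_virt2(get_time, tm, tc);
}
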
diff --git a/include/asm-x86/elf.h b/include/asm-x86/elf.h
index ec42a4d2e83b..d9c94e707289 100644
--- a/include/asm-x86/elf.h
+++ b/include/asm-x86/elf.h
@@ -73,18 +73,23 @@ typedef struct user_fxsr_struct elf_fpxregset_t;
73#endif 73#endif
74 74
75#ifdef __KERNEL__ 75#ifdef __KERNEL__
76#include <asm/vdso.h>
76 77
77#ifdef CONFIG_X86_32 78extern unsigned int vdso_enabled;
78#include <asm/processor.h>
79#include <asm/system.h> /* for savesegment */
80#include <asm/desc.h>
81 79
82/* 80/*
83 * This is used to ensure we don't load something for the wrong architecture. 81 * This is used to ensure we don't load something for the wrong architecture.
84 */ 82 */
85#define elf_check_arch(x) \ 83#define elf_check_arch_ia32(x) \
86 (((x)->e_machine == EM_386) || ((x)->e_machine == EM_486)) 84 (((x)->e_machine == EM_386) || ((x)->e_machine == EM_486))
87 85
86#ifdef CONFIG_X86_32
87#include <asm/processor.h>
88#include <asm/system.h> /* for savesegment */
89#include <asm/desc.h>
90
91#define elf_check_arch(x) elf_check_arch_ia32(x)
92
88/* SVR4/i386 ABI (pages 3-31, 3-32) says that when the program starts %edx 93/* SVR4/i386 ABI (pages 3-31, 3-32) says that when the program starts %edx
89 contains a pointer to a function which might be registered using `atexit'. 94 contains a pointer to a function which might be registered using `atexit'.
90 This provides a mean for the dynamic linker to call DT_FINI functions for 95 This provides a mean for the dynamic linker to call DT_FINI functions for
@@ -96,36 +101,38 @@ typedef struct user_fxsr_struct elf_fpxregset_t;
96 just to make things more deterministic. 101 just to make things more deterministic.
97 */ 102 */
98#define ELF_PLAT_INIT(_r, load_addr) do { \ 103#define ELF_PLAT_INIT(_r, load_addr) do { \
99 _r->ebx = 0; _r->ecx = 0; _r->edx = 0; \ 104 _r->bx = 0; _r->cx = 0; _r->dx = 0; \
100 _r->esi = 0; _r->edi = 0; _r->ebp = 0; \ 105 _r->si = 0; _r->di = 0; _r->bp = 0; \
101 _r->eax = 0; \ 106 _r->ax = 0; \
102} while (0) 107} while (0)
103 108
104/* regs is struct pt_regs, pr_reg is elf_gregset_t (which is 109/*
105 now struct_user_regs, they are different) */ 110 * regs is struct pt_regs, pr_reg is elf_gregset_t (which is
106 111 * now struct_user_regs, they are different)
107#define ELF_CORE_COPY_REGS(pr_reg, regs) \ 112 */
108 pr_reg[0] = regs->ebx; \ 113
109 pr_reg[1] = regs->ecx; \ 114#define ELF_CORE_COPY_REGS(pr_reg, regs) do { \
110 pr_reg[2] = regs->edx; \ 115 pr_reg[0] = regs->bx; \
111 pr_reg[3] = regs->esi; \ 116 pr_reg[1] = regs->cx; \
112 pr_reg[4] = regs->edi; \ 117 pr_reg[2] = regs->dx; \
113 pr_reg[5] = regs->ebp; \ 118 pr_reg[3] = regs->si; \
114 pr_reg[6] = regs->eax; \ 119 pr_reg[4] = regs->di; \
115 pr_reg[7] = regs->xds & 0xffff; \ 120 pr_reg[5] = regs->bp; \
116 pr_reg[8] = regs->xes & 0xffff; \ 121 pr_reg[6] = regs->ax; \
117 pr_reg[9] = regs->xfs & 0xffff; \ 122 pr_reg[7] = regs->ds & 0xffff; \
118 savesegment(gs,pr_reg[10]); \ 123 pr_reg[8] = regs->es & 0xffff; \
119 pr_reg[11] = regs->orig_eax; \ 124 pr_reg[9] = regs->fs & 0xffff; \
120 pr_reg[12] = regs->eip; \ 125 savesegment(gs, pr_reg[10]); \
121 pr_reg[13] = regs->xcs & 0xffff; \ 126 pr_reg[11] = regs->orig_ax; \
122 pr_reg[14] = regs->eflags; \ 127 pr_reg[12] = regs->ip; \
123 pr_reg[15] = regs->esp; \ 128 pr_reg[13] = regs->cs & 0xffff; \
124 pr_reg[16] = regs->xss & 0xffff; 129 pr_reg[14] = regs->flags; \
130 pr_reg[15] = regs->sp; \
131 pr_reg[16] = regs->ss & 0xffff; \
132} while (0);
125 133
126#define ELF_PLATFORM (utsname()->machine) 134#define ELF_PLATFORM (utsname()->machine)
127#define set_personality_64bit() do { } while (0) 135#define set_personality_64bit() do { } while (0)
128extern unsigned int vdso_enabled;
129 136
130#else /* CONFIG_X86_32 */ 137#else /* CONFIG_X86_32 */
131 138
@@ -137,28 +144,57 @@ extern unsigned int vdso_enabled;
137#define elf_check_arch(x) \ 144#define elf_check_arch(x) \
138 ((x)->e_machine == EM_X86_64) 145 ((x)->e_machine == EM_X86_64)
139 146
147#define compat_elf_check_arch(x) elf_check_arch_ia32(x)
148
149static inline void start_ia32_thread(struct pt_regs *regs, u32 ip, u32 sp)
150{
151 asm volatile("movl %0,%%fs" :: "r" (0));
152 asm volatile("movl %0,%%es; movl %0,%%ds" : : "r" (__USER32_DS));
153 load_gs_index(0);
154 regs->ip = ip;
155 regs->sp = sp;
156 regs->flags = X86_EFLAGS_IF;
157 regs->cs = __USER32_CS;
158 regs->ss = __USER32_DS;
159}
160
161static inline void elf_common_init(struct thread_struct *t,
162 struct pt_regs *regs, const u16 ds)
163{
164 regs->ax = regs->bx = regs->cx = regs->dx = 0;
165 regs->si = regs->di = regs->bp = 0;
166 regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0;
167 regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;
168 t->fs = t->gs = 0;
169 t->fsindex = t->gsindex = 0;
170 t->ds = t->es = ds;
171}
172
140#define ELF_PLAT_INIT(_r, load_addr) do { \ 173#define ELF_PLAT_INIT(_r, load_addr) do { \
141 struct task_struct *cur = current; \ 174 elf_common_init(&current->thread, _r, 0); \
142 (_r)->rbx = 0; (_r)->rcx = 0; (_r)->rdx = 0; \
143 (_r)->rsi = 0; (_r)->rdi = 0; (_r)->rbp = 0; \
144 (_r)->rax = 0; \
145 (_r)->r8 = 0; \
146 (_r)->r9 = 0; \
147 (_r)->r10 = 0; \
148 (_r)->r11 = 0; \
149 (_r)->r12 = 0; \
150 (_r)->r13 = 0; \
151 (_r)->r14 = 0; \
152 (_r)->r15 = 0; \
153 cur->thread.fs = 0; cur->thread.gs = 0; \
154 cur->thread.fsindex = 0; cur->thread.gsindex = 0; \
155 cur->thread.ds = 0; cur->thread.es = 0; \
156 clear_thread_flag(TIF_IA32); \ 175 clear_thread_flag(TIF_IA32); \
157} while (0) 176} while (0)
158 177
159/* regs is struct pt_regs, pr_reg is elf_gregset_t (which is 178#define COMPAT_ELF_PLAT_INIT(regs, load_addr) \
160 now struct_user_regs, they are different). Assumes current is the process 179 elf_common_init(&current->thread, regs, __USER_DS)
161 getting dumped. */ 180#define compat_start_thread(regs, ip, sp) do { \
181 start_ia32_thread(regs, ip, sp); \
182 set_fs(USER_DS); \
183 } while (0)
184#define COMPAT_SET_PERSONALITY(ex, ibcs2) do { \
185 if (test_thread_flag(TIF_IA32)) \
186 clear_thread_flag(TIF_ABI_PENDING); \
187 else \
188 set_thread_flag(TIF_ABI_PENDING); \
189 current->personality |= force_personality32; \
190 } while (0)
191#define COMPAT_ELF_PLATFORM ("i686")
192
193/*
194 * regs is struct pt_regs, pr_reg is elf_gregset_t (which is
195 * now struct_user_regs, they are different). Assumes current is the process
196 * getting dumped.
197 */
162 198
163#define ELF_CORE_COPY_REGS(pr_reg, regs) do { \ 199#define ELF_CORE_COPY_REGS(pr_reg, regs) do { \
164 unsigned v; \ 200 unsigned v; \
@@ -166,22 +202,22 @@ extern unsigned int vdso_enabled;
166 (pr_reg)[1] = (regs)->r14; \ 202 (pr_reg)[1] = (regs)->r14; \
167 (pr_reg)[2] = (regs)->r13; \ 203 (pr_reg)[2] = (regs)->r13; \
168 (pr_reg)[3] = (regs)->r12; \ 204 (pr_reg)[3] = (regs)->r12; \
169 (pr_reg)[4] = (regs)->rbp; \ 205 (pr_reg)[4] = (regs)->bp; \
170 (pr_reg)[5] = (regs)->rbx; \ 206 (pr_reg)[5] = (regs)->bx; \
171 (pr_reg)[6] = (regs)->r11; \ 207 (pr_reg)[6] = (regs)->r11; \
172 (pr_reg)[7] = (regs)->r10; \ 208 (pr_reg)[7] = (regs)->r10; \
173 (pr_reg)[8] = (regs)->r9; \ 209 (pr_reg)[8] = (regs)->r9; \
174 (pr_reg)[9] = (regs)->r8; \ 210 (pr_reg)[9] = (regs)->r8; \
175 (pr_reg)[10] = (regs)->rax; \ 211 (pr_reg)[10] = (regs)->ax; \
176 (pr_reg)[11] = (regs)->rcx; \ 212 (pr_reg)[11] = (regs)->cx; \
177 (pr_reg)[12] = (regs)->rdx; \ 213 (pr_reg)[12] = (regs)->dx; \
178 (pr_reg)[13] = (regs)->rsi; \ 214 (pr_reg)[13] = (regs)->si; \
179 (pr_reg)[14] = (regs)->rdi; \ 215 (pr_reg)[14] = (regs)->di; \
180 (pr_reg)[15] = (regs)->orig_rax; \ 216 (pr_reg)[15] = (regs)->orig_ax; \
181 (pr_reg)[16] = (regs)->rip; \ 217 (pr_reg)[16] = (regs)->ip; \
182 (pr_reg)[17] = (regs)->cs; \ 218 (pr_reg)[17] = (regs)->cs; \
183 (pr_reg)[18] = (regs)->eflags; \ 219 (pr_reg)[18] = (regs)->flags; \
184 (pr_reg)[19] = (regs)->rsp; \ 220 (pr_reg)[19] = (regs)->sp; \
185 (pr_reg)[20] = (regs)->ss; \ 221 (pr_reg)[20] = (regs)->ss; \
186 (pr_reg)[21] = current->thread.fs; \ 222 (pr_reg)[21] = current->thread.fs; \
187 (pr_reg)[22] = current->thread.gs; \ 223 (pr_reg)[22] = current->thread.gs; \
@@ -189,15 +225,17 @@ extern unsigned int vdso_enabled;
189 asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \ 225 asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \
190 asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \ 226 asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \
191 asm("movl %%gs,%0" : "=r" (v)); (pr_reg)[26] = v; \ 227 asm("movl %%gs,%0" : "=r" (v)); (pr_reg)[26] = v; \
192} while(0); 228} while (0);
193 229
194/* I'm not sure if we can use '-' here */ 230/* I'm not sure if we can use '-' here */
195#define ELF_PLATFORM ("x86_64") 231#define ELF_PLATFORM ("x86_64")
196extern void set_personality_64bit(void); 232extern void set_personality_64bit(void);
197extern int vdso_enabled; 233extern unsigned int sysctl_vsyscall32;
234extern int force_personality32;
198 235
199#endif /* !CONFIG_X86_32 */ 236#endif /* !CONFIG_X86_32 */
200 237
238#define CORE_DUMP_USE_REGSET
201#define USE_ELF_CORE_DUMP 239#define USE_ELF_CORE_DUMP
202#define ELF_EXEC_PAGESIZE 4096 240#define ELF_EXEC_PAGESIZE 4096
203 241
@@ -232,43 +270,24 @@ extern int vdso_enabled;
232 270
233struct task_struct; 271struct task_struct;
234 272
235extern int dump_task_regs (struct task_struct *, elf_gregset_t *); 273#define ARCH_DLINFO_IA32(vdso_enabled) \
236extern int dump_task_fpu (struct task_struct *, elf_fpregset_t *); 274do if (vdso_enabled) { \
237 275 NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \
238#define ELF_CORE_COPY_TASK_REGS(tsk, elf_regs) dump_task_regs(tsk, elf_regs) 276 NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE); \
239#define ELF_CORE_COPY_FPREGS(tsk, elf_fpregs) dump_task_fpu(tsk, elf_fpregs) 277} while (0)
240 278
241#ifdef CONFIG_X86_32 279#ifdef CONFIG_X86_32
242extern int dump_task_extended_fpu (struct task_struct *,
243 struct user_fxsr_struct *);
244#define ELF_CORE_COPY_XFPREGS(tsk, elf_xfpregs) \
245 dump_task_extended_fpu(tsk, elf_xfpregs)
246#define ELF_CORE_XFPREG_TYPE NT_PRXFPREG
247 280
248#define VDSO_HIGH_BASE (__fix_to_virt(FIX_VDSO)) 281#define VDSO_HIGH_BASE (__fix_to_virt(FIX_VDSO))
249#define VDSO_CURRENT_BASE ((unsigned long)current->mm->context.vdso)
250#define VDSO_PRELINK 0
251
252#define VDSO_SYM(x) \
253 (VDSO_CURRENT_BASE + (unsigned long)(x) - VDSO_PRELINK)
254
255#define VDSO_HIGH_EHDR ((const struct elfhdr *) VDSO_HIGH_BASE)
256#define VDSO_EHDR ((const struct elfhdr *) VDSO_CURRENT_BASE)
257 282
258extern void __kernel_vsyscall; 283#define ARCH_DLINFO ARCH_DLINFO_IA32(vdso_enabled)
259
260#define VDSO_ENTRY VDSO_SYM(&__kernel_vsyscall)
261 284
262/* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ 285/* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */
263 286
264#define ARCH_DLINFO \
265do if (vdso_enabled) { \
266 NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \
267 NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE); \
268} while (0)
269
270#else /* CONFIG_X86_32 */ 287#else /* CONFIG_X86_32 */
271 288
289#define VDSO_HIGH_BASE 0xffffe000U /* CONFIG_COMPAT_VDSO address */
290
272/* 1GB for 64bit, 8MB for 32bit */ 291/* 1GB for 64bit, 8MB for 32bit */
273#define STACK_RND_MASK (test_thread_flag(TIF_IA32) ? 0x7ff : 0x3fffff) 292#define STACK_RND_MASK (test_thread_flag(TIF_IA32) ? 0x7ff : 0x3fffff)
274 293
@@ -277,14 +296,31 @@ do if (vdso_enabled) { \
277 NEW_AUX_ENT(AT_SYSINFO_EHDR,(unsigned long)current->mm->context.vdso);\ 296 NEW_AUX_ENT(AT_SYSINFO_EHDR,(unsigned long)current->mm->context.vdso);\
278} while (0) 297} while (0)
279 298
299#define AT_SYSINFO 32
300
301#define COMPAT_ARCH_DLINFO ARCH_DLINFO_IA32(sysctl_vsyscall32)
302
303#define COMPAT_ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000)
304
280#endif /* !CONFIG_X86_32 */ 305#endif /* !CONFIG_X86_32 */
281 306
307#define VDSO_CURRENT_BASE ((unsigned long)current->mm->context.vdso)
308
309#define VDSO_ENTRY \
310 ((unsigned long) VDSO32_SYMBOL(VDSO_CURRENT_BASE, vsyscall))
311
282struct linux_binprm; 312struct linux_binprm;
283 313
284#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 314#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
285extern int arch_setup_additional_pages(struct linux_binprm *bprm, 315extern int arch_setup_additional_pages(struct linux_binprm *bprm,
286 int executable_stack); 316 int executable_stack);
287 317
318extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
319#define compat_arch_setup_additional_pages syscall32_setup_pages
320
321extern unsigned long arch_randomize_brk(struct mm_struct *mm);
322#define arch_randomize_brk arch_randomize_brk
323
288#endif /* __KERNEL__ */ 324#endif /* __KERNEL__ */
289 325
290#endif 326#endif
diff --git a/include/asm-x86/emergency-restart.h b/include/asm-x86/emergency-restart.h
index 680c39563345..8e6aef19f8f0 100644
--- a/include/asm-x86/emergency-restart.h
+++ b/include/asm-x86/emergency-restart.h
@@ -1,6 +1,18 @@
1#ifndef _ASM_EMERGENCY_RESTART_H 1#ifndef _ASM_EMERGENCY_RESTART_H
2#define _ASM_EMERGENCY_RESTART_H 2#define _ASM_EMERGENCY_RESTART_H
3 3
4enum reboot_type {
5 BOOT_TRIPLE = 't',
6 BOOT_KBD = 'k',
7#ifdef CONFIG_X86_32
8 BOOT_BIOS = 'b',
9#endif
10 BOOT_ACPI = 'a',
11 BOOT_EFI = 'e'
12};
13
14extern enum reboot_type reboot_type;
15
4extern void machine_emergency_restart(void); 16extern void machine_emergency_restart(void);
5 17
6#endif /* _ASM_EMERGENCY_RESTART_H */ 18#endif /* _ASM_EMERGENCY_RESTART_H */
diff --git a/include/asm-x86/fixmap_32.h b/include/asm-x86/fixmap_32.h
index 249e753ac805..a7404d50686b 100644
--- a/include/asm-x86/fixmap_32.h
+++ b/include/asm-x86/fixmap_32.h
@@ -65,7 +65,7 @@ enum fixed_addresses {
65#endif 65#endif
66#ifdef CONFIG_X86_VISWS_APIC 66#ifdef CONFIG_X86_VISWS_APIC
67 FIX_CO_CPU, /* Cobalt timer */ 67 FIX_CO_CPU, /* Cobalt timer */
68 FIX_CO_APIC, /* Cobalt APIC Redirection Table */ 68 FIX_CO_APIC, /* Cobalt APIC Redirection Table */
69 FIX_LI_PCIA, /* Lithium PCI Bridge A */ 69 FIX_LI_PCIA, /* Lithium PCI Bridge A */
70 FIX_LI_PCIB, /* Lithium PCI Bridge B */ 70 FIX_LI_PCIB, /* Lithium PCI Bridge B */
71#endif 71#endif
@@ -74,7 +74,7 @@ enum fixed_addresses {
74#endif 74#endif
75#ifdef CONFIG_X86_CYCLONE_TIMER 75#ifdef CONFIG_X86_CYCLONE_TIMER
76 FIX_CYCLONE_TIMER, /*cyclone timer register*/ 76 FIX_CYCLONE_TIMER, /*cyclone timer register*/
77#endif 77#endif
78#ifdef CONFIG_HIGHMEM 78#ifdef CONFIG_HIGHMEM
79 FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ 79 FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
80 FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, 80 FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
@@ -90,11 +90,23 @@ enum fixed_addresses {
90 FIX_PARAVIRT_BOOTMAP, 90 FIX_PARAVIRT_BOOTMAP,
91#endif 91#endif
92 __end_of_permanent_fixed_addresses, 92 __end_of_permanent_fixed_addresses,
93 /* temporary boot-time mappings, used before ioremap() is functional */ 93 /*
94#define NR_FIX_BTMAPS 16 94 * 256 temporary boot-time mappings, used by early_ioremap(),
95 FIX_BTMAP_END = __end_of_permanent_fixed_addresses, 95 * before ioremap() is functional.
96 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1, 96 *
97 * We round it up to the next 512 pages boundary so that we
98 * can have a single pgd entry and a single pte table:
99 */
100#define NR_FIX_BTMAPS 64
101#define FIX_BTMAPS_NESTING 4
102 FIX_BTMAP_END =
103 __end_of_permanent_fixed_addresses + 512 -
104 (__end_of_permanent_fixed_addresses & 511),
105 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
97 FIX_WP_TEST, 106 FIX_WP_TEST,
107#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
108 FIX_OHCI1394_BASE,
109#endif
98 __end_of_fixed_addresses 110 __end_of_fixed_addresses
99}; 111};
100 112
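
The expression now used for FIX_BTMAP_END rounds the running fixmap index up past the next multiple of 512, so that all NR_FIX_BTMAPS * FIX_BTMAPS_NESTING (256) boot-time slots land inside a single page table, as the new comment says. A quick check with a made-up value standing in for __end_of_permanent_fixed_addresses:

#include <stdio.h>

/* Illustration of the 512-slot rounding; 425 is an arbitrary
 * stand-in for __end_of_permanent_fixed_addresses.
 */
int main(void)
{
	unsigned int end_perm = 425;				/* hypothetical */
	unsigned int btmap_end = end_perm + 512 - (end_perm & 511);
	unsigned int btmap_begin = btmap_end + 64 * 4 - 1;

	/* prints 512 and 767: all 256 slots share one 512-entry table */
	printf("FIX_BTMAP_END=%u FIX_BTMAP_BEGIN=%u\n",
	       btmap_end, btmap_begin);
	return 0;
}
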
diff --git a/include/asm-x86/fixmap_64.h b/include/asm-x86/fixmap_64.h
index cdfbe4a6ae6f..70ddb21e6458 100644
--- a/include/asm-x86/fixmap_64.h
+++ b/include/asm-x86/fixmap_64.h
@@ -15,6 +15,7 @@
15#include <asm/apicdef.h> 15#include <asm/apicdef.h>
16#include <asm/page.h> 16#include <asm/page.h>
17#include <asm/vsyscall.h> 17#include <asm/vsyscall.h>
18#include <asm/efi.h>
18 19
19/* 20/*
20 * Here we define all the compile-time 'special' virtual 21 * Here we define all the compile-time 'special' virtual
@@ -41,6 +42,11 @@ enum fixed_addresses {
41 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ 42 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
42 FIX_IO_APIC_BASE_0, 43 FIX_IO_APIC_BASE_0,
43 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1, 44 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
45 FIX_EFI_IO_MAP_LAST_PAGE,
46 FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE+MAX_EFI_IO_PAGES-1,
47#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
48 FIX_OHCI1394_BASE,
49#endif
44 __end_of_fixed_addresses 50 __end_of_fixed_addresses
45}; 51};
46 52
diff --git a/include/asm-x86/fpu32.h b/include/asm-x86/fpu32.h
deleted file mode 100644
index 4153db5c0c31..000000000000
--- a/include/asm-x86/fpu32.h
+++ /dev/null
@@ -1,10 +0,0 @@
1#ifndef _FPU32_H
2#define _FPU32_H 1
3
4struct _fpstate_ia32;
5
6int restore_i387_ia32(struct task_struct *tsk, struct _fpstate_ia32 __user *buf, int fsave);
7int save_i387_ia32(struct task_struct *tsk, struct _fpstate_ia32 __user *buf,
8 struct pt_regs *regs, int fsave);
9
10#endif
diff --git a/include/asm-x86/futex.h b/include/asm-x86/futex.h
index 1f4610e0c613..62828d63f1b1 100644
--- a/include/asm-x86/futex.h
+++ b/include/asm-x86/futex.h
@@ -1,5 +1,135 @@
1#ifdef CONFIG_X86_32 1#ifndef _ASM_X86_FUTEX_H
2# include "futex_32.h" 2#define _ASM_X86_FUTEX_H
3#else 3
4# include "futex_64.h" 4#ifdef __KERNEL__
5
6#include <linux/futex.h>
7
8#include <asm/asm.h>
9#include <asm/errno.h>
10#include <asm/processor.h>
11#include <asm/system.h>
12#include <asm/uaccess.h>
13
14#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \
15 __asm__ __volatile( \
16"1: " insn "\n" \
17"2: .section .fixup,\"ax\"\n \
183: mov %3, %1\n \
19 jmp 2b\n \
20 .previous\n \
21 .section __ex_table,\"a\"\n \
22 .align 8\n" \
23 _ASM_PTR "1b,3b\n \
24 .previous" \
25 : "=r" (oldval), "=r" (ret), "+m" (*uaddr) \
26 : "i" (-EFAULT), "0" (oparg), "1" (0))
27
28#define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg) \
29 __asm__ __volatile( \
30"1: movl %2, %0\n \
31 movl %0, %3\n" \
32 insn "\n" \
33"2: " LOCK_PREFIX "cmpxchgl %3, %2\n \
34 jnz 1b\n \
353: .section .fixup,\"ax\"\n \
364: mov %5, %1\n \
37 jmp 3b\n \
38 .previous\n \
39 .section __ex_table,\"a\"\n \
40 .align 8\n" \
41 _ASM_PTR "1b,4b,2b,4b\n \
42 .previous" \
43 : "=&a" (oldval), "=&r" (ret), "+m" (*uaddr), \
44 "=&r" (tem) \
45 : "r" (oparg), "i" (-EFAULT), "1" (0))
46
47static inline int
48futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
49{
50 int op = (encoded_op >> 28) & 7;
51 int cmp = (encoded_op >> 24) & 15;
52 int oparg = (encoded_op << 8) >> 20;
53 int cmparg = (encoded_op << 20) >> 20;
54 int oldval = 0, ret, tem;
55
56 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
57 oparg = 1 << oparg;
58
59 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
60 return -EFAULT;
61
62#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP)
63 /* Real i386 machines can only support FUTEX_OP_SET */
64 if (op != FUTEX_OP_SET && boot_cpu_data.x86 == 3)
65 return -ENOSYS;
66#endif
67
68 pagefault_disable();
69
70 switch (op) {
71 case FUTEX_OP_SET:
72 __futex_atomic_op1("xchgl %0, %2", ret, oldval, uaddr, oparg);
73 break;
74 case FUTEX_OP_ADD:
75 __futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret, oldval,
76 uaddr, oparg);
77 break;
78 case FUTEX_OP_OR:
79 __futex_atomic_op2("orl %4, %3", ret, oldval, uaddr, oparg);
80 break;
81 case FUTEX_OP_ANDN:
82 __futex_atomic_op2("andl %4, %3", ret, oldval, uaddr, ~oparg);
83 break;
84 case FUTEX_OP_XOR:
85 __futex_atomic_op2("xorl %4, %3", ret, oldval, uaddr, oparg);
86 break;
87 default:
88 ret = -ENOSYS;
89 }
90
91 pagefault_enable();
92
93 if (!ret) {
94 switch (cmp) {
95 case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
96 case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
97 case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
98 case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
99 case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
100 case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
101 default: ret = -ENOSYS;
102 }
103 }
104 return ret;
105}
106
107static inline int
108futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
109{
110 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
111 return -EFAULT;
112
113 __asm__ __volatile__(
114 "1: " LOCK_PREFIX "cmpxchgl %3, %1 \n"
115
116 "2: .section .fixup, \"ax\" \n"
117 "3: mov %2, %0 \n"
118 " jmp 2b \n"
119 " .previous \n"
120
121 " .section __ex_table, \"a\" \n"
122 " .align 8 \n"
123 _ASM_PTR " 1b,3b \n"
124 " .previous \n"
125
126 : "=a" (oldval), "+m" (*uaddr)
127 : "i" (-EFAULT), "r" (newval), "0" (oldval)
128 : "memory"
129 );
130
131 return oldval;
132}
133
134#endif
5#endif 135#endif
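
The first few lines of futex_atomic_op_inuser() unpack the operation, the comparison, and their two 12-bit sign-extended arguments from a single word. The shifts are easiest to see with a concrete value; the constant below is arbitrary and only exercises the decode:

#include <stdio.h>

/* Illustration of the encoded_op unpacking done by
 * futex_atomic_op_inuser(); the input value is made up.
 */
int main(void)
{
	int encoded_op = 0x15001000;		/* hypothetical operand */
	int op = (encoded_op >> 28) & 7;
	int cmp = (encoded_op >> 24) & 15;
	int oparg = (encoded_op << 8) >> 20;	/* sign-extended 12-bit field */
	int cmparg = (encoded_op << 20) >> 20;	/* sign-extended 12-bit field */

	/* prints: op=1 cmp=5 oparg=1 cmparg=0 */
	printf("op=%d cmp=%d oparg=%d cmparg=%d\n", op, cmp, oparg, cmparg);
	return 0;
}
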
diff --git a/include/asm-x86/futex_32.h b/include/asm-x86/futex_32.h
deleted file mode 100644
index 438ef0ec7101..000000000000
--- a/include/asm-x86/futex_32.h
+++ /dev/null
@@ -1,135 +0,0 @@
1#ifndef _ASM_FUTEX_H
2#define _ASM_FUTEX_H
3
4#ifdef __KERNEL__
5
6#include <linux/futex.h>
7#include <asm/errno.h>
8#include <asm/system.h>
9#include <asm/processor.h>
10#include <asm/uaccess.h>
11
12#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \
13 __asm__ __volatile ( \
14"1: " insn "\n" \
15"2: .section .fixup,\"ax\"\n\
163: mov %3, %1\n\
17 jmp 2b\n\
18 .previous\n\
19 .section __ex_table,\"a\"\n\
20 .align 8\n\
21 .long 1b,3b\n\
22 .previous" \
23 : "=r" (oldval), "=r" (ret), "+m" (*uaddr) \
24 : "i" (-EFAULT), "0" (oparg), "1" (0))
25
26#define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg) \
27 __asm__ __volatile ( \
28"1: movl %2, %0\n\
29 movl %0, %3\n" \
30 insn "\n" \
31"2: " LOCK_PREFIX "cmpxchgl %3, %2\n\
32 jnz 1b\n\
333: .section .fixup,\"ax\"\n\
344: mov %5, %1\n\
35 jmp 3b\n\
36 .previous\n\
37 .section __ex_table,\"a\"\n\
38 .align 8\n\
39 .long 1b,4b,2b,4b\n\
40 .previous" \
41 : "=&a" (oldval), "=&r" (ret), "+m" (*uaddr), \
42 "=&r" (tem) \
43 : "r" (oparg), "i" (-EFAULT), "1" (0))
44
45static inline int
46futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
47{
48 int op = (encoded_op >> 28) & 7;
49 int cmp = (encoded_op >> 24) & 15;
50 int oparg = (encoded_op << 8) >> 20;
51 int cmparg = (encoded_op << 20) >> 20;
52 int oldval = 0, ret, tem;
53 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
54 oparg = 1 << oparg;
55
56 if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
57 return -EFAULT;
58
59 pagefault_disable();
60
61 if (op == FUTEX_OP_SET)
62 __futex_atomic_op1("xchgl %0, %2", ret, oldval, uaddr, oparg);
63 else {
64#ifndef CONFIG_X86_BSWAP
65 if (boot_cpu_data.x86 == 3)
66 ret = -ENOSYS;
67 else
68#endif
69 switch (op) {
70 case FUTEX_OP_ADD:
71 __futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret,
72 oldval, uaddr, oparg);
73 break;
74 case FUTEX_OP_OR:
75 __futex_atomic_op2("orl %4, %3", ret, oldval, uaddr,
76 oparg);
77 break;
78 case FUTEX_OP_ANDN:
79 __futex_atomic_op2("andl %4, %3", ret, oldval, uaddr,
80 ~oparg);
81 break;
82 case FUTEX_OP_XOR:
83 __futex_atomic_op2("xorl %4, %3", ret, oldval, uaddr,
84 oparg);
85 break;
86 default:
87 ret = -ENOSYS;
88 }
89 }
90
91 pagefault_enable();
92
93 if (!ret) {
94 switch (cmp) {
95 case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
96 case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
97 case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
98 case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
99 case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
100 case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
101 default: ret = -ENOSYS;
102 }
103 }
104 return ret;
105}
106
107static inline int
108futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
109{
110 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
111 return -EFAULT;
112
113 __asm__ __volatile__(
114 "1: " LOCK_PREFIX "cmpxchgl %3, %1 \n"
115
116 "2: .section .fixup, \"ax\" \n"
117 "3: mov %2, %0 \n"
118 " jmp 2b \n"
119 " .previous \n"
120
121 " .section __ex_table, \"a\" \n"
122 " .align 8 \n"
123 " .long 1b,3b \n"
124 " .previous \n"
125
126 : "=a" (oldval), "+m" (*uaddr)
127 : "i" (-EFAULT), "r" (newval), "0" (oldval)
128 : "memory"
129 );
130
131 return oldval;
132}
133
134#endif
135#endif
diff --git a/include/asm-x86/futex_64.h b/include/asm-x86/futex_64.h
deleted file mode 100644
index 5cdfb08013c3..000000000000
--- a/include/asm-x86/futex_64.h
+++ /dev/null
@@ -1,125 +0,0 @@
1#ifndef _ASM_FUTEX_H
2#define _ASM_FUTEX_H
3
4#ifdef __KERNEL__
5
6#include <linux/futex.h>
7#include <asm/errno.h>
8#include <asm/system.h>
9#include <asm/uaccess.h>
10
11#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \
12 __asm__ __volatile ( \
13"1: " insn "\n" \
14"2: .section .fixup,\"ax\"\n\
153: mov %3, %1\n\
16 jmp 2b\n\
17 .previous\n\
18 .section __ex_table,\"a\"\n\
19 .align 8\n\
20 .quad 1b,3b\n\
21 .previous" \
22 : "=r" (oldval), "=r" (ret), "=m" (*uaddr) \
23 : "i" (-EFAULT), "m" (*uaddr), "0" (oparg), "1" (0))
24
25#define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg) \
26 __asm__ __volatile ( \
27"1: movl %2, %0\n\
28 movl %0, %3\n" \
29 insn "\n" \
30"2: " LOCK_PREFIX "cmpxchgl %3, %2\n\
31 jnz 1b\n\
323: .section .fixup,\"ax\"\n\
334: mov %5, %1\n\
34 jmp 3b\n\
35 .previous\n\
36 .section __ex_table,\"a\"\n\
37 .align 8\n\
38 .quad 1b,4b,2b,4b\n\
39 .previous" \
40 : "=&a" (oldval), "=&r" (ret), "=m" (*uaddr), \
41 "=&r" (tem) \
42 : "r" (oparg), "i" (-EFAULT), "m" (*uaddr), "1" (0))
43
44static inline int
45futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
46{
47 int op = (encoded_op >> 28) & 7;
48 int cmp = (encoded_op >> 24) & 15;
49 int oparg = (encoded_op << 8) >> 20;
50 int cmparg = (encoded_op << 20) >> 20;
51 int oldval = 0, ret, tem;
52 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
53 oparg = 1 << oparg;
54
55 if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
56 return -EFAULT;
57
58 pagefault_disable();
59
60 switch (op) {
61 case FUTEX_OP_SET:
62 __futex_atomic_op1("xchgl %0, %2", ret, oldval, uaddr, oparg);
63 break;
64 case FUTEX_OP_ADD:
65 __futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret, oldval,
66 uaddr, oparg);
67 break;
68 case FUTEX_OP_OR:
69 __futex_atomic_op2("orl %4, %3", ret, oldval, uaddr, oparg);
70 break;
71 case FUTEX_OP_ANDN:
72 __futex_atomic_op2("andl %4, %3", ret, oldval, uaddr, ~oparg);
73 break;
74 case FUTEX_OP_XOR:
75 __futex_atomic_op2("xorl %4, %3", ret, oldval, uaddr, oparg);
76 break;
77 default:
78 ret = -ENOSYS;
79 }
80
81 pagefault_enable();
82
83 if (!ret) {
84 switch (cmp) {
85 case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
86 case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
87 case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
88 case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
89 case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
90 case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
91 default: ret = -ENOSYS;
92 }
93 }
94 return ret;
95}
96
97static inline int
98futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
99{
100 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
101 return -EFAULT;
102
103 __asm__ __volatile__(
104 "1: " LOCK_PREFIX "cmpxchgl %3, %1 \n"
105
106 "2: .section .fixup, \"ax\" \n"
107 "3: mov %2, %0 \n"
108 " jmp 2b \n"
109 " .previous \n"
110
111 " .section __ex_table, \"a\" \n"
112 " .align 8 \n"
113 " .quad 1b,3b \n"
114 " .previous \n"
115
116 : "=a" (oldval), "=m" (*uaddr)
117 : "i" (-EFAULT), "r" (newval), "0" (oldval)
118 : "memory"
119 );
120
121 return oldval;
122}
123
124#endif
125#endif
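
Both copies of __futex_atomic_op2 spell out the same read-modify-write retry loop in inline assembly: load the current value, apply the or/and/xor, and lock-cmpxchg the result back, retrying if another task changed the word in between. A rough C equivalent of that loop, using GCC's __sync builtin purely for illustration; the helper name and the apply callback are made up, and the kernel versions additionally hook the accesses into the exception table so a fault returns -EFAULT instead of oopsing.

	static int futex_rmw_sketch(int *uaddr, int oparg, int (*apply)(int, int))
	{
		int oldval, newval;

		do {
			oldval = *uaddr;                /* movl %2, %0           */
			newval = apply(oldval, oparg);  /* orl/andl/xorl %4, %3  */
		} while (__sync_val_compare_and_swap(uaddr, oldval, newval) != oldval);
		                                        /* lock cmpxchgl; jnz 1b */
		return oldval;
	}
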
diff --git a/include/asm-x86/gart.h b/include/asm-x86/gart.h
index f704c50519b8..90958ed993fa 100644
--- a/include/asm-x86/gart.h
+++ b/include/asm-x86/gart.h
@@ -9,6 +9,7 @@ extern int iommu_detected;
9extern void gart_iommu_init(void); 9extern void gart_iommu_init(void);
10extern void gart_iommu_shutdown(void); 10extern void gart_iommu_shutdown(void);
11extern void __init gart_parse_options(char *); 11extern void __init gart_parse_options(char *);
12extern void early_gart_iommu_check(void);
12extern void gart_iommu_hole_init(void); 13extern void gart_iommu_hole_init(void);
13extern int fallback_aper_order; 14extern int fallback_aper_order;
14extern int fallback_aper_force; 15extern int fallback_aper_force;
@@ -20,6 +21,10 @@ extern int fix_aperture;
20#define gart_iommu_aperture 0 21#define gart_iommu_aperture 0
21#define gart_iommu_aperture_allowed 0 22#define gart_iommu_aperture_allowed 0
22 23
24static inline void early_gart_iommu_check(void)
25{
26}
27
23static inline void gart_iommu_shutdown(void) 28static inline void gart_iommu_shutdown(void)
24{ 29{
25} 30}
diff --git a/include/asm-x86/geode.h b/include/asm-x86/geode.h
index 771af336734f..811fe14f70b2 100644
--- a/include/asm-x86/geode.h
+++ b/include/asm-x86/geode.h
@@ -121,9 +121,15 @@ extern int geode_get_dev_base(unsigned int dev);
121#define GPIO_MAP_Z 0xE8 121#define GPIO_MAP_Z 0xE8
122#define GPIO_MAP_W 0xEC 122#define GPIO_MAP_W 0xEC
123 123
124extern void geode_gpio_set(unsigned int, unsigned int); 124static inline u32 geode_gpio(unsigned int nr)
125extern void geode_gpio_clear(unsigned int, unsigned int); 125{
126extern int geode_gpio_isset(unsigned int, unsigned int); 126 BUG_ON(nr > 28);
127 return 1 << nr;
128}
129
130extern void geode_gpio_set(u32, unsigned int);
131extern void geode_gpio_clear(u32, unsigned int);
132extern int geode_gpio_isset(u32, unsigned int);
127extern void geode_gpio_setup_event(unsigned int, int, int); 133extern void geode_gpio_setup_event(unsigned int, int, int);
128extern void geode_gpio_set_irq(unsigned int, unsigned int); 134extern void geode_gpio_set_irq(unsigned int, unsigned int);
129 135
diff --git a/include/asm-x86/gpio.h b/include/asm-x86/gpio.h
new file mode 100644
index 000000000000..ff87fca0caf9
--- /dev/null
+++ b/include/asm-x86/gpio.h
@@ -0,0 +1,6 @@
1#ifndef _ASM_I386_GPIO_H
2#define _ASM_I386_GPIO_H
3
4#include <gpio.h>
5
6#endif /* _ASM_I386_GPIO_H */
diff --git a/include/asm-x86/hpet.h b/include/asm-x86/hpet.h
index ad8d6e758785..6a9b4ac59bf7 100644
--- a/include/asm-x86/hpet.h
+++ b/include/asm-x86/hpet.h
@@ -69,6 +69,7 @@ extern void force_hpet_resume(void);
69 69
70#include <linux/interrupt.h> 70#include <linux/interrupt.h>
71 71
72typedef irqreturn_t (*rtc_irq_handler)(int interrupt, void *cookie);
72extern int hpet_mask_rtc_irq_bit(unsigned long bit_mask); 73extern int hpet_mask_rtc_irq_bit(unsigned long bit_mask);
73extern int hpet_set_rtc_irq_bit(unsigned long bit_mask); 74extern int hpet_set_rtc_irq_bit(unsigned long bit_mask);
74extern int hpet_set_alarm_time(unsigned char hrs, unsigned char min, 75extern int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
@@ -77,13 +78,16 @@ extern int hpet_set_periodic_freq(unsigned long freq);
77extern int hpet_rtc_dropped_irq(void); 78extern int hpet_rtc_dropped_irq(void);
78extern int hpet_rtc_timer_init(void); 79extern int hpet_rtc_timer_init(void);
79extern irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id); 80extern irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id);
81extern int hpet_register_irq_handler(rtc_irq_handler handler);
82extern void hpet_unregister_irq_handler(rtc_irq_handler handler);
80 83
81#endif /* CONFIG_HPET_EMULATE_RTC */ 84#endif /* CONFIG_HPET_EMULATE_RTC */
82 85
83#else 86#else /* CONFIG_HPET_TIMER */
84 87
85static inline int hpet_enable(void) { return 0; } 88static inline int hpet_enable(void) { return 0; }
86static inline unsigned long hpet_readl(unsigned long a) { return 0; } 89static inline unsigned long hpet_readl(unsigned long a) { return 0; }
90static inline int is_hpet_enabled(void) { return 0; }
87 91
88#endif /* CONFIG_HPET_TIMER */ 92#endif
89#endif /* ASM_X86_HPET_H */ 93#endif /* ASM_X86_HPET_H */
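
The new declarations let the RTC code hand its interrupt handler to the HPET driver when the HPET emulates the RTC interrupt (they are only available with CONFIG_HPET_EMULATE_RTC). A sketch of how a hypothetical consumer would use them; the handler signature follows the rtc_irq_handler typedef above, and module wiring and error paths are omitted.

	#include <linux/init.h>
	#include <linux/interrupt.h>
	#include <asm/hpet.h>

	/* Hypothetical consumer of the registration hooks declared above. */
	static irqreturn_t my_rtc_handler(int interrupt, void *cookie)
	{
		/* process the emulated RTC tick */
		return IRQ_HANDLED;
	}

	static int __init my_rtc_init(void)
	{
		return hpet_register_irq_handler(my_rtc_handler);
	}

	static void __exit my_rtc_exit(void)
	{
		hpet_unregister_irq_handler(my_rtc_handler);
	}
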
diff --git a/include/asm-x86/hw_irq_32.h b/include/asm-x86/hw_irq_32.h
index 0bedbdf5e907..6d65fbb6358b 100644
--- a/include/asm-x86/hw_irq_32.h
+++ b/include/asm-x86/hw_irq_32.h
@@ -26,19 +26,19 @@
26 * Interrupt entry/exit code at both C and assembly level 26 * Interrupt entry/exit code at both C and assembly level
27 */ 27 */
28 28
29extern void (*interrupt[NR_IRQS])(void); 29extern void (*const interrupt[NR_IRQS])(void);
30 30
31#ifdef CONFIG_SMP 31#ifdef CONFIG_SMP
32fastcall void reschedule_interrupt(void); 32void reschedule_interrupt(void);
33fastcall void invalidate_interrupt(void); 33void invalidate_interrupt(void);
34fastcall void call_function_interrupt(void); 34void call_function_interrupt(void);
35#endif 35#endif
36 36
37#ifdef CONFIG_X86_LOCAL_APIC 37#ifdef CONFIG_X86_LOCAL_APIC
38fastcall void apic_timer_interrupt(void); 38void apic_timer_interrupt(void);
39fastcall void error_interrupt(void); 39void error_interrupt(void);
40fastcall void spurious_interrupt(void); 40void spurious_interrupt(void);
41fastcall void thermal_interrupt(void); 41void thermal_interrupt(void);
42#define platform_legacy_irq(irq) ((irq) < 16) 42#define platform_legacy_irq(irq) ((irq) < 16)
43#endif 43#endif
44 44
diff --git a/include/asm-x86/hw_irq_64.h b/include/asm-x86/hw_irq_64.h
index a470d59da678..312a58d6dac6 100644
--- a/include/asm-x86/hw_irq_64.h
+++ b/include/asm-x86/hw_irq_64.h
@@ -135,11 +135,13 @@ extern void init_8259A(int aeoi);
135extern void send_IPI_self(int vector); 135extern void send_IPI_self(int vector);
136extern void init_VISWS_APIC_irqs(void); 136extern void init_VISWS_APIC_irqs(void);
137extern void setup_IO_APIC(void); 137extern void setup_IO_APIC(void);
138extern void enable_IO_APIC(void);
138extern void disable_IO_APIC(void); 139extern void disable_IO_APIC(void);
139extern void print_IO_APIC(void); 140extern void print_IO_APIC(void);
140extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn); 141extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
141extern void send_IPI(int dest, int vector); 142extern void send_IPI(int dest, int vector);
142extern void setup_ioapic_dest(void); 143extern void setup_ioapic_dest(void);
144extern void native_init_IRQ(void);
143 145
144extern unsigned long io_apic_irqs; 146extern unsigned long io_apic_irqs;
145 147
diff --git a/include/asm-x86/i387.h b/include/asm-x86/i387.h
index a8bbed349664..ba8105ca822b 100644
--- a/include/asm-x86/i387.h
+++ b/include/asm-x86/i387.h
@@ -1,5 +1,360 @@
1#ifdef CONFIG_X86_32 1/*
2# include "i387_32.h" 2 * Copyright (C) 1994 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * General FPU state handling cleanups
6 * Gareth Hughes <gareth@valinux.com>, May 2000
7 * x86-64 work by Andi Kleen 2002
8 */
9
10#ifndef _ASM_X86_I387_H
11#define _ASM_X86_I387_H
12
13#include <linux/sched.h>
14#include <linux/kernel_stat.h>
15#include <linux/regset.h>
16#include <asm/processor.h>
17#include <asm/sigcontext.h>
18#include <asm/user.h>
19#include <asm/uaccess.h>
20
21extern void fpu_init(void);
22extern unsigned int mxcsr_feature_mask;
23extern void mxcsr_feature_mask_init(void);
24extern void init_fpu(struct task_struct *child);
25extern asmlinkage void math_state_restore(void);
26
27extern user_regset_active_fn fpregs_active, xfpregs_active;
28extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get;
29extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set;
30
31#ifdef CONFIG_IA32_EMULATION
32struct _fpstate_ia32;
33extern int save_i387_ia32(struct _fpstate_ia32 __user *buf);
34extern int restore_i387_ia32(struct _fpstate_ia32 __user *buf);
35#endif
36
37#ifdef CONFIG_X86_64
38
39/* Ignore delayed exceptions from user space */
40static inline void tolerant_fwait(void)
41{
42 asm volatile("1: fwait\n"
43 "2:\n"
44 " .section __ex_table,\"a\"\n"
45 " .align 8\n"
46 " .quad 1b,2b\n"
47 " .previous\n");
48}
49
50static inline int restore_fpu_checking(struct i387_fxsave_struct *fx)
51{
52 int err;
53
54 asm volatile("1: rex64/fxrstor (%[fx])\n\t"
55 "2:\n"
56 ".section .fixup,\"ax\"\n"
57 "3: movl $-1,%[err]\n"
58 " jmp 2b\n"
59 ".previous\n"
60 ".section __ex_table,\"a\"\n"
61 " .align 8\n"
62 " .quad 1b,3b\n"
63 ".previous"
64 : [err] "=r" (err)
65#if 0 /* See comment in __save_init_fpu() below. */
66 : [fx] "r" (fx), "m" (*fx), "0" (0));
67#else
68 : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0));
69#endif
70 if (unlikely(err))
71 init_fpu(current);
72 return err;
73}
74
75#define X87_FSW_ES (1 << 7) /* Exception Summary */
76
77/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
78 is pending. Clear the x87 state here by setting it to fixed
79 values. The kernel data segment can be sometimes 0 and sometimes
80 new user value. Both should be ok.
81 Use the PDA as safe address because it should be already in L1. */
82static inline void clear_fpu_state(struct i387_fxsave_struct *fx)
83{
84 if (unlikely(fx->swd & X87_FSW_ES))
85 asm volatile("fnclex");
86 alternative_input(ASM_NOP8 ASM_NOP2,
87 " emms\n" /* clear stack tags */
88 " fildl %%gs:0", /* load to clear state */
89 X86_FEATURE_FXSAVE_LEAK);
90}
91
92static inline int save_i387_checking(struct i387_fxsave_struct __user *fx)
93{
94 int err;
95
96 asm volatile("1: rex64/fxsave (%[fx])\n\t"
97 "2:\n"
98 ".section .fixup,\"ax\"\n"
99 "3: movl $-1,%[err]\n"
100 " jmp 2b\n"
101 ".previous\n"
102 ".section __ex_table,\"a\"\n"
103 " .align 8\n"
104 " .quad 1b,3b\n"
105 ".previous"
106 : [err] "=r" (err), "=m" (*fx)
107#if 0 /* See comment in __fxsave_clear() below. */
108 : [fx] "r" (fx), "0" (0));
109#else
110 : [fx] "cdaSDb" (fx), "0" (0));
111#endif
112 if (unlikely(err) && __clear_user(fx, sizeof(struct i387_fxsave_struct)))
113 err = -EFAULT;
114 /* No need to clear here because the caller clears USED_MATH */
115 return err;
116}
117
118static inline void __save_init_fpu(struct task_struct *tsk)
119{
120 /* Using "rex64; fxsave %0" is broken because, if the memory operand
121 uses any extended registers for addressing, a second REX prefix
122 will be generated (to the assembler, rex64 followed by semicolon
123 is a separate instruction), and hence the 64-bitness is lost. */
124#if 0
125 /* Using "fxsaveq %0" would be the ideal choice, but is only supported
126 starting with gas 2.16. */
127 __asm__ __volatile__("fxsaveq %0"
128 : "=m" (tsk->thread.i387.fxsave));
129#elif 0
130 /* Using, as a workaround, the properly prefixed form below isn't
131 accepted by any binutils version so far released, complaining that
132 the same type of prefix is used twice if an extended register is
133 needed for addressing (fix submitted to mainline 2005-11-21). */
134 __asm__ __volatile__("rex64/fxsave %0"
135 : "=m" (tsk->thread.i387.fxsave));
136#else
137 /* This, however, we can work around by forcing the compiler to select
138 an addressing mode that doesn't require extended registers. */
139 __asm__ __volatile__("rex64/fxsave %P2(%1)"
140 : "=m" (tsk->thread.i387.fxsave)
141 : "cdaSDb" (tsk),
142 "i" (offsetof(__typeof__(*tsk),
143 thread.i387.fxsave)));
144#endif
145 clear_fpu_state(&tsk->thread.i387.fxsave);
146 task_thread_info(tsk)->status &= ~TS_USEDFPU;
147}
148
149/*
150 * Signal frame handlers.
151 */
152
153static inline int save_i387(struct _fpstate __user *buf)
154{
155 struct task_struct *tsk = current;
156 int err = 0;
157
158 BUILD_BUG_ON(sizeof(struct user_i387_struct) !=
159 sizeof(tsk->thread.i387.fxsave));
160
161 if ((unsigned long)buf % 16)
162 printk("save_i387: bad fpstate %p\n", buf);
163
164 if (!used_math())
165 return 0;
166 clear_used_math(); /* trigger finit */
167 if (task_thread_info(tsk)->status & TS_USEDFPU) {
168 err = save_i387_checking((struct i387_fxsave_struct __user *)buf);
169 if (err) return err;
170 task_thread_info(tsk)->status &= ~TS_USEDFPU;
171 stts();
172 } else {
173 if (__copy_to_user(buf, &tsk->thread.i387.fxsave,
174 sizeof(struct i387_fxsave_struct)))
175 return -1;
176 }
177 return 1;
178}
179
180/*
181 * This restores directly out of user space. Exceptions are handled.
182 */
183static inline int restore_i387(struct _fpstate __user *buf)
184{
185 set_used_math();
186 if (!(task_thread_info(current)->status & TS_USEDFPU)) {
187 clts();
188 task_thread_info(current)->status |= TS_USEDFPU;
189 }
190 return restore_fpu_checking((__force struct i387_fxsave_struct *)buf);
191}
192
193#else /* CONFIG_X86_32 */
194
195static inline void tolerant_fwait(void)
196{
197 asm volatile("fnclex ; fwait");
198}
199
200static inline void restore_fpu(struct task_struct *tsk)
201{
202 /*
203 * The "nop" is needed to make the instructions the same
204 * length.
205 */
206 alternative_input(
207 "nop ; frstor %1",
208 "fxrstor %1",
209 X86_FEATURE_FXSR,
210 "m" ((tsk)->thread.i387.fxsave));
211}
212
213/* We need a safe address that is cheap to find and that is already
214 in L1 during context switch. The best choices are unfortunately
215 different for UP and SMP */
216#ifdef CONFIG_SMP
217#define safe_address (__per_cpu_offset[0])
3#else 218#else
4# include "i387_64.h" 219#define safe_address (kstat_cpu(0).cpustat.user)
5#endif 220#endif
221
222/*
223 * These must be called with preempt disabled
224 */
225static inline void __save_init_fpu(struct task_struct *tsk)
226{
227 /* Use more nops than strictly needed in case the compiler
228 varies code */
229 alternative_input(
230 "fnsave %[fx] ;fwait;" GENERIC_NOP8 GENERIC_NOP4,
231 "fxsave %[fx]\n"
232 "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:",
233 X86_FEATURE_FXSR,
234 [fx] "m" (tsk->thread.i387.fxsave),
235 [fsw] "m" (tsk->thread.i387.fxsave.swd) : "memory");
236 /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
237 is pending. Clear the x87 state here by setting it to fixed
238 values. safe_address is a random variable that should be in L1 */
239 alternative_input(
240 GENERIC_NOP8 GENERIC_NOP2,
241 "emms\n\t" /* clear stack tags */
242 "fildl %[addr]", /* set F?P to defined value */
243 X86_FEATURE_FXSAVE_LEAK,
244 [addr] "m" (safe_address));
245 task_thread_info(tsk)->status &= ~TS_USEDFPU;
246}
247
248/*
249 * Signal frame handlers...
250 */
251extern int save_i387(struct _fpstate __user *buf);
252extern int restore_i387(struct _fpstate __user *buf);
253
254#endif /* CONFIG_X86_64 */
255
256static inline void __unlazy_fpu(struct task_struct *tsk)
257{
258 if (task_thread_info(tsk)->status & TS_USEDFPU) {
259 __save_init_fpu(tsk);
260 stts();
261 } else
262 tsk->fpu_counter = 0;
263}
264
265static inline void __clear_fpu(struct task_struct *tsk)
266{
267 if (task_thread_info(tsk)->status & TS_USEDFPU) {
268 tolerant_fwait();
269 task_thread_info(tsk)->status &= ~TS_USEDFPU;
270 stts();
271 }
272}
273
274static inline void kernel_fpu_begin(void)
275{
276 struct thread_info *me = current_thread_info();
277 preempt_disable();
278 if (me->status & TS_USEDFPU)
279 __save_init_fpu(me->task);
280 else
281 clts();
282}
283
284static inline void kernel_fpu_end(void)
285{
286 stts();
287 preempt_enable();
288}
289
290#ifdef CONFIG_X86_64
291
292static inline void save_init_fpu(struct task_struct *tsk)
293{
294 __save_init_fpu(tsk);
295 stts();
296}
297
298#define unlazy_fpu __unlazy_fpu
299#define clear_fpu __clear_fpu
300
301#else /* CONFIG_X86_32 */
302
303/*
304 * These disable preemption on their own and are safe
305 */
306static inline void save_init_fpu(struct task_struct *tsk)
307{
308 preempt_disable();
309 __save_init_fpu(tsk);
310 stts();
311 preempt_enable();
312}
313
314static inline void unlazy_fpu(struct task_struct *tsk)
315{
316 preempt_disable();
317 __unlazy_fpu(tsk);
318 preempt_enable();
319}
320
321static inline void clear_fpu(struct task_struct *tsk)
322{
323 preempt_disable();
324 __clear_fpu(tsk);
325 preempt_enable();
326}
327
328#endif /* CONFIG_X86_64 */
329
330/*
331 * i387 state interaction
332 */
333static inline unsigned short get_fpu_cwd(struct task_struct *tsk)
334{
335 if (cpu_has_fxsr) {
336 return tsk->thread.i387.fxsave.cwd;
337 } else {
338 return (unsigned short)tsk->thread.i387.fsave.cwd;
339 }
340}
341
342static inline unsigned short get_fpu_swd(struct task_struct *tsk)
343{
344 if (cpu_has_fxsr) {
345 return tsk->thread.i387.fxsave.swd;
346 } else {
347 return (unsigned short)tsk->thread.i387.fsave.swd;
348 }
349}
350
351static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk)
352{
353 if (cpu_has_xmm) {
354 return tsk->thread.i387.fxsave.mxcsr;
355 } else {
356 return MXCSR_DEFAULT;
357 }
358}
359
360#endif /* _ASM_X86_I387_H */
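
kernel_fpu_begin()/kernel_fpu_end() are now shared between the 32-bit and 64-bit builds: begin disables preemption and either saves the current task's live FPU state or clears CR0.TS, end sets TS again and re-enables preemption. A sketch of the usage pattern for kernel code that wants to execute x87/SSE instructions; the function name is made up.

	#include <linux/types.h>
	#include <asm/i387.h>

	/* Hypothetical user: any in-kernel x87/SSE use must sit inside the pair. */
	static void sum_with_sse(void *dst, const void *src, size_t len)
	{
		kernel_fpu_begin();
		/* ... SSE/x87 instructions go here; preemption is disabled ... */
		kernel_fpu_end();
	}
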
diff --git a/include/asm-x86/i387_32.h b/include/asm-x86/i387_32.h
deleted file mode 100644
index cdd1e248e3b4..000000000000
--- a/include/asm-x86/i387_32.h
+++ /dev/null
@@ -1,151 +0,0 @@
1/*
2 * include/asm-i386/i387.h
3 *
4 * Copyright (C) 1994 Linus Torvalds
5 *
6 * Pentium III FXSR, SSE support
7 * General FPU state handling cleanups
8 * Gareth Hughes <gareth@valinux.com>, May 2000
9 */
10
11#ifndef __ASM_I386_I387_H
12#define __ASM_I386_I387_H
13
14#include <linux/sched.h>
15#include <linux/init.h>
16#include <linux/kernel_stat.h>
17#include <asm/processor.h>
18#include <asm/sigcontext.h>
19#include <asm/user.h>
20
21extern void mxcsr_feature_mask_init(void);
22extern void init_fpu(struct task_struct *);
23
24/*
25 * FPU lazy state save handling...
26 */
27
28/*
29 * The "nop" is needed to make the instructions the same
30 * length.
31 */
32#define restore_fpu(tsk) \
33 alternative_input( \
34 "nop ; frstor %1", \
35 "fxrstor %1", \
36 X86_FEATURE_FXSR, \
37 "m" ((tsk)->thread.i387.fxsave))
38
39extern void kernel_fpu_begin(void);
40#define kernel_fpu_end() do { stts(); preempt_enable(); } while(0)
41
42/* We need a safe address that is cheap to find and that is already
43 in L1 during context switch. The best choices are unfortunately
44 different for UP and SMP */
45#ifdef CONFIG_SMP
46#define safe_address (__per_cpu_offset[0])
47#else
48#define safe_address (kstat_cpu(0).cpustat.user)
49#endif
50
51/*
52 * These must be called with preempt disabled
53 */
54static inline void __save_init_fpu( struct task_struct *tsk )
55{
56 /* Use more nops than strictly needed in case the compiler
57 varies code */
58 alternative_input(
59 "fnsave %[fx] ;fwait;" GENERIC_NOP8 GENERIC_NOP4,
60 "fxsave %[fx]\n"
61 "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:",
62 X86_FEATURE_FXSR,
63 [fx] "m" (tsk->thread.i387.fxsave),
64 [fsw] "m" (tsk->thread.i387.fxsave.swd) : "memory");
65 /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
66 is pending. Clear the x87 state here by setting it to fixed
67 values. safe_address is a random variable that should be in L1 */
68 alternative_input(
69 GENERIC_NOP8 GENERIC_NOP2,
70 "emms\n\t" /* clear stack tags */
71 "fildl %[addr]", /* set F?P to defined value */
72 X86_FEATURE_FXSAVE_LEAK,
73 [addr] "m" (safe_address));
74 task_thread_info(tsk)->status &= ~TS_USEDFPU;
75}
76
77#define __unlazy_fpu( tsk ) do { \
78 if (task_thread_info(tsk)->status & TS_USEDFPU) { \
79 __save_init_fpu(tsk); \
80 stts(); \
81 } else \
82 tsk->fpu_counter = 0; \
83} while (0)
84
85#define __clear_fpu( tsk ) \
86do { \
87 if (task_thread_info(tsk)->status & TS_USEDFPU) { \
88 asm volatile("fnclex ; fwait"); \
89 task_thread_info(tsk)->status &= ~TS_USEDFPU; \
90 stts(); \
91 } \
92} while (0)
93
94
95/*
96 * These disable preemption on their own and are safe
97 */
98static inline void save_init_fpu( struct task_struct *tsk )
99{
100 preempt_disable();
101 __save_init_fpu(tsk);
102 stts();
103 preempt_enable();
104}
105
106#define unlazy_fpu( tsk ) do { \
107 preempt_disable(); \
108 __unlazy_fpu(tsk); \
109 preempt_enable(); \
110} while (0)
111
112#define clear_fpu( tsk ) do { \
113 preempt_disable(); \
114 __clear_fpu( tsk ); \
115 preempt_enable(); \
116} while (0)
117
118/*
119 * FPU state interaction...
120 */
121extern unsigned short get_fpu_cwd( struct task_struct *tsk );
122extern unsigned short get_fpu_swd( struct task_struct *tsk );
123extern unsigned short get_fpu_mxcsr( struct task_struct *tsk );
124extern asmlinkage void math_state_restore(void);
125
126/*
127 * Signal frame handlers...
128 */
129extern int save_i387( struct _fpstate __user *buf );
130extern int restore_i387( struct _fpstate __user *buf );
131
132/*
133 * ptrace request handlers...

134 */
135extern int get_fpregs( struct user_i387_struct __user *buf,
136 struct task_struct *tsk );
137extern int set_fpregs( struct task_struct *tsk,
138 struct user_i387_struct __user *buf );
139
140extern int get_fpxregs( struct user_fxsr_struct __user *buf,
141 struct task_struct *tsk );
142extern int set_fpxregs( struct task_struct *tsk,
143 struct user_fxsr_struct __user *buf );
144
145/*
146 * FPU state for core dumps...
147 */
148extern int dump_fpu( struct pt_regs *regs,
149 struct user_i387_struct *fpu );
150
151#endif /* __ASM_I386_I387_H */
diff --git a/include/asm-x86/i387_64.h b/include/asm-x86/i387_64.h
deleted file mode 100644
index 3a4ffba3d6bc..000000000000
--- a/include/asm-x86/i387_64.h
+++ /dev/null
@@ -1,214 +0,0 @@
1/*
2 * include/asm-x86_64/i387.h
3 *
4 * Copyright (C) 1994 Linus Torvalds
5 *
6 * Pentium III FXSR, SSE support
7 * General FPU state handling cleanups
8 * Gareth Hughes <gareth@valinux.com>, May 2000
9 * x86-64 work by Andi Kleen 2002
10 */
11
12#ifndef __ASM_X86_64_I387_H
13#define __ASM_X86_64_I387_H
14
15#include <linux/sched.h>
16#include <asm/processor.h>
17#include <asm/sigcontext.h>
18#include <asm/user.h>
19#include <asm/thread_info.h>
20#include <asm/uaccess.h>
21
22extern void fpu_init(void);
23extern unsigned int mxcsr_feature_mask;
24extern void mxcsr_feature_mask_init(void);
25extern void init_fpu(struct task_struct *child);
26extern int save_i387(struct _fpstate __user *buf);
27extern asmlinkage void math_state_restore(void);
28
29/*
30 * FPU lazy state save handling...
31 */
32
33#define unlazy_fpu(tsk) do { \
34 if (task_thread_info(tsk)->status & TS_USEDFPU) \
35 save_init_fpu(tsk); \
36 else \
37 tsk->fpu_counter = 0; \
38} while (0)
39
40/* Ignore delayed exceptions from user space */
41static inline void tolerant_fwait(void)
42{
43 asm volatile("1: fwait\n"
44 "2:\n"
45 " .section __ex_table,\"a\"\n"
46 " .align 8\n"
47 " .quad 1b,2b\n"
48 " .previous\n");
49}
50
51#define clear_fpu(tsk) do { \
52 if (task_thread_info(tsk)->status & TS_USEDFPU) { \
53 tolerant_fwait(); \
54 task_thread_info(tsk)->status &= ~TS_USEDFPU; \
55 stts(); \
56 } \
57} while (0)
58
59/*
60 * ptrace request handlers...
61 */
62extern int get_fpregs(struct user_i387_struct __user *buf,
63 struct task_struct *tsk);
64extern int set_fpregs(struct task_struct *tsk,
65 struct user_i387_struct __user *buf);
66
67/*
68 * i387 state interaction
69 */
70#define get_fpu_mxcsr(t) ((t)->thread.i387.fxsave.mxcsr)
71#define get_fpu_cwd(t) ((t)->thread.i387.fxsave.cwd)
72#define get_fpu_fxsr_twd(t) ((t)->thread.i387.fxsave.twd)
73#define get_fpu_swd(t) ((t)->thread.i387.fxsave.swd)
74#define set_fpu_cwd(t,val) ((t)->thread.i387.fxsave.cwd = (val))
75#define set_fpu_swd(t,val) ((t)->thread.i387.fxsave.swd = (val))
76#define set_fpu_fxsr_twd(t,val) ((t)->thread.i387.fxsave.twd = (val))
77
78#define X87_FSW_ES (1 << 7) /* Exception Summary */
79
80/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
81 is pending. Clear the x87 state here by setting it to fixed
82 values. The kernel data segment can be sometimes 0 and sometimes
83 new user value. Both should be ok.
84 Use the PDA as safe address because it should be already in L1. */
85static inline void clear_fpu_state(struct i387_fxsave_struct *fx)
86{
87 if (unlikely(fx->swd & X87_FSW_ES))
88 asm volatile("fnclex");
89 alternative_input(ASM_NOP8 ASM_NOP2,
90 " emms\n" /* clear stack tags */
91 " fildl %%gs:0", /* load to clear state */
92 X86_FEATURE_FXSAVE_LEAK);
93}
94
95static inline int restore_fpu_checking(struct i387_fxsave_struct *fx)
96{
97 int err;
98
99 asm volatile("1: rex64/fxrstor (%[fx])\n\t"
100 "2:\n"
101 ".section .fixup,\"ax\"\n"
102 "3: movl $-1,%[err]\n"
103 " jmp 2b\n"
104 ".previous\n"
105 ".section __ex_table,\"a\"\n"
106 " .align 8\n"
107 " .quad 1b,3b\n"
108 ".previous"
109 : [err] "=r" (err)
110#if 0 /* See comment in __fxsave_clear() below. */
111 : [fx] "r" (fx), "m" (*fx), "0" (0));
112#else
113 : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0));
114#endif
115 if (unlikely(err))
116 init_fpu(current);
117 return err;
118}
119
120static inline int save_i387_checking(struct i387_fxsave_struct __user *fx)
121{
122 int err;
123
124 asm volatile("1: rex64/fxsave (%[fx])\n\t"
125 "2:\n"
126 ".section .fixup,\"ax\"\n"
127 "3: movl $-1,%[err]\n"
128 " jmp 2b\n"
129 ".previous\n"
130 ".section __ex_table,\"a\"\n"
131 " .align 8\n"
132 " .quad 1b,3b\n"
133 ".previous"
134 : [err] "=r" (err), "=m" (*fx)
135#if 0 /* See comment in __fxsave_clear() below. */
136 : [fx] "r" (fx), "0" (0));
137#else
138 : [fx] "cdaSDb" (fx), "0" (0));
139#endif
140 if (unlikely(err) && __clear_user(fx, sizeof(struct i387_fxsave_struct)))
141 err = -EFAULT;
142 /* No need to clear here because the caller clears USED_MATH */
143 return err;
144}
145
146static inline void __fxsave_clear(struct task_struct *tsk)
147{
148 /* Using "rex64; fxsave %0" is broken because, if the memory operand
149 uses any extended registers for addressing, a second REX prefix
150 will be generated (to the assembler, rex64 followed by semicolon
151 is a separate instruction), and hence the 64-bitness is lost. */
152#if 0
153 /* Using "fxsaveq %0" would be the ideal choice, but is only supported
154 starting with gas 2.16. */
155 __asm__ __volatile__("fxsaveq %0"
156 : "=m" (tsk->thread.i387.fxsave));
157#elif 0
158 /* Using, as a workaround, the properly prefixed form below isn't
159 accepted by any binutils version so far released, complaining that
160 the same type of prefix is used twice if an extended register is
161 needed for addressing (fix submitted to mainline 2005-11-21). */
162 __asm__ __volatile__("rex64/fxsave %0"
163 : "=m" (tsk->thread.i387.fxsave));
164#else
165 /* This, however, we can work around by forcing the compiler to select
166 an addressing mode that doesn't require extended registers. */
167 __asm__ __volatile__("rex64/fxsave %P2(%1)"
168 : "=m" (tsk->thread.i387.fxsave)
169 : "cdaSDb" (tsk),
170 "i" (offsetof(__typeof__(*tsk),
171 thread.i387.fxsave)));
172#endif
173 clear_fpu_state(&tsk->thread.i387.fxsave);
174}
175
176static inline void kernel_fpu_begin(void)
177{
178 struct thread_info *me = current_thread_info();
179 preempt_disable();
180 if (me->status & TS_USEDFPU) {
181 __fxsave_clear(me->task);
182 me->status &= ~TS_USEDFPU;
183 return;
184 }
185 clts();
186}
187
188static inline void kernel_fpu_end(void)
189{
190 stts();
191 preempt_enable();
192}
193
194static inline void save_init_fpu(struct task_struct *tsk)
195{
196 __fxsave_clear(tsk);
197 task_thread_info(tsk)->status &= ~TS_USEDFPU;
198 stts();
199}
200
201/*
202 * This restores directly out of user space. Exceptions are handled.
203 */
204static inline int restore_i387(struct _fpstate __user *buf)
205{
206 set_used_math();
207 if (!(task_thread_info(current)->status & TS_USEDFPU)) {
208 clts();
209 task_thread_info(current)->status |= TS_USEDFPU;
210 }
211 return restore_fpu_checking((__force struct i387_fxsave_struct *)buf);
212}
213
214#endif /* __ASM_X86_64_I387_H */
diff --git a/include/asm-x86/i8253.h b/include/asm-x86/i8253.h
index 747548ec5d1d..b51c0487fc41 100644
--- a/include/asm-x86/i8253.h
+++ b/include/asm-x86/i8253.h
@@ -12,4 +12,7 @@ extern struct clock_event_device *global_clock_event;
12 12
13extern void setup_pit_timer(void); 13extern void setup_pit_timer(void);
14 14
15#define inb_pit inb_p
16#define outb_pit outb_p
17
15#endif /* __ASM_I8253_H__ */ 18#endif /* __ASM_I8253_H__ */
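
inb_pit/outb_pit simply alias the pausing port accessors, but they give PIT accesses a greppable name. A sketch of the classic latch-and-read sequence for channel 0 using them; the port numbers (0x43 control word, 0x40 channel 0 data) are the standard 8253/8254 values rather than constants from this header, and the locking normally taken around PIT access is omitted.

	#include <linux/types.h>
	#include <asm/i8253.h>
	#include <asm/io.h>

	static u16 pit_read_channel0(void)
	{
		u16 count;

		outb_pit(0x00, 0x43);		/* latch the channel 0 count */
		count  = inb_pit(0x40);		/* low byte                  */
		count |= inb_pit(0x40) << 8;	/* high byte                 */
		return count;
	}
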
diff --git a/include/asm-x86/i8259.h b/include/asm-x86/i8259.h
index 29d8f9a6b3fc..67c319e0efc7 100644
--- a/include/asm-x86/i8259.h
+++ b/include/asm-x86/i8259.h
@@ -3,10 +3,25 @@
3 3
4extern unsigned int cached_irq_mask; 4extern unsigned int cached_irq_mask;
5 5
6#define __byte(x,y) (((unsigned char *) &(y))[x]) 6#define __byte(x,y) (((unsigned char *) &(y))[x])
7#define cached_master_mask (__byte(0, cached_irq_mask)) 7#define cached_master_mask (__byte(0, cached_irq_mask))
8#define cached_slave_mask (__byte(1, cached_irq_mask)) 8#define cached_slave_mask (__byte(1, cached_irq_mask))
9 9
10/* i8259A PIC registers */
11#define PIC_MASTER_CMD 0x20
12#define PIC_MASTER_IMR 0x21
13#define PIC_MASTER_ISR PIC_MASTER_CMD
14#define PIC_MASTER_POLL PIC_MASTER_ISR
15#define PIC_MASTER_OCW3 PIC_MASTER_ISR
16#define PIC_SLAVE_CMD 0xa0
17#define PIC_SLAVE_IMR 0xa1
18
19/* i8259A PIC related value */
20#define PIC_CASCADE_IR 2
21#define MASTER_ICW4_DEFAULT 0x01
22#define SLAVE_ICW4_DEFAULT 0x01
23#define PIC_ICW4_AEOI 2
24
10extern spinlock_t i8259A_lock; 25extern spinlock_t i8259A_lock;
11 26
12extern void init_8259A(int auto_eoi); 27extern void init_8259A(int auto_eoi);
@@ -14,4 +29,7 @@ extern void enable_8259A_irq(unsigned int irq);
14extern void disable_8259A_irq(unsigned int irq); 29extern void disable_8259A_irq(unsigned int irq);
15extern unsigned int startup_8259A_irq(unsigned int irq); 30extern unsigned int startup_8259A_irq(unsigned int irq);
16 31
32#define inb_pic inb_p
33#define outb_pic outb_p
34
17#endif /* __ASM_I8259_H__ */ 35#endif /* __ASM_I8259_H__ */
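
With the i8259A register and command values now centralized here, PIC accesses elsewhere can be written against names instead of magic numbers. A sketch of masking one interrupt line on the master 8259A using the new macros; callers would normally hold i8259A_lock, which is skipped here for brevity.

	#include <asm/i8259.h>
	#include <asm/io.h>

	static void mask_master_irq(unsigned int irq)	/* irq 0..7 */
	{
		cached_master_mask |= 1 << irq;			/* update the cached mask */
		outb_pic(cached_master_mask, PIC_MASTER_IMR);	/* write it to the IMR    */
	}
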
diff --git a/include/asm-x86/ia32.h b/include/asm-x86/ia32.h
index 0190b7c4e319..aa9733206e29 100644
--- a/include/asm-x86/ia32.h
+++ b/include/asm-x86/ia32.h
@@ -159,12 +159,6 @@ struct ustat32 {
159#define IA32_STACK_TOP IA32_PAGE_OFFSET 159#define IA32_STACK_TOP IA32_PAGE_OFFSET
160 160
161#ifdef __KERNEL__ 161#ifdef __KERNEL__
162struct user_desc;
163struct siginfo_t;
164int do_get_thread_area(struct thread_struct *t, struct user_desc __user *info);
165int do_set_thread_area(struct thread_struct *t, struct user_desc __user *info);
166int ia32_child_tls(struct task_struct *p, struct pt_regs *childregs);
167
168struct linux_binprm; 162struct linux_binprm;
169extern int ia32_setup_arg_pages(struct linux_binprm *bprm, 163extern int ia32_setup_arg_pages(struct linux_binprm *bprm,
170 unsigned long stack_top, int exec_stack); 164 unsigned long stack_top, int exec_stack);
diff --git a/include/asm-x86/ia32_unistd.h b/include/asm-x86/ia32_unistd.h
index 5b52ce507338..61cea9e7c5c1 100644
--- a/include/asm-x86/ia32_unistd.h
+++ b/include/asm-x86/ia32_unistd.h
@@ -5,7 +5,7 @@
5 * This file contains the system call numbers of the ia32 port, 5 * This file contains the system call numbers of the ia32 port,
6 * this is for the kernel only. 6 * this is for the kernel only.
7 * Only add syscalls here where some part of the kernel needs to know 7 * Only add syscalls here where some part of the kernel needs to know
8 * the number. This should be otherwise in sync with asm-i386/unistd.h. -AK 8 * the number. This should be otherwise in sync with asm-x86/unistd_32.h. -AK
9 */ 9 */
10 10
11#define __NR_ia32_restart_syscall 0 11#define __NR_ia32_restart_syscall 0
diff --git a/include/asm-x86/ide.h b/include/asm-x86/ide.h
index 42130adf9c7c..c2552d8bebf7 100644
--- a/include/asm-x86/ide.h
+++ b/include/asm-x86/ide.h
@@ -1,6 +1,4 @@
1/* 1/*
2 * linux/include/asm-i386/ide.h
3 *
4 * Copyright (C) 1994-1996 Linus Torvalds & authors 2 * Copyright (C) 1994-1996 Linus Torvalds & authors
5 */ 3 */
6 4
diff --git a/include/asm-x86/idle.h b/include/asm-x86/idle.h
index 6bd47dcf2067..d240e5b30a45 100644
--- a/include/asm-x86/idle.h
+++ b/include/asm-x86/idle.h
@@ -6,7 +6,6 @@
6 6
7struct notifier_block; 7struct notifier_block;
8void idle_notifier_register(struct notifier_block *n); 8void idle_notifier_register(struct notifier_block *n);
9void idle_notifier_unregister(struct notifier_block *n);
10 9
11void enter_idle(void); 10void enter_idle(void);
12void exit_idle(void); 11void exit_idle(void);
diff --git a/include/asm-x86/io_32.h b/include/asm-x86/io_32.h
index fe881cd1e6f4..586d7aa54ceb 100644
--- a/include/asm-x86/io_32.h
+++ b/include/asm-x86/io_32.h
@@ -100,8 +100,6 @@ static inline void * phys_to_virt(unsigned long address)
100 */ 100 */
101#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) 101#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
102 102
103extern void __iomem * __ioremap(unsigned long offset, unsigned long size, unsigned long flags);
104
105/** 103/**
106 * ioremap - map bus memory into CPU space 104 * ioremap - map bus memory into CPU space
107 * @offset: bus address of the memory 105 * @offset: bus address of the memory
@@ -111,32 +109,39 @@ extern void __iomem * __ioremap(unsigned long offset, unsigned long size, unsign
111 * make bus memory CPU accessible via the readb/readw/readl/writeb/ 109 * make bus memory CPU accessible via the readb/readw/readl/writeb/
112 * writew/writel functions and the other mmio helpers. The returned 110 * writew/writel functions and the other mmio helpers. The returned
113 * address is not guaranteed to be usable directly as a virtual 111 * address is not guaranteed to be usable directly as a virtual
114 * address. 112 * address.
115 * 113 *
116 * If the area you are trying to map is a PCI BAR you should have a 114 * If the area you are trying to map is a PCI BAR you should have a
117 * look at pci_iomap(). 115 * look at pci_iomap().
118 */ 116 */
117extern void __iomem *ioremap_nocache(unsigned long offset, unsigned long size);
118extern void __iomem *ioremap_cache(unsigned long offset, unsigned long size);
119 119
120static inline void __iomem * ioremap(unsigned long offset, unsigned long size) 120/*
121 * The default ioremap() behavior is non-cached:
122 */
123static inline void __iomem *ioremap(unsigned long offset, unsigned long size)
121{ 124{
122 return __ioremap(offset, size, 0); 125 return ioremap_nocache(offset, size);
123} 126}
124 127
125extern void __iomem * ioremap_nocache(unsigned long offset, unsigned long size);
126extern void iounmap(volatile void __iomem *addr); 128extern void iounmap(volatile void __iomem *addr);
127 129
128/* 130/*
129 * bt_ioremap() and bt_iounmap() are for temporary early boot-time 131 * early_ioremap() and early_iounmap() are for temporary early boot-time
130 * mappings, before the real ioremap() is functional. 132 * mappings, before the real ioremap() is functional.
131 * A boot-time mapping is currently limited to at most 16 pages. 133 * A boot-time mapping is currently limited to at most 16 pages.
132 */ 134 */
133extern void *bt_ioremap(unsigned long offset, unsigned long size); 135extern void early_ioremap_init(void);
134extern void bt_iounmap(void *addr, unsigned long size); 136extern void early_ioremap_clear(void);
137extern void early_ioremap_reset(void);
138extern void *early_ioremap(unsigned long offset, unsigned long size);
139extern void early_iounmap(void *addr, unsigned long size);
135extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys); 140extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
136 141
137/* Use early IO mappings for DMI because it's initialized early */ 142/* Use early IO mappings for DMI because it's initialized early */
138#define dmi_ioremap bt_ioremap 143#define dmi_ioremap early_ioremap
139#define dmi_iounmap bt_iounmap 144#define dmi_iounmap early_iounmap
140#define dmi_alloc alloc_bootmem 145#define dmi_alloc alloc_bootmem
141 146
142/* 147/*
@@ -250,10 +255,10 @@ static inline void flush_write_buffers(void)
250 255
251#endif /* __KERNEL__ */ 256#endif /* __KERNEL__ */
252 257
253static inline void native_io_delay(void) 258extern void native_io_delay(void);
254{ 259
255 asm volatile("outb %%al,$0x80" : : : "memory"); 260extern int io_delay_type;
256} 261extern void io_delay_init(void);
257 262
258#if defined(CONFIG_PARAVIRT) 263#if defined(CONFIG_PARAVIRT)
259#include <asm/paravirt.h> 264#include <asm/paravirt.h>
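
The 32-bit header now exposes the same remapping interface as the 64-bit one: __ioremap() is gone, ioremap() defaults to the uncached mapping, and early_ioremap()/early_iounmap() replace bt_ioremap()/bt_iounmap() for boot-time mappings. A sketch of the common driver-side use; the physical address and size are placeholders.

	#include <linux/errno.h>
	#include <asm/io.h>

	static void __iomem *regs;

	static int map_device_registers(void)
	{
		regs = ioremap(0xfed00000, 0x1000);	/* equivalent to ioremap_nocache() */
		if (!regs)
			return -ENOMEM;
		return 0;
	}

	static void unmap_device_registers(void)
	{
		iounmap(regs);
	}
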
diff --git a/include/asm-x86/io_64.h b/include/asm-x86/io_64.h
index a037b0794332..f64a59cc396d 100644
--- a/include/asm-x86/io_64.h
+++ b/include/asm-x86/io_64.h
@@ -35,12 +35,24 @@
35 * - Arnaldo Carvalho de Melo <acme@conectiva.com.br> 35 * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
36 */ 36 */
37 37
38#define __SLOW_DOWN_IO "\noutb %%al,$0x80" 38extern void native_io_delay(void);
39 39
40#ifdef REALLY_SLOW_IO 40extern int io_delay_type;
41#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO 41extern void io_delay_init(void);
42
43#if defined(CONFIG_PARAVIRT)
44#include <asm/paravirt.h>
42#else 45#else
43#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO 46
47static inline void slow_down_io(void)
48{
49 native_io_delay();
50#ifdef REALLY_SLOW_IO
51 native_io_delay();
52 native_io_delay();
53 native_io_delay();
54#endif
55}
44#endif 56#endif
45 57
46/* 58/*
@@ -52,9 +64,15 @@ static inline void out##s(unsigned x value, unsigned short port) {
52#define __OUT2(s,s1,s2) \ 64#define __OUT2(s,s1,s2) \
53__asm__ __volatile__ ("out" #s " %" s1 "0,%" s2 "1" 65__asm__ __volatile__ ("out" #s " %" s1 "0,%" s2 "1"
54 66
67#ifndef REALLY_SLOW_IO
68#define REALLY_SLOW_IO
69#define UNSET_REALLY_SLOW_IO
70#endif
71
55#define __OUT(s,s1,x) \ 72#define __OUT(s,s1,x) \
56__OUT1(s,x) __OUT2(s,s1,"w") : : "a" (value), "Nd" (port)); } \ 73__OUT1(s,x) __OUT2(s,s1,"w") : : "a" (value), "Nd" (port)); } \
57__OUT1(s##_p,x) __OUT2(s,s1,"w") __FULL_SLOW_DOWN_IO : : "a" (value), "Nd" (port));} \ 74__OUT1(s##_p, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
75 slow_down_io(); }
58 76
59#define __IN1(s) \ 77#define __IN1(s) \
60static inline RETURN_TYPE in##s(unsigned short port) { RETURN_TYPE _v; 78static inline RETURN_TYPE in##s(unsigned short port) { RETURN_TYPE _v;
@@ -63,8 +81,13 @@ static inline RETURN_TYPE in##s(unsigned short port) { RETURN_TYPE _v;
63__asm__ __volatile__ ("in" #s " %" s2 "1,%" s1 "0" 81__asm__ __volatile__ ("in" #s " %" s2 "1,%" s1 "0"
64 82
65#define __IN(s,s1,i...) \ 83#define __IN(s,s1,i...) \
66__IN1(s) __IN2(s,s1,"w") : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \ 84__IN1(s) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); return _v; } \
67__IN1(s##_p) __IN2(s,s1,"w") __FULL_SLOW_DOWN_IO : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \ 85__IN1(s##_p) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \
86 slow_down_io(); return _v; }
87
88#ifdef UNSET_REALLY_SLOW_IO
89#undef REALLY_SLOW_IO
90#endif
68 91
69#define __INS(s) \ 92#define __INS(s) \
70static inline void ins##s(unsigned short port, void * addr, unsigned long count) \ 93static inline void ins##s(unsigned short port, void * addr, unsigned long count) \
@@ -127,13 +150,6 @@ static inline void * phys_to_virt(unsigned long address)
127 150
128#include <asm-generic/iomap.h> 151#include <asm-generic/iomap.h>
129 152
130extern void __iomem *__ioremap(unsigned long offset, unsigned long size, unsigned long flags);
131
132static inline void __iomem * ioremap (unsigned long offset, unsigned long size)
133{
134 return __ioremap(offset, size, 0);
135}
136
137extern void *early_ioremap(unsigned long addr, unsigned long size); 153extern void *early_ioremap(unsigned long addr, unsigned long size);
138extern void early_iounmap(void *addr, unsigned long size); 154extern void early_iounmap(void *addr, unsigned long size);
139 155
@@ -142,8 +158,19 @@ extern void early_iounmap(void *addr, unsigned long size);
142 * it's useful if some control registers are in such an area and write combining 158 * it's useful if some control registers are in such an area and write combining
143 * or read caching is not desirable: 159 * or read caching is not desirable:
144 */ 160 */
145extern void __iomem * ioremap_nocache (unsigned long offset, unsigned long size); 161extern void __iomem *ioremap_nocache(unsigned long offset, unsigned long size);
162extern void __iomem *ioremap_cache(unsigned long offset, unsigned long size);
163
164/*
165 * The default ioremap() behavior is non-cached:
166 */
167static inline void __iomem *ioremap(unsigned long offset, unsigned long size)
168{
169 return ioremap_nocache(offset, size);
170}
171
146extern void iounmap(volatile void __iomem *addr); 172extern void iounmap(volatile void __iomem *addr);
173
147extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys); 174extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
148 175
149/* 176/*
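
On the 64-bit side the string-pasted __SLOW_DOWN_IO asm is replaced by a call to slow_down_io(), so every generated _p accessor is now an ordinary port access followed by that call, which issues native_io_delay() once, or four times when REALLY_SLOW_IO is defined before the header is included. Roughly, what the macros expand to for the byte case (the real outb_p() is generated by __OUT above; this is only a sketch of the expansion).

	#include <asm/io.h>

	static inline void outb_p_equivalent(unsigned char value, unsigned short port)
	{
		outb(value, port);	/* the plain access                        */
		slow_down_io();		/* one native_io_delay(), or four when     */
					/* REALLY_SLOW_IO is defined               */
	}
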
diff --git a/include/asm-x86/io_apic.h b/include/asm-x86/io_apic.h
index 88494966beeb..0f5b3fef0b08 100644
--- a/include/asm-x86/io_apic.h
+++ b/include/asm-x86/io_apic.h
@@ -1,5 +1,159 @@
1#ifndef __ASM_IO_APIC_H
2#define __ASM_IO_APIC_H
3
4#include <asm/types.h>
5#include <asm/mpspec.h>
6#include <asm/apicdef.h>
7
8/*
9 * Intel IO-APIC support for SMP and UP systems.
10 *
11 * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar
12 */
13
14/*
15 * The structure of the IO-APIC:
16 */
17union IO_APIC_reg_00 {
18 u32 raw;
19 struct {
20 u32 __reserved_2 : 14,
21 LTS : 1,
22 delivery_type : 1,
23 __reserved_1 : 8,
24 ID : 8;
25 } __attribute__ ((packed)) bits;
26};
27
28union IO_APIC_reg_01 {
29 u32 raw;
30 struct {
31 u32 version : 8,
32 __reserved_2 : 7,
33 PRQ : 1,
34 entries : 8,
35 __reserved_1 : 8;
36 } __attribute__ ((packed)) bits;
37};
38
39union IO_APIC_reg_02 {
40 u32 raw;
41 struct {
42 u32 __reserved_2 : 24,
43 arbitration : 4,
44 __reserved_1 : 4;
45 } __attribute__ ((packed)) bits;
46};
47
48union IO_APIC_reg_03 {
49 u32 raw;
50 struct {
51 u32 boot_DT : 1,
52 __reserved_1 : 31;
53 } __attribute__ ((packed)) bits;
54};
55
56enum ioapic_irq_destination_types {
57 dest_Fixed = 0,
58 dest_LowestPrio = 1,
59 dest_SMI = 2,
60 dest__reserved_1 = 3,
61 dest_NMI = 4,
62 dest_INIT = 5,
63 dest__reserved_2 = 6,
64 dest_ExtINT = 7
65};
66
67struct IO_APIC_route_entry {
68 __u32 vector : 8,
69 delivery_mode : 3, /* 000: FIXED
70 * 001: lowest prio
71 * 111: ExtINT
72 */
73 dest_mode : 1, /* 0: physical, 1: logical */
74 delivery_status : 1,
75 polarity : 1,
76 irr : 1,
77 trigger : 1, /* 0: edge, 1: level */
78 mask : 1, /* 0: enabled, 1: disabled */
79 __reserved_2 : 15;
80
1#ifdef CONFIG_X86_32 81#ifdef CONFIG_X86_32
2# include "io_apic_32.h" 82 union {
83 struct {
84 __u32 __reserved_1 : 24,
85 physical_dest : 4,
86 __reserved_2 : 4;
87 } physical;
88
89 struct {
90 __u32 __reserved_1 : 24,
91 logical_dest : 8;
92 } logical;
93 } dest;
3#else 94#else
4# include "io_apic_64.h" 95 __u32 __reserved_3 : 24,
96 dest : 8;
97#endif
98
99} __attribute__ ((packed));
100
101#ifdef CONFIG_X86_IO_APIC
102
103/*
104 * # of IO-APICs and # of IRQ routing registers
105 */
106extern int nr_ioapics;
107extern int nr_ioapic_registers[MAX_IO_APICS];
108
109/*
110 * MP-BIOS irq configuration table structures:
111 */
112
113/* I/O APIC entries */
114extern struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
115
116/* # of MP IRQ source entries */
117extern int mp_irq_entries;
118
119/* MP IRQ source entries */
120extern struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
121
122/* non-0 if default (table-less) MP configuration */
123extern int mpc_default_type;
124
125/* Older SiS APIC requires we rewrite the index register */
126extern int sis_apic_bug;
127
128/* 1 if "noapic" boot option passed */
129extern int skip_ioapic_setup;
130
131static inline void disable_ioapic_setup(void)
132{
133 skip_ioapic_setup = 1;
134}
135
136/*
137 * If we use the IO-APIC for IRQ routing, disable automatic
138 * assignment of PCI IRQ's.
139 */
140#define io_apic_assign_pci_irqs \
141 (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs)
142
143#ifdef CONFIG_ACPI
144extern int io_apic_get_unique_id(int ioapic, int apic_id);
145extern int io_apic_get_version(int ioapic);
146extern int io_apic_get_redir_entries(int ioapic);
147extern int io_apic_set_pci_routing(int ioapic, int pin, int irq,
148 int edge_level, int active_high_low);
149extern int timer_uses_ioapic_pin_0;
150#endif /* CONFIG_ACPI */
151
152extern int (*ioapic_renumber_irq)(int ioapic, int irq);
153extern void ioapic_init_mappings(void);
154
155#else /* !CONFIG_X86_IO_APIC */
156#define io_apic_assign_pci_irqs 0
157#endif
158
5#endif 159#endif
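
The two io_apic headers are merged into one; the only layout difference kept is the destination field, which stays a 4-bit physical / 8-bit logical union on 32-bit and a plain 8-bit field on 64-bit. A sketch of filling in a redirection entry with the unified structure; values are illustrative, and writing the entry into the IO-APIC under the proper lock is not shown.

	#include <asm/io_apic.h>

	static struct IO_APIC_route_entry make_entry(u8 vector, u8 apic_id)
	{
		struct IO_APIC_route_entry entry = { };

		entry.vector        = vector;
		entry.delivery_mode = dest_Fixed;	/* 000: fixed delivery  */
		entry.dest_mode     = 0;		/* physical destination */
		entry.trigger       = 0;		/* edge triggered       */
		entry.mask          = 1;		/* start out disabled   */
	#ifdef CONFIG_X86_32
		entry.dest.physical.physical_dest = apic_id;
	#else
		entry.dest = apic_id;
	#endif
		return entry;
	}
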
diff --git a/include/asm-x86/io_apic_32.h b/include/asm-x86/io_apic_32.h
deleted file mode 100644
index 3f087883ea48..000000000000
--- a/include/asm-x86/io_apic_32.h
+++ /dev/null
@@ -1,155 +0,0 @@
1#ifndef __ASM_IO_APIC_H
2#define __ASM_IO_APIC_H
3
4#include <asm/types.h>
5#include <asm/mpspec.h>
6#include <asm/apicdef.h>
7
8/*
9 * Intel IO-APIC support for SMP and UP systems.
10 *
11 * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar
12 */
13
14/*
15 * The structure of the IO-APIC:
16 */
17union IO_APIC_reg_00 {
18 u32 raw;
19 struct {
20 u32 __reserved_2 : 14,
21 LTS : 1,
22 delivery_type : 1,
23 __reserved_1 : 8,
24 ID : 8;
25 } __attribute__ ((packed)) bits;
26};
27
28union IO_APIC_reg_01 {
29 u32 raw;
30 struct {
31 u32 version : 8,
32 __reserved_2 : 7,
33 PRQ : 1,
34 entries : 8,
35 __reserved_1 : 8;
36 } __attribute__ ((packed)) bits;
37};
38
39union IO_APIC_reg_02 {
40 u32 raw;
41 struct {
42 u32 __reserved_2 : 24,
43 arbitration : 4,
44 __reserved_1 : 4;
45 } __attribute__ ((packed)) bits;
46};
47
48union IO_APIC_reg_03 {
49 u32 raw;
50 struct {
51 u32 boot_DT : 1,
52 __reserved_1 : 31;
53 } __attribute__ ((packed)) bits;
54};
55
56enum ioapic_irq_destination_types {
57 dest_Fixed = 0,
58 dest_LowestPrio = 1,
59 dest_SMI = 2,
60 dest__reserved_1 = 3,
61 dest_NMI = 4,
62 dest_INIT = 5,
63 dest__reserved_2 = 6,
64 dest_ExtINT = 7
65};
66
67struct IO_APIC_route_entry {
68 __u32 vector : 8,
69 delivery_mode : 3, /* 000: FIXED
70 * 001: lowest prio
71 * 111: ExtINT
72 */
73 dest_mode : 1, /* 0: physical, 1: logical */
74 delivery_status : 1,
75 polarity : 1,
76 irr : 1,
77 trigger : 1, /* 0: edge, 1: level */
78 mask : 1, /* 0: enabled, 1: disabled */
79 __reserved_2 : 15;
80
81 union { struct { __u32
82 __reserved_1 : 24,
83 physical_dest : 4,
84 __reserved_2 : 4;
85 } physical;
86
87 struct { __u32
88 __reserved_1 : 24,
89 logical_dest : 8;
90 } logical;
91 } dest;
92
93} __attribute__ ((packed));
94
95#ifdef CONFIG_X86_IO_APIC
96
97/*
98 * # of IO-APICs and # of IRQ routing registers
99 */
100extern int nr_ioapics;
101extern int nr_ioapic_registers[MAX_IO_APICS];
102
103/*
104 * MP-BIOS irq configuration table structures:
105 */
106
107/* I/O APIC entries */
108extern struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
109
110/* # of MP IRQ source entries */
111extern int mp_irq_entries;
112
113/* MP IRQ source entries */
114extern struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
115
116/* non-0 if default (table-less) MP configuration */
117extern int mpc_default_type;
118
119/* Older SiS APIC requires we rewrite the index register */
120extern int sis_apic_bug;
121
122/* 1 if "noapic" boot option passed */
123extern int skip_ioapic_setup;
124
125static inline void disable_ioapic_setup(void)
126{
127 skip_ioapic_setup = 1;
128}
129
130static inline int ioapic_setup_disabled(void)
131{
132 return skip_ioapic_setup;
133}
134
135/*
136 * If we use the IO-APIC for IRQ routing, disable automatic
137 * assignment of PCI IRQ's.
138 */
139#define io_apic_assign_pci_irqs (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs)
140
141#ifdef CONFIG_ACPI
142extern int io_apic_get_unique_id (int ioapic, int apic_id);
143extern int io_apic_get_version (int ioapic);
144extern int io_apic_get_redir_entries (int ioapic);
145extern int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low);
146extern int timer_uses_ioapic_pin_0;
147#endif /* CONFIG_ACPI */
148
149extern int (*ioapic_renumber_irq)(int ioapic, int irq);
150
151#else /* !CONFIG_X86_IO_APIC */
152#define io_apic_assign_pci_irqs 0
153#endif
154
155#endif
diff --git a/include/asm-x86/io_apic_64.h b/include/asm-x86/io_apic_64.h
deleted file mode 100644
index e2c13675ee4e..000000000000
--- a/include/asm-x86/io_apic_64.h
+++ /dev/null
@@ -1,138 +0,0 @@
1#ifndef __ASM_IO_APIC_H
2#define __ASM_IO_APIC_H
3
4#include <asm/types.h>
5#include <asm/mpspec.h>
6#include <asm/apicdef.h>
7
8/*
9 * Intel IO-APIC support for SMP and UP systems.
10 *
11 * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar
12 */
13
14#define APIC_MISMATCH_DEBUG
15
16/*
17 * The structure of the IO-APIC:
18 */
19union IO_APIC_reg_00 {
20 u32 raw;
21 struct {
22 u32 __reserved_2 : 14,
23 LTS : 1,
24 delivery_type : 1,
25 __reserved_1 : 8,
26 ID : 8;
27 } __attribute__ ((packed)) bits;
28};
29
30union IO_APIC_reg_01 {
31 u32 raw;
32 struct {
33 u32 version : 8,
34 __reserved_2 : 7,
35 PRQ : 1,
36 entries : 8,
37 __reserved_1 : 8;
38 } __attribute__ ((packed)) bits;
39};
40
41union IO_APIC_reg_02 {
42 u32 raw;
43 struct {
44 u32 __reserved_2 : 24,
45 arbitration : 4,
46 __reserved_1 : 4;
47 } __attribute__ ((packed)) bits;
48};
49
50union IO_APIC_reg_03 {
51 u32 raw;
52 struct {
53 u32 boot_DT : 1,
54 __reserved_1 : 31;
55 } __attribute__ ((packed)) bits;
56};
57
58/*
59 * # of IO-APICs and # of IRQ routing registers
60 */
61extern int nr_ioapics;
62extern int nr_ioapic_registers[MAX_IO_APICS];
63
64enum ioapic_irq_destination_types {
65 dest_Fixed = 0,
66 dest_LowestPrio = 1,
67 dest_SMI = 2,
68 dest__reserved_1 = 3,
69 dest_NMI = 4,
70 dest_INIT = 5,
71 dest__reserved_2 = 6,
72 dest_ExtINT = 7
73};
74
75struct IO_APIC_route_entry {
76 __u32 vector : 8,
77 delivery_mode : 3, /* 000: FIXED
78 * 001: lowest prio
79 * 111: ExtINT
80 */
81 dest_mode : 1, /* 0: physical, 1: logical */
82 delivery_status : 1,
83 polarity : 1,
84 irr : 1,
85 trigger : 1, /* 0: edge, 1: level */
86 mask : 1, /* 0: enabled, 1: disabled */
87 __reserved_2 : 15;
88
89 __u32 __reserved_3 : 24,
90 dest : 8;
91} __attribute__ ((packed));
92
93/*
94 * MP-BIOS irq configuration table structures:
95 */
96
97/* I/O APIC entries */
98extern struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
99
100/* # of MP IRQ source entries */
101extern int mp_irq_entries;
102
103/* MP IRQ source entries */
104extern struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
105
106/* non-0 if default (table-less) MP configuration */
107extern int mpc_default_type;
108
109/* 1 if "noapic" boot option passed */
110extern int skip_ioapic_setup;
111
112static inline void disable_ioapic_setup(void)
113{
114 skip_ioapic_setup = 1;
115}
116
117
118/*
119 * If we use the IO-APIC for IRQ routing, disable automatic
120 * assignment of PCI IRQ's.
121 */
122#define io_apic_assign_pci_irqs (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs)
123
124#ifdef CONFIG_ACPI
125extern int io_apic_get_version (int ioapic);
126extern int io_apic_get_redir_entries (int ioapic);
127extern int io_apic_set_pci_routing (int ioapic, int pin, int irq, int, int);
128#endif
129
130extern int sis_apic_bug; /* dummy */
131
132void enable_NMI_through_LVT0 (void * dummy);
133
134extern spinlock_t i8259A_lock;
135
136extern int timer_over_8254;
137
138#endif
diff --git a/include/asm-x86/irqflags.h b/include/asm-x86/irqflags.h
index 1b695ff52687..92021c1ffa3a 100644
--- a/include/asm-x86/irqflags.h
+++ b/include/asm-x86/irqflags.h
@@ -1,5 +1,245 @@
1#ifdef CONFIG_X86_32 1#ifndef _X86_IRQFLAGS_H_
2# include "irqflags_32.h" 2#define _X86_IRQFLAGS_H_
3
4#include <asm/processor-flags.h>
5
6#ifndef __ASSEMBLY__
7/*
8 * Interrupt control:
9 */
10
11static inline unsigned long native_save_fl(void)
12{
13 unsigned long flags;
14
15 __asm__ __volatile__(
16 "# __raw_save_flags\n\t"
17 "pushf ; pop %0"
18 : "=g" (flags)
19 : /* no input */
20 : "memory"
21 );
22
23 return flags;
24}
25
26static inline void native_restore_fl(unsigned long flags)
27{
28 __asm__ __volatile__(
29 "push %0 ; popf"
30 : /* no output */
31 :"g" (flags)
32 :"memory", "cc"
33 );
34}
35
36static inline void native_irq_disable(void)
37{
38 asm volatile("cli": : :"memory");
39}
40
41static inline void native_irq_enable(void)
42{
43 asm volatile("sti": : :"memory");
44}
45
46static inline void native_safe_halt(void)
47{
48 asm volatile("sti; hlt": : :"memory");
49}
50
51static inline void native_halt(void)
52{
53 asm volatile("hlt": : :"memory");
54}
55
56#endif
57
58#ifdef CONFIG_PARAVIRT
59#include <asm/paravirt.h>
60#else
61#ifndef __ASSEMBLY__
62
63static inline unsigned long __raw_local_save_flags(void)
64{
65 return native_save_fl();
66}
67
68static inline void raw_local_irq_restore(unsigned long flags)
69{
70 native_restore_fl(flags);
71}
72
73static inline void raw_local_irq_disable(void)
74{
75 native_irq_disable();
76}
77
78static inline void raw_local_irq_enable(void)
79{
80 native_irq_enable();
81}
82
83/*
84 * Used in the idle loop; sti takes one instruction cycle
85 * to complete:
86 */
87static inline void raw_safe_halt(void)
88{
89 native_safe_halt();
90}
91
92/*
93 * Used when interrupts are already enabled or to
94 * shutdown the processor:
95 */
96static inline void halt(void)
97{
98 native_halt();
99}
100
101/*
102 * For spinlocks, etc:
103 */
104static inline unsigned long __raw_local_irq_save(void)
105{
106 unsigned long flags = __raw_local_save_flags();
107
108 raw_local_irq_disable();
109
110 return flags;
111}
112#else
113
114#define ENABLE_INTERRUPTS(x) sti
115#define DISABLE_INTERRUPTS(x) cli
116
117#ifdef CONFIG_X86_64
118#define INTERRUPT_RETURN iretq
119#define ENABLE_INTERRUPTS_SYSCALL_RET \
120 movq %gs:pda_oldrsp, %rsp; \
121 swapgs; \
122 sysretq;
123#else
124#define INTERRUPT_RETURN iret
125#define ENABLE_INTERRUPTS_SYSCALL_RET sti; sysexit
126#define GET_CR0_INTO_EAX movl %cr0, %eax
127#endif
128
129
130#endif /* __ASSEMBLY__ */
131#endif /* CONFIG_PARAVIRT */
132
133#ifndef __ASSEMBLY__
134#define raw_local_save_flags(flags) \
135 do { (flags) = __raw_local_save_flags(); } while (0)
136
137#define raw_local_irq_save(flags) \
138 do { (flags) = __raw_local_irq_save(); } while (0)
139
140static inline int raw_irqs_disabled_flags(unsigned long flags)
141{
142 return !(flags & X86_EFLAGS_IF);
143}
144
145static inline int raw_irqs_disabled(void)
146{
147 unsigned long flags = __raw_local_save_flags();
148
149 return raw_irqs_disabled_flags(flags);
150}
151
152/*
153 * makes the traced hardirq state match with the machine state
154 *
155 * should be a rarely used function, only in places where it's
156 * otherwise impossible to know the irq state, like in traps.
157 */
158static inline void trace_hardirqs_fixup_flags(unsigned long flags)
159{
160 if (raw_irqs_disabled_flags(flags))
161 trace_hardirqs_off();
162 else
163 trace_hardirqs_on();
164}
165
166static inline void trace_hardirqs_fixup(void)
167{
168 unsigned long flags = __raw_local_save_flags();
169
170 trace_hardirqs_fixup_flags(flags);
171}
172
3#else 173#else
4# include "irqflags_64.h" 174
175#ifdef CONFIG_X86_64
176/*
177 * Currently paravirt can't handle swapgs nicely when we
178 * don't have a stack we can rely on (such as a user space
179 * stack). So we either find a way around these or just fault
180 * and emulate if a guest tries to call swapgs directly.
181 *
182 * Either way, this is a good way to document that we don't
183 * have a reliable stack. x86_64 only.
184 */
185#define SWAPGS_UNSAFE_STACK swapgs
186#define ARCH_TRACE_IRQS_ON call trace_hardirqs_on_thunk
187#define ARCH_TRACE_IRQS_OFF call trace_hardirqs_off_thunk
188#define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
189#define ARCH_LOCKDEP_SYS_EXIT_IRQ \
190 TRACE_IRQS_ON; \
191 sti; \
192 SAVE_REST; \
193 LOCKDEP_SYS_EXIT; \
194 RESTORE_REST; \
195 cli; \
196 TRACE_IRQS_OFF;
197
198#else
199#define ARCH_TRACE_IRQS_ON \
200 pushl %eax; \
201 pushl %ecx; \
202 pushl %edx; \
203 call trace_hardirqs_on; \
204 popl %edx; \
205 popl %ecx; \
206 popl %eax;
207
208#define ARCH_TRACE_IRQS_OFF \
209 pushl %eax; \
210 pushl %ecx; \
211 pushl %edx; \
212 call trace_hardirqs_off; \
213 popl %edx; \
214 popl %ecx; \
215 popl %eax;
216
217#define ARCH_LOCKDEP_SYS_EXIT \
218 pushl %eax; \
219 pushl %ecx; \
220 pushl %edx; \
221 call lockdep_sys_exit; \
222 popl %edx; \
223 popl %ecx; \
224 popl %eax;
225
226#define ARCH_LOCKDEP_SYS_EXIT_IRQ
227#endif
228
229#ifdef CONFIG_TRACE_IRQFLAGS
230# define TRACE_IRQS_ON ARCH_TRACE_IRQS_ON
231# define TRACE_IRQS_OFF ARCH_TRACE_IRQS_OFF
232#else
233# define TRACE_IRQS_ON
234# define TRACE_IRQS_OFF
235#endif
236#ifdef CONFIG_DEBUG_LOCK_ALLOC
237# define LOCKDEP_SYS_EXIT ARCH_LOCKDEP_SYS_EXIT
238# define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ
239# else
240# define LOCKDEP_SYS_EXIT
241# define LOCKDEP_SYS_EXIT_IRQ
242# endif
243
244#endif /* __ASSEMBLY__ */
5#endif 245#endif
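Editor's note: the helpers in the unified irqflags.h above ultimately read EFLAGS with a pushf/pop pair and test the IF bit to decide whether interrupts are off. The following is a minimal user-space sketch of those two steps, not kernel code: the X86_EFLAGS_IF value matches <asm/processor-flags.h>, and the asm output constraint is tightened to "=r" where the kernel's native_save_fl() historically used "=g".

#include <stdio.h>

#define X86_EFLAGS_IF 0x00000200	/* Interrupt Flag, bit 9 */

static unsigned long save_fl(void)
{
	unsigned long flags;

	/* same pushf/pop idiom as native_save_fl(), x86-64 user space */
	asm volatile("pushfq ; popq %0" : "=r" (flags) : /* no input */);
	return flags;
}

static int irqs_disabled_flags(unsigned long flags)
{
	return !(flags & X86_EFLAGS_IF);
}

int main(void)
{
	unsigned long flags = save_fl();

	/* user space always runs with IF set, so this prints 0 */
	printf("flags=%#lx irqs_disabled=%d\n", flags, irqs_disabled_flags(flags));
	return 0;
}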
diff --git a/include/asm-x86/irqflags_32.h b/include/asm-x86/irqflags_32.h
deleted file mode 100644
index 4c7720089cb5..000000000000
--- a/include/asm-x86/irqflags_32.h
+++ /dev/null
@@ -1,197 +0,0 @@
1/*
2 * include/asm-i386/irqflags.h
3 *
4 * IRQ flags handling
5 *
6 * This file gets included from lowlevel asm headers too, to provide
7 * wrapped versions of the local_irq_*() APIs, based on the
8 * raw_local_irq_*() functions from the lowlevel headers.
9 */
10#ifndef _ASM_IRQFLAGS_H
11#define _ASM_IRQFLAGS_H
12#include <asm/processor-flags.h>
13
14#ifndef __ASSEMBLY__
15static inline unsigned long native_save_fl(void)
16{
17 unsigned long f;
18 asm volatile("pushfl ; popl %0":"=g" (f): /* no input */);
19 return f;
20}
21
22static inline void native_restore_fl(unsigned long f)
23{
24 asm volatile("pushl %0 ; popfl": /* no output */
25 :"g" (f)
26 :"memory", "cc");
27}
28
29static inline void native_irq_disable(void)
30{
31 asm volatile("cli": : :"memory");
32}
33
34static inline void native_irq_enable(void)
35{
36 asm volatile("sti": : :"memory");
37}
38
39static inline void native_safe_halt(void)
40{
41 asm volatile("sti; hlt": : :"memory");
42}
43
44static inline void native_halt(void)
45{
46 asm volatile("hlt": : :"memory");
47}
48#endif /* __ASSEMBLY__ */
49
50#ifdef CONFIG_PARAVIRT
51#include <asm/paravirt.h>
52#else
53#ifndef __ASSEMBLY__
54
55static inline unsigned long __raw_local_save_flags(void)
56{
57 return native_save_fl();
58}
59
60static inline void raw_local_irq_restore(unsigned long flags)
61{
62 native_restore_fl(flags);
63}
64
65static inline void raw_local_irq_disable(void)
66{
67 native_irq_disable();
68}
69
70static inline void raw_local_irq_enable(void)
71{
72 native_irq_enable();
73}
74
75/*
76 * Used in the idle loop; sti takes one instruction cycle
77 * to complete:
78 */
79static inline void raw_safe_halt(void)
80{
81 native_safe_halt();
82}
83
84/*
85 * Used when interrupts are already enabled or to
86 * shutdown the processor:
87 */
88static inline void halt(void)
89{
90 native_halt();
91}
92
93/*
94 * For spinlocks, etc:
95 */
96static inline unsigned long __raw_local_irq_save(void)
97{
98 unsigned long flags = __raw_local_save_flags();
99
100 raw_local_irq_disable();
101
102 return flags;
103}
104
105#else
106#define DISABLE_INTERRUPTS(clobbers) cli
107#define ENABLE_INTERRUPTS(clobbers) sti
108#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
109#define INTERRUPT_RETURN iret
110#define GET_CR0_INTO_EAX movl %cr0, %eax
111#endif /* __ASSEMBLY__ */
112#endif /* CONFIG_PARAVIRT */
113
114#ifndef __ASSEMBLY__
115#define raw_local_save_flags(flags) \
116 do { (flags) = __raw_local_save_flags(); } while (0)
117
118#define raw_local_irq_save(flags) \
119 do { (flags) = __raw_local_irq_save(); } while (0)
120
121static inline int raw_irqs_disabled_flags(unsigned long flags)
122{
123 return !(flags & X86_EFLAGS_IF);
124}
125
126static inline int raw_irqs_disabled(void)
127{
128 unsigned long flags = __raw_local_save_flags();
129
130 return raw_irqs_disabled_flags(flags);
131}
132
133/*
134 * makes the traced hardirq state match with the machine state
135 *
136 * should be a rarely used function, only in places where its
137 * otherwise impossible to know the irq state, like in traps.
138 */
139static inline void trace_hardirqs_fixup_flags(unsigned long flags)
140{
141 if (raw_irqs_disabled_flags(flags))
142 trace_hardirqs_off();
143 else
144 trace_hardirqs_on();
145}
146
147static inline void trace_hardirqs_fixup(void)
148{
149 unsigned long flags = __raw_local_save_flags();
150
151 trace_hardirqs_fixup_flags(flags);
152}
153#endif /* __ASSEMBLY__ */
154
155/*
156 * Do the CPU's IRQ-state tracing from assembly code. We call a
157 * C function, so save all the C-clobbered registers:
158 */
159#ifdef CONFIG_TRACE_IRQFLAGS
160
161# define TRACE_IRQS_ON \
162 pushl %eax; \
163 pushl %ecx; \
164 pushl %edx; \
165 call trace_hardirqs_on; \
166 popl %edx; \
167 popl %ecx; \
168 popl %eax;
169
170# define TRACE_IRQS_OFF \
171 pushl %eax; \
172 pushl %ecx; \
173 pushl %edx; \
174 call trace_hardirqs_off; \
175 popl %edx; \
176 popl %ecx; \
177 popl %eax;
178
179#else
180# define TRACE_IRQS_ON
181# define TRACE_IRQS_OFF
182#endif
183
184#ifdef CONFIG_DEBUG_LOCK_ALLOC
185# define LOCKDEP_SYS_EXIT \
186 pushl %eax; \
187 pushl %ecx; \
188 pushl %edx; \
189 call lockdep_sys_exit; \
190 popl %edx; \
191 popl %ecx; \
192 popl %eax;
193#else
194# define LOCKDEP_SYS_EXIT
195#endif
196
197#endif
diff --git a/include/asm-x86/irqflags_64.h b/include/asm-x86/irqflags_64.h
deleted file mode 100644
index bb9163bb29d1..000000000000
--- a/include/asm-x86/irqflags_64.h
+++ /dev/null
@@ -1,176 +0,0 @@
1/*
2 * include/asm-x86_64/irqflags.h
3 *
4 * IRQ flags handling
5 *
6 * This file gets included from lowlevel asm headers too, to provide
7 * wrapped versions of the local_irq_*() APIs, based on the
8 * raw_local_irq_*() functions from the lowlevel headers.
9 */
10#ifndef _ASM_IRQFLAGS_H
11#define _ASM_IRQFLAGS_H
12#include <asm/processor-flags.h>
13
14#ifndef __ASSEMBLY__
15/*
16 * Interrupt control:
17 */
18
19static inline unsigned long __raw_local_save_flags(void)
20{
21 unsigned long flags;
22
23 __asm__ __volatile__(
24 "# __raw_save_flags\n\t"
25 "pushfq ; popq %q0"
26 : "=g" (flags)
27 : /* no input */
28 : "memory"
29 );
30
31 return flags;
32}
33
34#define raw_local_save_flags(flags) \
35 do { (flags) = __raw_local_save_flags(); } while (0)
36
37static inline void raw_local_irq_restore(unsigned long flags)
38{
39 __asm__ __volatile__(
40 "pushq %0 ; popfq"
41 : /* no output */
42 :"g" (flags)
43 :"memory", "cc"
44 );
45}
46
47#ifdef CONFIG_X86_VSMP
48
49/*
50 * Interrupt control for the VSMP architecture:
51 */
52
53static inline void raw_local_irq_disable(void)
54{
55 unsigned long flags = __raw_local_save_flags();
56
57 raw_local_irq_restore((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC);
58}
59
60static inline void raw_local_irq_enable(void)
61{
62 unsigned long flags = __raw_local_save_flags();
63
64 raw_local_irq_restore((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
65}
66
67static inline int raw_irqs_disabled_flags(unsigned long flags)
68{
69 return !(flags & X86_EFLAGS_IF) || (flags & X86_EFLAGS_AC);
70}
71
72#else /* CONFIG_X86_VSMP */
73
74static inline void raw_local_irq_disable(void)
75{
76 __asm__ __volatile__("cli" : : : "memory");
77}
78
79static inline void raw_local_irq_enable(void)
80{
81 __asm__ __volatile__("sti" : : : "memory");
82}
83
84static inline int raw_irqs_disabled_flags(unsigned long flags)
85{
86 return !(flags & X86_EFLAGS_IF);
87}
88
89#endif
90
91/*
92 * For spinlocks, etc.:
93 */
94
95static inline unsigned long __raw_local_irq_save(void)
96{
97 unsigned long flags = __raw_local_save_flags();
98
99 raw_local_irq_disable();
100
101 return flags;
102}
103
104#define raw_local_irq_save(flags) \
105 do { (flags) = __raw_local_irq_save(); } while (0)
106
107static inline int raw_irqs_disabled(void)
108{
109 unsigned long flags = __raw_local_save_flags();
110
111 return raw_irqs_disabled_flags(flags);
112}
113
114/*
115 * makes the traced hardirq state match with the machine state
116 *
117 * should be a rarely used function, only in places where its
118 * otherwise impossible to know the irq state, like in traps.
119 */
120static inline void trace_hardirqs_fixup_flags(unsigned long flags)
121{
122 if (raw_irqs_disabled_flags(flags))
123 trace_hardirqs_off();
124 else
125 trace_hardirqs_on();
126}
127
128static inline void trace_hardirqs_fixup(void)
129{
130 unsigned long flags = __raw_local_save_flags();
131
132 trace_hardirqs_fixup_flags(flags);
133}
134/*
135 * Used in the idle loop; sti takes one instruction cycle
136 * to complete:
137 */
138static inline void raw_safe_halt(void)
139{
140 __asm__ __volatile__("sti; hlt" : : : "memory");
141}
142
143/*
144 * Used when interrupts are already enabled or to
145 * shutdown the processor:
146 */
147static inline void halt(void)
148{
149 __asm__ __volatile__("hlt": : :"memory");
150}
151
152#else /* __ASSEMBLY__: */
153# ifdef CONFIG_TRACE_IRQFLAGS
154# define TRACE_IRQS_ON call trace_hardirqs_on_thunk
155# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk
156# else
157# define TRACE_IRQS_ON
158# define TRACE_IRQS_OFF
159# endif
160# ifdef CONFIG_DEBUG_LOCK_ALLOC
161# define LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
162# define LOCKDEP_SYS_EXIT_IRQ \
163 TRACE_IRQS_ON; \
164 sti; \
165 SAVE_REST; \
166 LOCKDEP_SYS_EXIT; \
167 RESTORE_REST; \
168 cli; \
169 TRACE_IRQS_OFF;
170# else
171# define LOCKDEP_SYS_EXIT
172# define LOCKDEP_SYS_EXIT_IRQ
173# endif
174#endif
175
176#endif
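Editor's note: the CONFIG_X86_VSMP variant deleted above treats interrupts as disabled when IF is clear *or* AC is set, so its enable/disable paths rewrite both bits at once. A small sketch of just that flag arithmetic, runnable anywhere; the constants match <asm/processor-flags.h> and the logic mirrors the deleted functions:

#include <assert.h>

#define X86_EFLAGS_IF 0x00000200
#define X86_EFLAGS_AC 0x00040000

static unsigned long vsmp_disable(unsigned long flags)
{
	return (flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC;
}

static unsigned long vsmp_enable(unsigned long flags)
{
	return (flags | X86_EFLAGS_IF) & ~X86_EFLAGS_AC;
}

static int vsmp_irqs_disabled(unsigned long flags)
{
	return !(flags & X86_EFLAGS_IF) || (flags & X86_EFLAGS_AC);
}

int main(void)
{
	unsigned long flags = X86_EFLAGS_IF;	/* interrupts on, AC clear */

	assert(!vsmp_irqs_disabled(flags));
	assert(vsmp_irqs_disabled(vsmp_disable(flags)));
	assert(!vsmp_irqs_disabled(vsmp_enable(vsmp_disable(flags))));
	return 0;
}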
diff --git a/include/asm-x86/k8.h b/include/asm-x86/k8.h
index 699dd6961eda..452e2b696ff4 100644
--- a/include/asm-x86/k8.h
+++ b/include/asm-x86/k8.h
@@ -10,5 +10,6 @@ extern struct pci_dev **k8_northbridges;
10extern int num_k8_northbridges; 10extern int num_k8_northbridges;
11extern int cache_k8_northbridges(void); 11extern int cache_k8_northbridges(void);
12extern void k8_flush_garts(void); 12extern void k8_flush_garts(void);
13extern int k8_scan_nodes(unsigned long start, unsigned long end);
13 14
14#endif 15#endif
diff --git a/include/asm-x86/kdebug.h b/include/asm-x86/kdebug.h
index e2f9b62e535e..dd442a1632c0 100644
--- a/include/asm-x86/kdebug.h
+++ b/include/asm-x86/kdebug.h
@@ -22,12 +22,17 @@ enum die_val {
22 DIE_PAGE_FAULT, 22 DIE_PAGE_FAULT,
23}; 23};
24 24
25extern void printk_address(unsigned long address); 25extern void printk_address(unsigned long address, int reliable);
26extern void die(const char *,struct pt_regs *,long); 26extern void die(const char *,struct pt_regs *,long);
27extern void __die(const char *,struct pt_regs *,long); 27extern int __must_check __die(const char *, struct pt_regs *, long);
28extern void show_registers(struct pt_regs *regs); 28extern void show_registers(struct pt_regs *regs);
29extern void __show_registers(struct pt_regs *, int all);
30extern void show_trace(struct task_struct *t, struct pt_regs *regs,
31 unsigned long *sp, unsigned long bp);
32extern void __show_regs(struct pt_regs *regs);
33extern void show_regs(struct pt_regs *regs);
29extern void dump_pagetable(unsigned long); 34extern void dump_pagetable(unsigned long);
30extern unsigned long oops_begin(void); 35extern unsigned long oops_begin(void);
31extern void oops_end(unsigned long); 36extern void oops_end(unsigned long, struct pt_regs *, int signr);
32 37
33#endif 38#endif
diff --git a/include/asm-x86/kexec.h b/include/asm-x86/kexec.h
index 718ddbfb9516..c90d3c77afc2 100644
--- a/include/asm-x86/kexec.h
+++ b/include/asm-x86/kexec.h
@@ -1,5 +1,170 @@
1#ifndef _KEXEC_H
2#define _KEXEC_H
3
1#ifdef CONFIG_X86_32 4#ifdef CONFIG_X86_32
2# include "kexec_32.h" 5# define PA_CONTROL_PAGE 0
6# define VA_CONTROL_PAGE 1
7# define PA_PGD 2
8# define VA_PGD 3
9# define PA_PTE_0 4
10# define VA_PTE_0 5
11# define PA_PTE_1 6
12# define VA_PTE_1 7
13# ifdef CONFIG_X86_PAE
14# define PA_PMD_0 8
15# define VA_PMD_0 9
16# define PA_PMD_1 10
17# define VA_PMD_1 11
18# define PAGES_NR 12
19# else
20# define PAGES_NR 8
21# endif
3#else 22#else
4# include "kexec_64.h" 23# define PA_CONTROL_PAGE 0
24# define VA_CONTROL_PAGE 1
25# define PA_PGD 2
26# define VA_PGD 3
27# define PA_PUD_0 4
28# define VA_PUD_0 5
29# define PA_PMD_0 6
30# define VA_PMD_0 7
31# define PA_PTE_0 8
32# define VA_PTE_0 9
33# define PA_PUD_1 10
34# define VA_PUD_1 11
35# define PA_PMD_1 12
36# define VA_PMD_1 13
37# define PA_PTE_1 14
38# define VA_PTE_1 15
39# define PA_TABLE_PAGE 16
40# define PAGES_NR 17
5#endif 41#endif
42
43#ifndef __ASSEMBLY__
44
45#include <linux/string.h>
46
47#include <asm/page.h>
48#include <asm/ptrace.h>
49
50/*
51 * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return.
52 * I.e. Maximum page that is mapped directly into kernel memory,
53 * and kmap is not required.
54 *
55 * So far x86_64 is limited to 40 physical address bits.
56 */
57#ifdef CONFIG_X86_32
58/* Maximum physical address we can use pages from */
59# define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
60/* Maximum address we can reach in physical address mode */
61# define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
62/* Maximum address we can use for the control code buffer */
63# define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE
64
65# define KEXEC_CONTROL_CODE_SIZE 4096
66
67/* The native architecture */
68# define KEXEC_ARCH KEXEC_ARCH_386
69
70/* We can also handle crash dumps from 64 bit kernel. */
71# define vmcore_elf_check_arch_cross(x) ((x)->e_machine == EM_X86_64)
72#else
73/* Maximum physical address we can use pages from */
74# define KEXEC_SOURCE_MEMORY_LIMIT (0xFFFFFFFFFFUL)
75/* Maximum address we can reach in physical address mode */
76# define KEXEC_DESTINATION_MEMORY_LIMIT (0xFFFFFFFFFFUL)
77/* Maximum address we can use for the control pages */
78# define KEXEC_CONTROL_MEMORY_LIMIT (0xFFFFFFFFFFUL)
79
80/* Allocate one page for the pdp and the second for the code */
81# define KEXEC_CONTROL_CODE_SIZE (4096UL + 4096UL)
82
83/* The native architecture */
84# define KEXEC_ARCH KEXEC_ARCH_X86_64
85#endif
86
87/*
88 * CPU does not save ss and sp on stack if execution is already
89 * running in kernel mode at the time of NMI occurrence. This code
90 * fixes it.
91 */
92static inline void crash_fixup_ss_esp(struct pt_regs *newregs,
93 struct pt_regs *oldregs)
94{
95#ifdef CONFIG_X86_32
96 newregs->sp = (unsigned long)&(oldregs->sp);
97 __asm__ __volatile__(
98 "xorl %%eax, %%eax\n\t"
99 "movw %%ss, %%ax\n\t"
100 :"=a"(newregs->ss));
101#endif
102}
103
104/*
105 * This function is responsible for capturing register states if coming
106 * via panic otherwise just fix up the ss and sp if coming via kernel
107 * mode exception.
108 */
109static inline void crash_setup_regs(struct pt_regs *newregs,
110 struct pt_regs *oldregs)
111{
112 if (oldregs) {
113 memcpy(newregs, oldregs, sizeof(*newregs));
114 crash_fixup_ss_esp(newregs, oldregs);
115 } else {
116#ifdef CONFIG_X86_32
117 __asm__ __volatile__("movl %%ebx,%0" : "=m"(newregs->bx));
118 __asm__ __volatile__("movl %%ecx,%0" : "=m"(newregs->cx));
119 __asm__ __volatile__("movl %%edx,%0" : "=m"(newregs->dx));
120 __asm__ __volatile__("movl %%esi,%0" : "=m"(newregs->si));
121 __asm__ __volatile__("movl %%edi,%0" : "=m"(newregs->di));
122 __asm__ __volatile__("movl %%ebp,%0" : "=m"(newregs->bp));
123 __asm__ __volatile__("movl %%eax,%0" : "=m"(newregs->ax));
124 __asm__ __volatile__("movl %%esp,%0" : "=m"(newregs->sp));
125 __asm__ __volatile__("movl %%ss, %%eax;" :"=a"(newregs->ss));
126 __asm__ __volatile__("movl %%cs, %%eax;" :"=a"(newregs->cs));
127 __asm__ __volatile__("movl %%ds, %%eax;" :"=a"(newregs->ds));
128 __asm__ __volatile__("movl %%es, %%eax;" :"=a"(newregs->es));
129 __asm__ __volatile__("pushfl; popl %0" :"=m"(newregs->flags));
130#else
131 __asm__ __volatile__("movq %%rbx,%0" : "=m"(newregs->bx));
132 __asm__ __volatile__("movq %%rcx,%0" : "=m"(newregs->cx));
133 __asm__ __volatile__("movq %%rdx,%0" : "=m"(newregs->dx));
134 __asm__ __volatile__("movq %%rsi,%0" : "=m"(newregs->si));
135 __asm__ __volatile__("movq %%rdi,%0" : "=m"(newregs->di));
136 __asm__ __volatile__("movq %%rbp,%0" : "=m"(newregs->bp));
137 __asm__ __volatile__("movq %%rax,%0" : "=m"(newregs->ax));
138 __asm__ __volatile__("movq %%rsp,%0" : "=m"(newregs->sp));
139 __asm__ __volatile__("movq %%r8,%0" : "=m"(newregs->r8));
140 __asm__ __volatile__("movq %%r9,%0" : "=m"(newregs->r9));
141 __asm__ __volatile__("movq %%r10,%0" : "=m"(newregs->r10));
142 __asm__ __volatile__("movq %%r11,%0" : "=m"(newregs->r11));
143 __asm__ __volatile__("movq %%r12,%0" : "=m"(newregs->r12));
144 __asm__ __volatile__("movq %%r13,%0" : "=m"(newregs->r13));
145 __asm__ __volatile__("movq %%r14,%0" : "=m"(newregs->r14));
146 __asm__ __volatile__("movq %%r15,%0" : "=m"(newregs->r15));
147 __asm__ __volatile__("movl %%ss, %%eax;" :"=a"(newregs->ss));
148 __asm__ __volatile__("movl %%cs, %%eax;" :"=a"(newregs->cs));
149 __asm__ __volatile__("pushfq; popq %0" :"=m"(newregs->flags));
150#endif
151 newregs->ip = (unsigned long)current_text_addr();
152 }
153}
154
155#ifdef CONFIG_X86_32
156asmlinkage NORET_TYPE void
157relocate_kernel(unsigned long indirection_page,
158 unsigned long control_page,
159 unsigned long start_address,
160 unsigned int has_pae) ATTRIB_NORET;
161#else
162NORET_TYPE void
163relocate_kernel(unsigned long indirection_page,
164 unsigned long page_list,
165 unsigned long start_address) ATTRIB_NORET;
166#endif
167
168#endif /* __ASSEMBLY__ */
169
170#endif /* _KEXEC_H */
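Editor's note: crash_setup_regs() above snapshots the live register file with one inline-asm statement per register, each storing straight into the destination structure. The sketch below reproduces the technique from user space for a few registers; struct mini_regs is a made-up stand-in for pt_regs, not the kernel layout, and flags is read into a register output to sidestep the rsp-relative addressing hazard of pushfq/popq with a memory operand.

#include <stdio.h>

struct mini_regs {
	unsigned long bx, sp, bp, flags;
};

static void snapshot_regs(struct mini_regs *r)
{
	asm volatile("movq %%rbx,%0" : "=m" (r->bx));
	asm volatile("movq %%rsp,%0" : "=m" (r->sp));
	asm volatile("movq %%rbp,%0" : "=m" (r->bp));
	asm volatile("pushfq ; popq %0" : "=r" (r->flags));
}

int main(void)
{
	struct mini_regs r;

	snapshot_regs(&r);
	printf("rbx=%#lx rsp=%#lx rbp=%#lx flags=%#lx\n",
	       r.bx, r.sp, r.bp, r.flags);
	return 0;
}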
diff --git a/include/asm-x86/kexec_32.h b/include/asm-x86/kexec_32.h
deleted file mode 100644
index 4b9dc9e6b701..000000000000
--- a/include/asm-x86/kexec_32.h
+++ /dev/null
@@ -1,99 +0,0 @@
1#ifndef _I386_KEXEC_H
2#define _I386_KEXEC_H
3
4#define PA_CONTROL_PAGE 0
5#define VA_CONTROL_PAGE 1
6#define PA_PGD 2
7#define VA_PGD 3
8#define PA_PTE_0 4
9#define VA_PTE_0 5
10#define PA_PTE_1 6
11#define VA_PTE_1 7
12#ifdef CONFIG_X86_PAE
13#define PA_PMD_0 8
14#define VA_PMD_0 9
15#define PA_PMD_1 10
16#define VA_PMD_1 11
17#define PAGES_NR 12
18#else
19#define PAGES_NR 8
20#endif
21
22#ifndef __ASSEMBLY__
23
24#include <asm/ptrace.h>
25#include <asm/string.h>
26
27/*
28 * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return.
29 * I.e. Maximum page that is mapped directly into kernel memory,
30 * and kmap is not required.
31 */
32
33/* Maximum physical address we can use pages from */
34#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
35/* Maximum address we can reach in physical address mode */
36#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
37/* Maximum address we can use for the control code buffer */
38#define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE
39
40#define KEXEC_CONTROL_CODE_SIZE 4096
41
42/* The native architecture */
43#define KEXEC_ARCH KEXEC_ARCH_386
44
45/* We can also handle crash dumps from 64 bit kernel. */
46#define vmcore_elf_check_arch_cross(x) ((x)->e_machine == EM_X86_64)
47
48/* CPU does not save ss and esp on stack if execution is already
49 * running in kernel mode at the time of NMI occurrence. This code
50 * fixes it.
51 */
52static inline void crash_fixup_ss_esp(struct pt_regs *newregs,
53 struct pt_regs *oldregs)
54{
55 memcpy(newregs, oldregs, sizeof(*newregs));
56 newregs->esp = (unsigned long)&(oldregs->esp);
57 __asm__ __volatile__(
58 "xorl %%eax, %%eax\n\t"
59 "movw %%ss, %%ax\n\t"
60 :"=a"(newregs->xss));
61}
62
63/*
64 * This function is responsible for capturing register states if coming
65 * via panic otherwise just fix up the ss and esp if coming via kernel
66 * mode exception.
67 */
68static inline void crash_setup_regs(struct pt_regs *newregs,
69 struct pt_regs *oldregs)
70{
71 if (oldregs)
72 crash_fixup_ss_esp(newregs, oldregs);
73 else {
74 __asm__ __volatile__("movl %%ebx,%0" : "=m"(newregs->ebx));
75 __asm__ __volatile__("movl %%ecx,%0" : "=m"(newregs->ecx));
76 __asm__ __volatile__("movl %%edx,%0" : "=m"(newregs->edx));
77 __asm__ __volatile__("movl %%esi,%0" : "=m"(newregs->esi));
78 __asm__ __volatile__("movl %%edi,%0" : "=m"(newregs->edi));
79 __asm__ __volatile__("movl %%ebp,%0" : "=m"(newregs->ebp));
80 __asm__ __volatile__("movl %%eax,%0" : "=m"(newregs->eax));
81 __asm__ __volatile__("movl %%esp,%0" : "=m"(newregs->esp));
82 __asm__ __volatile__("movw %%ss, %%ax;" :"=a"(newregs->xss));
83 __asm__ __volatile__("movw %%cs, %%ax;" :"=a"(newregs->xcs));
84 __asm__ __volatile__("movw %%ds, %%ax;" :"=a"(newregs->xds));
85 __asm__ __volatile__("movw %%es, %%ax;" :"=a"(newregs->xes));
86 __asm__ __volatile__("pushfl; popl %0" :"=m"(newregs->eflags));
87
88 newregs->eip = (unsigned long)current_text_addr();
89 }
90}
91asmlinkage NORET_TYPE void
92relocate_kernel(unsigned long indirection_page,
93 unsigned long control_page,
94 unsigned long start_address,
95 unsigned int has_pae) ATTRIB_NORET;
96
97#endif /* __ASSEMBLY__ */
98
99#endif /* _I386_KEXEC_H */
diff --git a/include/asm-x86/kexec_64.h b/include/asm-x86/kexec_64.h
deleted file mode 100644
index 738e581b67f8..000000000000
--- a/include/asm-x86/kexec_64.h
+++ /dev/null
@@ -1,94 +0,0 @@
1#ifndef _X86_64_KEXEC_H
2#define _X86_64_KEXEC_H
3
4#define PA_CONTROL_PAGE 0
5#define VA_CONTROL_PAGE 1
6#define PA_PGD 2
7#define VA_PGD 3
8#define PA_PUD_0 4
9#define VA_PUD_0 5
10#define PA_PMD_0 6
11#define VA_PMD_0 7
12#define PA_PTE_0 8
13#define VA_PTE_0 9
14#define PA_PUD_1 10
15#define VA_PUD_1 11
16#define PA_PMD_1 12
17#define VA_PMD_1 13
18#define PA_PTE_1 14
19#define VA_PTE_1 15
20#define PA_TABLE_PAGE 16
21#define PAGES_NR 17
22
23#ifndef __ASSEMBLY__
24
25#include <linux/string.h>
26
27#include <asm/page.h>
28#include <asm/ptrace.h>
29
30/*
31 * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return.
32 * I.e. Maximum page that is mapped directly into kernel memory,
33 * and kmap is not required.
34 *
35 * So far x86_64 is limited to 40 physical address bits.
36 */
37
38/* Maximum physical address we can use pages from */
39#define KEXEC_SOURCE_MEMORY_LIMIT (0xFFFFFFFFFFUL)
40/* Maximum address we can reach in physical address mode */
41#define KEXEC_DESTINATION_MEMORY_LIMIT (0xFFFFFFFFFFUL)
42/* Maximum address we can use for the control pages */
43#define KEXEC_CONTROL_MEMORY_LIMIT (0xFFFFFFFFFFUL)
44
45/* Allocate one page for the pdp and the second for the code */
46#define KEXEC_CONTROL_CODE_SIZE (4096UL + 4096UL)
47
48/* The native architecture */
49#define KEXEC_ARCH KEXEC_ARCH_X86_64
50
51/*
52 * Saving the registers of the cpu on which panic occured in
53 * crash_kexec to save a valid sp. The registers of other cpus
54 * will be saved in machine_crash_shutdown while shooting down them.
55 */
56
57static inline void crash_setup_regs(struct pt_regs *newregs,
58 struct pt_regs *oldregs)
59{
60 if (oldregs)
61 memcpy(newregs, oldregs, sizeof(*newregs));
62 else {
63 __asm__ __volatile__("movq %%rbx,%0" : "=m"(newregs->rbx));
64 __asm__ __volatile__("movq %%rcx,%0" : "=m"(newregs->rcx));
65 __asm__ __volatile__("movq %%rdx,%0" : "=m"(newregs->rdx));
66 __asm__ __volatile__("movq %%rsi,%0" : "=m"(newregs->rsi));
67 __asm__ __volatile__("movq %%rdi,%0" : "=m"(newregs->rdi));
68 __asm__ __volatile__("movq %%rbp,%0" : "=m"(newregs->rbp));
69 __asm__ __volatile__("movq %%rax,%0" : "=m"(newregs->rax));
70 __asm__ __volatile__("movq %%rsp,%0" : "=m"(newregs->rsp));
71 __asm__ __volatile__("movq %%r8,%0" : "=m"(newregs->r8));
72 __asm__ __volatile__("movq %%r9,%0" : "=m"(newregs->r9));
73 __asm__ __volatile__("movq %%r10,%0" : "=m"(newregs->r10));
74 __asm__ __volatile__("movq %%r11,%0" : "=m"(newregs->r11));
75 __asm__ __volatile__("movq %%r12,%0" : "=m"(newregs->r12));
76 __asm__ __volatile__("movq %%r13,%0" : "=m"(newregs->r13));
77 __asm__ __volatile__("movq %%r14,%0" : "=m"(newregs->r14));
78 __asm__ __volatile__("movq %%r15,%0" : "=m"(newregs->r15));
79 __asm__ __volatile__("movl %%ss, %%eax;" :"=a"(newregs->ss));
80 __asm__ __volatile__("movl %%cs, %%eax;" :"=a"(newregs->cs));
81 __asm__ __volatile__("pushfq; popq %0" :"=m"(newregs->eflags));
82
83 newregs->rip = (unsigned long)current_text_addr();
84 }
85}
86
87NORET_TYPE void
88relocate_kernel(unsigned long indirection_page,
89 unsigned long page_list,
90 unsigned long start_address) ATTRIB_NORET;
91
92#endif /* __ASSEMBLY__ */
93
94#endif /* _X86_64_KEXEC_H */
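Editor's note: the 0xFFFFFFFFFF limits in the 64-bit kexec header correspond to the "40 physical address bits" remark in the comment. A quick arithmetic check (GCC/Clang builtin assumed):

#include <stdio.h>

int main(void)
{
	unsigned long long limit = 0xFFFFFFFFFFULL;	/* KEXEC_*_MEMORY_LIMIT */

	printf("%d address bits, %llu GiB reachable\n",
	       __builtin_popcountll(limit),		/* 40 set bits */
	       (limit + 1) >> 30);			/* 1024 GiB = 1 TiB */
	return 0;
}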
diff --git a/include/asm-x86/kprobes.h b/include/asm-x86/kprobes.h
index b7bbd25ba2a6..143476a3cb52 100644
--- a/include/asm-x86/kprobes.h
+++ b/include/asm-x86/kprobes.h
@@ -1,5 +1,98 @@
1#ifdef CONFIG_X86_32 1#ifndef _ASM_KPROBES_H
2# include "kprobes_32.h" 2#define _ASM_KPROBES_H
3#else 3/*
4# include "kprobes_64.h" 4 * Kernel Probes (KProbes)
5#endif 5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *
20 * Copyright (C) IBM Corporation, 2002, 2004
21 *
22 * See arch/x86/kernel/kprobes.c for x86 kprobes history.
23 */
24#include <linux/types.h>
25#include <linux/ptrace.h>
26#include <linux/percpu.h>
27
28#define __ARCH_WANT_KPROBES_INSN_SLOT
29
30struct pt_regs;
31struct kprobe;
32
33typedef u8 kprobe_opcode_t;
34#define BREAKPOINT_INSTRUCTION 0xcc
35#define RELATIVEJUMP_INSTRUCTION 0xe9
36#define MAX_INSN_SIZE 16
37#define MAX_STACK_SIZE 64
38#define MIN_STACK_SIZE(ADDR) (((MAX_STACK_SIZE) < \
39 (((unsigned long)current_thread_info()) + THREAD_SIZE \
40 - (unsigned long)(ADDR))) \
41 ? (MAX_STACK_SIZE) \
42 : (((unsigned long)current_thread_info()) + THREAD_SIZE \
43 - (unsigned long)(ADDR)))
44
45#define ARCH_SUPPORTS_KRETPROBES
46#define flush_insn_slot(p) do { } while (0)
47
48extern const int kretprobe_blacklist_size;
49
50void arch_remove_kprobe(struct kprobe *p);
51void kretprobe_trampoline(void);
52
53/* Architecture specific copy of original instruction*/
54struct arch_specific_insn {
55 /* copy of the original instruction */
56 kprobe_opcode_t *insn;
57 /*
58 * boostable = -1: This instruction type is not boostable.
59 * boostable = 0: This instruction type is boostable.
60 * boostable = 1: This instruction has been boosted: we have
61 * added a relative jump after the instruction copy in insn,
62 * so no single-step and fixup are needed (unless there's
63 * a post_handler or break_handler).
64 */
65 int boostable;
66};
67
68struct prev_kprobe {
69 struct kprobe *kp;
70 unsigned long status;
71 unsigned long old_flags;
72 unsigned long saved_flags;
73};
74
75/* per-cpu kprobe control block */
76struct kprobe_ctlblk {
77 unsigned long kprobe_status;
78 unsigned long kprobe_old_flags;
79 unsigned long kprobe_saved_flags;
80 unsigned long *jprobe_saved_sp;
81 struct pt_regs jprobe_saved_regs;
82 kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE];
83 struct prev_kprobe prev_kprobe;
84};
85
86/* trap3/1 are intr gates for kprobes. So, restore the status of IF,
87 * if necessary, before executing the original int3/1 (trap) handler.
88 */
89static inline void restore_interrupts(struct pt_regs *regs)
90{
91 if (regs->flags & X86_EFLAGS_IF)
92 local_irq_enable();
93}
94
95extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
96extern int kprobe_exceptions_notify(struct notifier_block *self,
97 unsigned long val, void *data);
98#endif /* _ASM_KPROBES_H */
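Editor's note: MIN_STACK_SIZE() in the unified kprobes.h bounds how many bytes of the current stack may be copied into jprobes_stack: at most MAX_STACK_SIZE, but never past the top of the current thread's stack region. A sketch of that arithmetic as a plain function; thread_base stands in for current_thread_info(), THREAD_SIZE uses a typical 32-bit value (an assumption), and the addresses are made up:

#include <assert.h>

#define THREAD_SIZE	8192UL	/* typical i386 value; assumption for the demo */
#define MAX_STACK_SIZE	64UL

static unsigned long min_stack_size(unsigned long thread_base, unsigned long addr)
{
	unsigned long room = thread_base + THREAD_SIZE - addr;

	return room < MAX_STACK_SIZE ? room : MAX_STACK_SIZE;
}

int main(void)
{
	/* plenty of room left: clamp to MAX_STACK_SIZE */
	assert(min_stack_size(0x10000, 0x10100) == 64);
	/* only 16 bytes left before the top of the stack region */
	assert(min_stack_size(0x10000, 0x10000 + THREAD_SIZE - 16) == 16);
	return 0;
}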
diff --git a/include/asm-x86/kprobes_32.h b/include/asm-x86/kprobes_32.h
deleted file mode 100644
index 9fe8f3bddfd5..000000000000
--- a/include/asm-x86/kprobes_32.h
+++ /dev/null
@@ -1,94 +0,0 @@
1#ifndef _ASM_KPROBES_H
2#define _ASM_KPROBES_H
3/*
4 * Kernel Probes (KProbes)
5 * include/asm-i386/kprobes.h
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
20 *
21 * Copyright (C) IBM Corporation, 2002, 2004
22 *
23 * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
24 * Probes initial implementation ( includes suggestions from
25 * Rusty Russell).
26 */
27#include <linux/types.h>
28#include <linux/ptrace.h>
29
30#define __ARCH_WANT_KPROBES_INSN_SLOT
31
32struct kprobe;
33struct pt_regs;
34
35typedef u8 kprobe_opcode_t;
36#define BREAKPOINT_INSTRUCTION 0xcc
37#define RELATIVEJUMP_INSTRUCTION 0xe9
38#define MAX_INSN_SIZE 16
39#define MAX_STACK_SIZE 64
40#define MIN_STACK_SIZE(ADDR) (((MAX_STACK_SIZE) < \
41 (((unsigned long)current_thread_info()) + THREAD_SIZE - (ADDR))) \
42 ? (MAX_STACK_SIZE) \
43 : (((unsigned long)current_thread_info()) + THREAD_SIZE - (ADDR)))
44
45#define ARCH_SUPPORTS_KRETPROBES
46#define flush_insn_slot(p) do { } while (0)
47
48extern const int kretprobe_blacklist_size;
49
50void arch_remove_kprobe(struct kprobe *p);
51void kretprobe_trampoline(void);
52
53/* Architecture specific copy of original instruction*/
54struct arch_specific_insn {
55 /* copy of the original instruction */
56 kprobe_opcode_t *insn;
57 /*
58 * If this flag is not 0, this kprobe can be boost when its
59 * post_handler and break_handler is not set.
60 */
61 int boostable;
62};
63
64struct prev_kprobe {
65 struct kprobe *kp;
66 unsigned long status;
67 unsigned long old_eflags;
68 unsigned long saved_eflags;
69};
70
71/* per-cpu kprobe control block */
72struct kprobe_ctlblk {
73 unsigned long kprobe_status;
74 unsigned long kprobe_old_eflags;
75 unsigned long kprobe_saved_eflags;
76 unsigned long *jprobe_saved_esp;
77 struct pt_regs jprobe_saved_regs;
78 kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE];
79 struct prev_kprobe prev_kprobe;
80};
81
82/* trap3/1 are intr gates for kprobes. So, restore the status of IF,
83 * if necessary, before executing the original int3/1 (trap) handler.
84 */
85static inline void restore_interrupts(struct pt_regs *regs)
86{
87 if (regs->eflags & IF_MASK)
88 local_irq_enable();
89}
90
91extern int kprobe_exceptions_notify(struct notifier_block *self,
92 unsigned long val, void *data);
93extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
94#endif /* _ASM_KPROBES_H */
diff --git a/include/asm-x86/kprobes_64.h b/include/asm-x86/kprobes_64.h
deleted file mode 100644
index 743d76218fc9..000000000000
--- a/include/asm-x86/kprobes_64.h
+++ /dev/null
@@ -1,90 +0,0 @@
1#ifndef _ASM_KPROBES_H
2#define _ASM_KPROBES_H
3/*
4 * Kernel Probes (KProbes)
5 * include/asm-x86_64/kprobes.h
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
20 *
21 * Copyright (C) IBM Corporation, 2002, 2004
22 *
23 * 2004-Oct Prasanna S Panchamukhi <prasanna@in.ibm.com> and Jim Keniston
24 * kenistoj@us.ibm.com adopted from i386.
25 */
26#include <linux/types.h>
27#include <linux/ptrace.h>
28#include <linux/percpu.h>
29
30#define __ARCH_WANT_KPROBES_INSN_SLOT
31
32struct pt_regs;
33struct kprobe;
34
35typedef u8 kprobe_opcode_t;
36#define BREAKPOINT_INSTRUCTION 0xcc
37#define MAX_INSN_SIZE 15
38#define MAX_STACK_SIZE 64
39#define MIN_STACK_SIZE(ADDR) (((MAX_STACK_SIZE) < \
40 (((unsigned long)current_thread_info()) + THREAD_SIZE - (ADDR))) \
41 ? (MAX_STACK_SIZE) \
42 : (((unsigned long)current_thread_info()) + THREAD_SIZE - (ADDR)))
43
44#define ARCH_SUPPORTS_KRETPROBES
45extern const int kretprobe_blacklist_size;
46
47void kretprobe_trampoline(void);
48extern void arch_remove_kprobe(struct kprobe *p);
49#define flush_insn_slot(p) do { } while (0)
50
51/* Architecture specific copy of original instruction*/
52struct arch_specific_insn {
53 /* copy of the original instruction */
54 kprobe_opcode_t *insn;
55};
56
57struct prev_kprobe {
58 struct kprobe *kp;
59 unsigned long status;
60 unsigned long old_rflags;
61 unsigned long saved_rflags;
62};
63
64/* per-cpu kprobe control block */
65struct kprobe_ctlblk {
66 unsigned long kprobe_status;
67 unsigned long kprobe_old_rflags;
68 unsigned long kprobe_saved_rflags;
69 unsigned long *jprobe_saved_rsp;
70 struct pt_regs jprobe_saved_regs;
71 kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE];
72 struct prev_kprobe prev_kprobe;
73};
74
75/* trap3/1 are intr gates for kprobes. So, restore the status of IF,
76 * if necessary, before executing the original int3/1 (trap) handler.
77 */
78static inline void restore_interrupts(struct pt_regs *regs)
79{
80 if (regs->eflags & IF_MASK)
81 local_irq_enable();
82}
83
84extern int post_kprobe_handler(struct pt_regs *regs);
85extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
86extern int kprobe_handler(struct pt_regs *regs);
87
88extern int kprobe_exceptions_notify(struct notifier_block *self,
89 unsigned long val, void *data);
90#endif /* _ASM_KPROBES_H */
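Editor's note: the unified kprobes.h comments that a "boosted" probe appends a relative jump (RELATIVEJUMP_INSTRUCTION, 0xe9) after the copied instruction so execution rejoins the original flow without a single-step trap. The sketch below shows only the jmp rel32 encoding that implies: the displacement is relative to the end of the 5-byte jump. Addresses are made up for the example.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void emit_jmp_rel32(uint8_t *buf, uintptr_t jmp_addr, uintptr_t target)
{
	int32_t disp = (int32_t)(target - (jmp_addr + 5));

	buf[0] = 0xe9;				/* RELATIVEJUMP_INSTRUCTION */
	memcpy(buf + 1, &disp, sizeof(disp));	/* little-endian rel32 */
}

int main(void)
{
	uint8_t buf[5];
	int32_t disp;

	/* a jump placed at 0x1000 that must land at 0x2000: disp = 0x2000 - 0x1005 */
	emit_jmp_rel32(buf, 0x1000, 0x2000);
	memcpy(&disp, buf + 1, sizeof(disp));
	printf("opcode=%#x disp=%#x\n", buf[0], (unsigned)disp);
	return 0;
}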
diff --git a/include/asm-x86/kvm.h b/include/asm-x86/kvm.h
new file mode 100644
index 000000000000..7a71120426a3
--- /dev/null
+++ b/include/asm-x86/kvm.h
@@ -0,0 +1,191 @@
1#ifndef __LINUX_KVM_X86_H
2#define __LINUX_KVM_X86_H
3
4/*
5 * KVM x86 specific structures and definitions
6 *
7 */
8
9#include <asm/types.h>
10#include <linux/ioctl.h>
11
12/* Architectural interrupt line count. */
13#define KVM_NR_INTERRUPTS 256
14
15struct kvm_memory_alias {
16 __u32 slot; /* this has a different namespace than memory slots */
17 __u32 flags;
18 __u64 guest_phys_addr;
19 __u64 memory_size;
20 __u64 target_phys_addr;
21};
22
23/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */
24struct kvm_pic_state {
25 __u8 last_irr; /* edge detection */
26 __u8 irr; /* interrupt request register */
27 __u8 imr; /* interrupt mask register */
28 __u8 isr; /* interrupt service register */
29 __u8 priority_add; /* highest irq priority */
30 __u8 irq_base;
31 __u8 read_reg_select;
32 __u8 poll;
33 __u8 special_mask;
34 __u8 init_state;
35 __u8 auto_eoi;
36 __u8 rotate_on_auto_eoi;
37 __u8 special_fully_nested_mode;
38 __u8 init4; /* true if 4 byte init */
39 __u8 elcr; /* PIIX edge/trigger selection */
40 __u8 elcr_mask;
41};
42
43#define KVM_IOAPIC_NUM_PINS 24
44struct kvm_ioapic_state {
45 __u64 base_address;
46 __u32 ioregsel;
47 __u32 id;
48 __u32 irr;
49 __u32 pad;
50 union {
51 __u64 bits;
52 struct {
53 __u8 vector;
54 __u8 delivery_mode:3;
55 __u8 dest_mode:1;
56 __u8 delivery_status:1;
57 __u8 polarity:1;
58 __u8 remote_irr:1;
59 __u8 trig_mode:1;
60 __u8 mask:1;
61 __u8 reserve:7;
62 __u8 reserved[4];
63 __u8 dest_id;
64 } fields;
65 } redirtbl[KVM_IOAPIC_NUM_PINS];
66};
67
68#define KVM_IRQCHIP_PIC_MASTER 0
69#define KVM_IRQCHIP_PIC_SLAVE 1
70#define KVM_IRQCHIP_IOAPIC 2
71
72/* for KVM_GET_REGS and KVM_SET_REGS */
73struct kvm_regs {
74 /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
75 __u64 rax, rbx, rcx, rdx;
76 __u64 rsi, rdi, rsp, rbp;
77 __u64 r8, r9, r10, r11;
78 __u64 r12, r13, r14, r15;
79 __u64 rip, rflags;
80};
81
82/* for KVM_GET_LAPIC and KVM_SET_LAPIC */
83#define KVM_APIC_REG_SIZE 0x400
84struct kvm_lapic_state {
85 char regs[KVM_APIC_REG_SIZE];
86};
87
88struct kvm_segment {
89 __u64 base;
90 __u32 limit;
91 __u16 selector;
92 __u8 type;
93 __u8 present, dpl, db, s, l, g, avl;
94 __u8 unusable;
95 __u8 padding;
96};
97
98struct kvm_dtable {
99 __u64 base;
100 __u16 limit;
101 __u16 padding[3];
102};
103
104
105/* for KVM_GET_SREGS and KVM_SET_SREGS */
106struct kvm_sregs {
107 /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */
108 struct kvm_segment cs, ds, es, fs, gs, ss;
109 struct kvm_segment tr, ldt;
110 struct kvm_dtable gdt, idt;
111 __u64 cr0, cr2, cr3, cr4, cr8;
112 __u64 efer;
113 __u64 apic_base;
114 __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
115};
116
117/* for KVM_GET_FPU and KVM_SET_FPU */
118struct kvm_fpu {
119 __u8 fpr[8][16];
120 __u16 fcw;
121 __u16 fsw;
122 __u8 ftwx; /* in fxsave format */
123 __u8 pad1;
124 __u16 last_opcode;
125 __u64 last_ip;
126 __u64 last_dp;
127 __u8 xmm[16][16];
128 __u32 mxcsr;
129 __u32 pad2;
130};
131
132struct kvm_msr_entry {
133 __u32 index;
134 __u32 reserved;
135 __u64 data;
136};
137
138/* for KVM_GET_MSRS and KVM_SET_MSRS */
139struct kvm_msrs {
140 __u32 nmsrs; /* number of msrs in entries */
141 __u32 pad;
142
143 struct kvm_msr_entry entries[0];
144};
145
146/* for KVM_GET_MSR_INDEX_LIST */
147struct kvm_msr_list {
148 __u32 nmsrs; /* number of msrs in entries */
149 __u32 indices[0];
150};
151
152
153struct kvm_cpuid_entry {
154 __u32 function;
155 __u32 eax;
156 __u32 ebx;
157 __u32 ecx;
158 __u32 edx;
159 __u32 padding;
160};
161
162/* for KVM_SET_CPUID */
163struct kvm_cpuid {
164 __u32 nent;
165 __u32 padding;
166 struct kvm_cpuid_entry entries[0];
167};
168
169struct kvm_cpuid_entry2 {
170 __u32 function;
171 __u32 index;
172 __u32 flags;
173 __u32 eax;
174 __u32 ebx;
175 __u32 ecx;
176 __u32 edx;
177 __u32 padding[3];
178};
179
180#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1
181#define KVM_CPUID_FLAG_STATEFUL_FUNC 2
182#define KVM_CPUID_FLAG_STATE_READ_NEXT 4
183
184/* for KVM_SET_CPUID2 */
185struct kvm_cpuid2 {
186 __u32 nent;
187 __u32 padding;
188 struct kvm_cpuid_entry2 entries[0];
189};
190
191#endif
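Editor's note: the structures in the new asm-x86/kvm.h are the user-space ABI that flows through /dev/kvm ioctls; struct kvm_regs above is exactly what KVM_GET_REGS/KVM_SET_REGS copy across the boundary. A minimal sketch of that path, assuming a host with /dev/kvm, permission to open it, and the ioctl names from <linux/kvm.h>; error handling is cut to the bare minimum.

#include <fcntl.h>
#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>

int main(void)
{
	struct kvm_regs regs;
	int kvm, vm, vcpu;

	kvm = open("/dev/kvm", O_RDWR);
	if (kvm < 0) {
		perror("open /dev/kvm");
		return 1;
	}
	printf("KVM API version: %d\n", ioctl(kvm, KVM_GET_API_VERSION, 0));

	vm = ioctl(kvm, KVM_CREATE_VM, 0);	/* returns a VM fd */
	vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);	/* vcpu id 0, returns a vcpu fd */

	if (ioctl(vcpu, KVM_GET_REGS, &regs) < 0) {
		perror("KVM_GET_REGS");
		return 1;
	}

	/* a freshly created vcpu sits at the reset vector */
	printf("rip=%#llx rflags=%#llx\n",
	       (unsigned long long)regs.rip, (unsigned long long)regs.rflags);
	return 0;
}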
diff --git a/drivers/kvm/kvm.h b/include/asm-x86/kvm_host.h
index 3b0bc4bda5f2..4702b04b979a 100644
--- a/drivers/kvm/kvm.h
+++ b/include/asm-x86/kvm_host.h
@@ -1,23 +1,24 @@
1#ifndef __KVM_H 1#/*
2#define __KVM_H 2 * Kernel-based Virtual Machine driver for Linux
3 3 *
4/* 4 * This header defines architecture specific interfaces, x86 version
5 *
5 * This work is licensed under the terms of the GNU GPL, version 2. See 6 * This work is licensed under the terms of the GNU GPL, version 2. See
6 * the COPYING file in the top-level directory. 7 * the COPYING file in the top-level directory.
8 *
7 */ 9 */
8 10
11#ifndef ASM_KVM_HOST_H
12#define ASM_KVM_HOST_H
13
9#include <linux/types.h> 14#include <linux/types.h>
10#include <linux/list.h>
11#include <linux/mutex.h>
12#include <linux/spinlock.h>
13#include <linux/signal.h>
14#include <linux/sched.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/preempt.h>
17#include <asm/signal.h>
18 16
19#include <linux/kvm.h> 17#include <linux/kvm.h>
20#include <linux/kvm_para.h> 18#include <linux/kvm_para.h>
19#include <linux/kvm_types.h>
20
21#include <asm/desc.h>
21 22
22#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) 23#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
23#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) 24#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
@@ -37,15 +38,8 @@
37#define INVALID_PAGE (~(hpa_t)0) 38#define INVALID_PAGE (~(hpa_t)0)
38#define UNMAPPED_GVA (~(gpa_t)0) 39#define UNMAPPED_GVA (~(gpa_t)0)
39 40
40#define KVM_MAX_VCPUS 4
41#define KVM_ALIAS_SLOTS 4
42#define KVM_MEMORY_SLOTS 8
43#define KVM_NUM_MMU_PAGES 1024
44#define KVM_MIN_FREE_MMU_PAGES 5
45#define KVM_REFILL_PAGES 25
46#define KVM_MAX_CPUID_ENTRIES 40
47
48#define DE_VECTOR 0 41#define DE_VECTOR 0
42#define UD_VECTOR 6
49#define NM_VECTOR 7 43#define NM_VECTOR 7
50#define DF_VECTOR 8 44#define DF_VECTOR 8
51#define TS_VECTOR 10 45#define TS_VECTOR 10
@@ -59,31 +53,66 @@
59 53
60#define IOPL_SHIFT 12 54#define IOPL_SHIFT 12
61 55
62#define KVM_PIO_PAGE_OFFSET 1 56#define KVM_ALIAS_SLOTS 4
63 57
64/* 58#define KVM_PERMILLE_MMU_PAGES 20
65 * vcpu->requests bit members 59#define KVM_MIN_ALLOC_MMU_PAGES 64
66 */ 60#define KVM_NUM_MMU_PAGES 1024
67#define KVM_TLB_FLUSH 0 61#define KVM_MIN_FREE_MMU_PAGES 5
62#define KVM_REFILL_PAGES 25
63#define KVM_MAX_CPUID_ENTRIES 40
68 64
69/* 65extern spinlock_t kvm_lock;
70 * Address types: 66extern struct list_head vm_list;
71 * 67
72 * gva - guest virtual address 68struct kvm_vcpu;
73 * gpa - guest physical address 69struct kvm;
74 * gfn - guest frame number 70
75 * hva - host virtual address 71enum {
76 * hpa - host physical address 72 VCPU_REGS_RAX = 0,
77 * hfn - host frame number 73 VCPU_REGS_RCX = 1,
78 */ 74 VCPU_REGS_RDX = 2,
75 VCPU_REGS_RBX = 3,
76 VCPU_REGS_RSP = 4,
77 VCPU_REGS_RBP = 5,
78 VCPU_REGS_RSI = 6,
79 VCPU_REGS_RDI = 7,
80#ifdef CONFIG_X86_64
81 VCPU_REGS_R8 = 8,
82 VCPU_REGS_R9 = 9,
83 VCPU_REGS_R10 = 10,
84 VCPU_REGS_R11 = 11,
85 VCPU_REGS_R12 = 12,
86 VCPU_REGS_R13 = 13,
87 VCPU_REGS_R14 = 14,
88 VCPU_REGS_R15 = 15,
89#endif
90 NR_VCPU_REGS
91};
92
93enum {
94 VCPU_SREG_CS,
95 VCPU_SREG_DS,
96 VCPU_SREG_ES,
97 VCPU_SREG_FS,
98 VCPU_SREG_GS,
99 VCPU_SREG_SS,
100 VCPU_SREG_TR,
101 VCPU_SREG_LDTR,
102};
79 103
80typedef unsigned long gva_t; 104#include <asm/kvm_x86_emulate.h>
81typedef u64 gpa_t;
82typedef unsigned long gfn_t;
83 105
84typedef unsigned long hva_t; 106#define KVM_NR_MEM_OBJS 40
85typedef u64 hpa_t; 107
86typedef unsigned long hfn_t; 108/*
109 * We don't want allocation failures within the mmu code, so we preallocate
110 * enough memory for a single page fault in a cache.
111 */
112struct kvm_mmu_memory_cache {
113 int nobjs;
114 void *objects[KVM_NR_MEM_OBJS];
115};
87 116
88#define NR_PTE_CHAIN_ENTRIES 5 117#define NR_PTE_CHAIN_ENTRIES 5
89 118
@@ -99,7 +128,7 @@ struct kvm_pte_chain {
99 * bits 4:7 - page table level for this shadow (1-4) 128 * bits 4:7 - page table level for this shadow (1-4)
100 * bits 8:9 - page table quadrant for 2-level guests 129 * bits 8:9 - page table quadrant for 2-level guests
101 * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode) 130 * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode)
102 * bits 17:19 - "access" - the user, writable, and nx bits of a huge page pde 131 * bits 17:19 - common access permissions for all ptes in this shadow page
103 */ 132 */
104union kvm_mmu_page_role { 133union kvm_mmu_page_role {
105 unsigned word; 134 unsigned word;
@@ -109,7 +138,7 @@ union kvm_mmu_page_role {
109 unsigned quadrant : 2; 138 unsigned quadrant : 2;
110 unsigned pad_for_nice_hex_output : 6; 139 unsigned pad_for_nice_hex_output : 6;
111 unsigned metaphysical : 1; 140 unsigned metaphysical : 1;
112 unsigned hugepage_access : 3; 141 unsigned access : 3;
113 }; 142 };
114}; 143};
115 144
@@ -125,6 +154,8 @@ struct kvm_mmu_page {
125 union kvm_mmu_page_role role; 154 union kvm_mmu_page_role role;
126 155
127 u64 *spt; 156 u64 *spt;
157 /* hold the gfn of each spte inside spt */
158 gfn_t *gfns;
128 unsigned long slot_bitmap; /* One bit set per slot which has memory 159 unsigned long slot_bitmap; /* One bit set per slot which has memory
129 * in this shadow page. 160 * in this shadow page.
130 */ 161 */
@@ -136,9 +167,6 @@ struct kvm_mmu_page {
136 }; 167 };
137}; 168};
138 169
139struct kvm_vcpu;
140extern struct kmem_cache *kvm_vcpu_cache;
141
142/* 170/*
143 * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level 171 * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
144 * 32-bit). The kvm_mmu structure abstracts the details of the current mmu 172 * 32-bit). The kvm_mmu structure abstracts the details of the current mmu
@@ -149,6 +177,8 @@ struct kvm_mmu {
149 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); 177 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
150 void (*free)(struct kvm_vcpu *vcpu); 178 void (*free)(struct kvm_vcpu *vcpu);
151 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); 179 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
180 void (*prefetch_page)(struct kvm_vcpu *vcpu,
181 struct kvm_mmu_page *page);
152 hpa_t root_hpa; 182 hpa_t root_hpa;
153 int root_level; 183 int root_level;
154 int shadow_root_level; 184 int shadow_root_level;
@@ -156,159 +186,9 @@ struct kvm_mmu {
156 u64 *pae_root; 186 u64 *pae_root;
157}; 187};
158 188
159#define KVM_NR_MEM_OBJS 20 189struct kvm_vcpu_arch {
160
161struct kvm_mmu_memory_cache {
162 int nobjs;
163 void *objects[KVM_NR_MEM_OBJS];
164};
165
166/*
167 * We don't want allocation failures within the mmu code, so we preallocate
168 * enough memory for a single page fault in a cache.
169 */
170struct kvm_guest_debug {
171 int enabled;
172 unsigned long bp[4];
173 int singlestep;
174};
175
176enum {
177 VCPU_REGS_RAX = 0,
178 VCPU_REGS_RCX = 1,
179 VCPU_REGS_RDX = 2,
180 VCPU_REGS_RBX = 3,
181 VCPU_REGS_RSP = 4,
182 VCPU_REGS_RBP = 5,
183 VCPU_REGS_RSI = 6,
184 VCPU_REGS_RDI = 7,
185#ifdef CONFIG_X86_64
186 VCPU_REGS_R8 = 8,
187 VCPU_REGS_R9 = 9,
188 VCPU_REGS_R10 = 10,
189 VCPU_REGS_R11 = 11,
190 VCPU_REGS_R12 = 12,
191 VCPU_REGS_R13 = 13,
192 VCPU_REGS_R14 = 14,
193 VCPU_REGS_R15 = 15,
194#endif
195 NR_VCPU_REGS
196};
197
198enum {
199 VCPU_SREG_CS,
200 VCPU_SREG_DS,
201 VCPU_SREG_ES,
202 VCPU_SREG_FS,
203 VCPU_SREG_GS,
204 VCPU_SREG_SS,
205 VCPU_SREG_TR,
206 VCPU_SREG_LDTR,
207};
208
209struct kvm_pio_request {
210 unsigned long count;
211 int cur_count;
212 struct page *guest_pages[2];
213 unsigned guest_page_offset;
214 int in;
215 int port;
216 int size;
217 int string;
218 int down;
219 int rep;
220};
221
222struct kvm_stat {
223 u32 pf_fixed;
224 u32 pf_guest;
225 u32 tlb_flush;
226 u32 invlpg;
227
228 u32 exits;
229 u32 io_exits;
230 u32 mmio_exits;
231 u32 signal_exits;
232 u32 irq_window_exits;
233 u32 halt_exits;
234 u32 halt_wakeup;
235 u32 request_irq_exits;
236 u32 irq_exits;
237 u32 light_exits;
238 u32 efer_reload;
239};
240
241struct kvm_io_device {
242 void (*read)(struct kvm_io_device *this,
243 gpa_t addr,
244 int len,
245 void *val);
246 void (*write)(struct kvm_io_device *this,
247 gpa_t addr,
248 int len,
249 const void *val);
250 int (*in_range)(struct kvm_io_device *this, gpa_t addr);
251 void (*destructor)(struct kvm_io_device *this);
252
253 void *private;
254};
255
256static inline void kvm_iodevice_read(struct kvm_io_device *dev,
257 gpa_t addr,
258 int len,
259 void *val)
260{
261 dev->read(dev, addr, len, val);
262}
263
264static inline void kvm_iodevice_write(struct kvm_io_device *dev,
265 gpa_t addr,
266 int len,
267 const void *val)
268{
269 dev->write(dev, addr, len, val);
270}
271
272static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr)
273{
274 return dev->in_range(dev, addr);
275}
276
277static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
278{
279 if (dev->destructor)
280 dev->destructor(dev);
281}
282
283/*
284 * It would be nice to use something smarter than a linear search, TBD...
285 * Thankfully we dont expect many devices to register (famous last words :),
286 * so until then it will suffice. At least its abstracted so we can change
287 * in one place.
288 */
289struct kvm_io_bus {
290 int dev_count;
291#define NR_IOBUS_DEVS 6
292 struct kvm_io_device *devs[NR_IOBUS_DEVS];
293};
294
295void kvm_io_bus_init(struct kvm_io_bus *bus);
296void kvm_io_bus_destroy(struct kvm_io_bus *bus);
297struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr);
298void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
299 struct kvm_io_device *dev);
300
301struct kvm_vcpu {
302 struct kvm *kvm;
303 struct preempt_notifier preempt_notifier;
304 int vcpu_id;
305 struct mutex mutex;
306 int cpu;
307 u64 host_tsc; 190 u64 host_tsc;
308 struct kvm_run *run;
309 int interrupt_window_open; 191 int interrupt_window_open;
310 int guest_mode;
311 unsigned long requests;
312 unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ 192 unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
313 DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS); 193 DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
314 unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */ 194 unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */
@@ -317,9 +197,6 @@ struct kvm_vcpu {
317 unsigned long cr0; 197 unsigned long cr0;
318 unsigned long cr2; 198 unsigned long cr2;
319 unsigned long cr3; 199 unsigned long cr3;
320 gpa_t para_state_gpa;
321 struct page *para_state_page;
322 gpa_t hypercall_gpa;
323 unsigned long cr4; 200 unsigned long cr4;
324 unsigned long cr8; 201 unsigned long cr8;
325 u64 pdptrs[4]; /* pae */ 202 u64 pdptrs[4]; /* pae */
@@ -334,6 +211,7 @@ struct kvm_vcpu {
334 int mp_state; 211 int mp_state;
335 int sipi_vector; 212 int sipi_vector;
336 u64 ia32_misc_enable_msr; 213 u64 ia32_misc_enable_msr;
214 bool tpr_access_reporting;
337 215
338 struct kvm_mmu mmu; 216 struct kvm_mmu mmu;
339 217
@@ -344,29 +222,26 @@ struct kvm_vcpu {
344 222
345 gfn_t last_pt_write_gfn; 223 gfn_t last_pt_write_gfn;
346 int last_pt_write_count; 224 int last_pt_write_count;
225 u64 *last_pte_updated;
347 226
348 struct kvm_guest_debug guest_debug; 227 struct {
228 gfn_t gfn; /* presumed gfn during guest pte update */
229 struct page *page; /* page corresponding to that gfn */
230 } update_pte;
349 231
350 struct i387_fxsave_struct host_fx_image; 232 struct i387_fxsave_struct host_fx_image;
351 struct i387_fxsave_struct guest_fx_image; 233 struct i387_fxsave_struct guest_fx_image;
352 int fpu_active; 234
353 int guest_fpu_loaded;
354
355 int mmio_needed;
356 int mmio_read_completed;
357 int mmio_is_write;
358 int mmio_size;
359 unsigned char mmio_data[8];
360 gpa_t mmio_phys_addr;
361 gva_t mmio_fault_cr2; 235 gva_t mmio_fault_cr2;
362 struct kvm_pio_request pio; 236 struct kvm_pio_request pio;
363 void *pio_data; 237 void *pio_data;
364 wait_queue_head_t wq;
365 238
366 int sigset_active; 239 struct kvm_queued_exception {
367 sigset_t sigset; 240 bool pending;
368 241 bool has_error_code;
369 struct kvm_stat stat; 242 u8 nr;
243 u32 error_code;
244 } exception;
370 245
371 struct { 246 struct {
372 int active; 247 int active;
@@ -381,7 +256,10 @@ struct kvm_vcpu {
381 int halt_request; /* real mode on Intel only */ 256 int halt_request; /* real mode on Intel only */
382 257
383 int cpuid_nent; 258 int cpuid_nent;
384 struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES]; 259 struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
260 /* emulate context */
261
262 struct x86_emulate_ctxt emulate_ctxt;
385}; 263};
386 264
387struct kvm_mem_alias { 265struct kvm_mem_alias {
@@ -390,51 +268,58 @@ struct kvm_mem_alias {
390 gfn_t target_gfn; 268 gfn_t target_gfn;
391}; 269};
392 270
393struct kvm_memory_slot { 271struct kvm_arch{
394 gfn_t base_gfn;
395 unsigned long npages;
396 unsigned long flags;
397 struct page **phys_mem;
398 unsigned long *dirty_bitmap;
399};
400
401struct kvm {
402 struct mutex lock; /* protects everything except vcpus */
403 int naliases; 272 int naliases;
404 struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; 273 struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
405 int nmemslots; 274
406 struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS]; 275 unsigned int n_free_mmu_pages;
276 unsigned int n_requested_mmu_pages;
277 unsigned int n_alloc_mmu_pages;
278 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
407 /* 279 /*
408 * Hash table of struct kvm_mmu_page. 280 * Hash table of struct kvm_mmu_page.
409 */ 281 */
410 struct list_head active_mmu_pages; 282 struct list_head active_mmu_pages;
411 int n_free_mmu_pages;
412 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
413 struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
414 unsigned long rmap_overflow;
415 struct list_head vm_list;
416 struct file *filp;
417 struct kvm_io_bus mmio_bus;
418 struct kvm_io_bus pio_bus;
419 struct kvm_pic *vpic; 283 struct kvm_pic *vpic;
420 struct kvm_ioapic *vioapic; 284 struct kvm_ioapic *vioapic;
285
421 int round_robin_prev_vcpu; 286 int round_robin_prev_vcpu;
287 unsigned int tss_addr;
288 struct page *apic_access_page;
422}; 289};
423 290
424static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) 291struct kvm_vm_stat {
425{ 292 u32 mmu_shadow_zapped;
426 return kvm->vpic; 293 u32 mmu_pte_write;
427} 294 u32 mmu_pte_updated;
295 u32 mmu_pde_zapped;
296 u32 mmu_flooded;
297 u32 mmu_recycled;
298 u32 mmu_cache_miss;
299 u32 remote_tlb_flush;
300};
428 301
429static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) 302struct kvm_vcpu_stat {
430{ 303 u32 pf_fixed;
431 return kvm->vioapic; 304 u32 pf_guest;
432} 305 u32 tlb_flush;
306 u32 invlpg;
433 307
434static inline int irqchip_in_kernel(struct kvm *kvm) 308 u32 exits;
435{ 309 u32 io_exits;
436 return pic_irqchip(kvm) != 0; 310 u32 mmio_exits;
437} 311 u32 signal_exits;
312 u32 irq_window_exits;
313 u32 halt_exits;
314 u32 halt_wakeup;
315 u32 request_irq_exits;
316 u32 irq_exits;
317 u32 host_state_reload;
318 u32 efer_reload;
319 u32 fpu_reload;
320 u32 insn_emulation;
321 u32 insn_emulation_fail;
322};
438 323
439struct descriptor_table { 324struct descriptor_table {
440 u16 limit; 325 u16 limit;
@@ -449,11 +334,12 @@ struct kvm_x86_ops {
449 void (*check_processor_compatibility)(void *rtn); 334 void (*check_processor_compatibility)(void *rtn);
450 int (*hardware_setup)(void); /* __init */ 335 int (*hardware_setup)(void); /* __init */
451 void (*hardware_unsetup)(void); /* __exit */ 336 void (*hardware_unsetup)(void); /* __exit */
337 bool (*cpu_has_accelerated_tpr)(void);
452 338
453 /* Create, but do not attach this VCPU */ 339 /* Create, but do not attach this VCPU */
454 struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); 340 struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
455 void (*vcpu_free)(struct kvm_vcpu *vcpu); 341 void (*vcpu_free)(struct kvm_vcpu *vcpu);
456 void (*vcpu_reset)(struct kvm_vcpu *vcpu); 342 int (*vcpu_reset)(struct kvm_vcpu *vcpu);
457 343
458 void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); 344 void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
459 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); 345 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
@@ -489,10 +375,6 @@ struct kvm_x86_ops {
489 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); 375 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
490 376
491 void (*tlb_flush)(struct kvm_vcpu *vcpu); 377 void (*tlb_flush)(struct kvm_vcpu *vcpu);
492 void (*inject_page_fault)(struct kvm_vcpu *vcpu,
493 unsigned long addr, u32 err_code);
494
495 void (*inject_gp)(struct kvm_vcpu *vcpu, unsigned err_code);
496 378
497 void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); 379 void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
498 int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu); 380 int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
@@ -501,54 +383,31 @@ struct kvm_x86_ops {
501 unsigned char *hypercall_addr); 383 unsigned char *hypercall_addr);
502 int (*get_irq)(struct kvm_vcpu *vcpu); 384 int (*get_irq)(struct kvm_vcpu *vcpu);
503 void (*set_irq)(struct kvm_vcpu *vcpu, int vec); 385 void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
386 void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
387 bool has_error_code, u32 error_code);
388 bool (*exception_injected)(struct kvm_vcpu *vcpu);
504 void (*inject_pending_irq)(struct kvm_vcpu *vcpu); 389 void (*inject_pending_irq)(struct kvm_vcpu *vcpu);
505 void (*inject_pending_vectors)(struct kvm_vcpu *vcpu, 390 void (*inject_pending_vectors)(struct kvm_vcpu *vcpu,
506 struct kvm_run *run); 391 struct kvm_run *run);
392
393 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
507}; 394};
508 395
509extern struct kvm_x86_ops *kvm_x86_ops; 396extern struct kvm_x86_ops *kvm_x86_ops;
510 397
511/* The guest did something we don't support. */
512#define pr_unimpl(vcpu, fmt, ...) \
513 do { \
514 if (printk_ratelimit()) \
515 printk(KERN_ERR "kvm: %i: cpu%i " fmt, \
516 current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \
517 } while(0)
518
519#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
520#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
521
522int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
523void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
524
525int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size,
526 struct module *module);
527void kvm_exit_x86(void);
528
529int kvm_mmu_module_init(void); 398int kvm_mmu_module_init(void);
530void kvm_mmu_module_exit(void); 399void kvm_mmu_module_exit(void);
531 400
532void kvm_mmu_destroy(struct kvm_vcpu *vcpu); 401void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
533int kvm_mmu_create(struct kvm_vcpu *vcpu); 402int kvm_mmu_create(struct kvm_vcpu *vcpu);
534int kvm_mmu_setup(struct kvm_vcpu *vcpu); 403int kvm_mmu_setup(struct kvm_vcpu *vcpu);
404void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
535 405
536int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); 406int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
537void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); 407void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
538void kvm_mmu_zap_all(struct kvm *kvm); 408void kvm_mmu_zap_all(struct kvm *kvm);
539 409unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
540hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa); 410void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
541#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
542#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
543static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
544hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva);
545struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva);
546
547extern hpa_t bad_page_address;
548
549struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
550struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
551void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
552 411
553enum emulation_result { 412enum emulation_result {
554 EMULATE_DONE, /* no further processing */ 413 EMULATE_DONE, /* no further processing */
@@ -556,8 +415,10 @@ enum emulation_result {
556 EMULATE_FAIL, /* can't emulate this instruction */ 415 EMULATE_FAIL, /* can't emulate this instruction */
557}; 416};
558 417
418#define EMULTYPE_NO_DECODE (1 << 0)
419#define EMULTYPE_TRAP_UD (1 << 1)
559int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, 420int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
560 unsigned long cr2, u16 error_code); 421 unsigned long cr2, u16 error_code, int emulation_type);
561void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context); 422void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context);
562void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 423void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
563void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 424void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
@@ -572,7 +433,7 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
572 433
573struct x86_emulate_ctxt; 434struct x86_emulate_ctxt;
574 435
575int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 436int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
576 int size, unsigned port); 437 int size, unsigned port);
577int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 438int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
578 int size, unsigned long count, int down, 439 int size, unsigned long count, int down,
@@ -581,7 +442,7 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
581int kvm_emulate_halt(struct kvm_vcpu *vcpu); 442int kvm_emulate_halt(struct kvm_vcpu *vcpu);
582int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); 443int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
583int emulate_clts(struct kvm_vcpu *vcpu); 444int emulate_clts(struct kvm_vcpu *vcpu);
584int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, 445int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
585 unsigned long *dest); 446 unsigned long *dest);
586int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, 447int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
587 unsigned long value); 448 unsigned long value);
@@ -597,15 +458,15 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
597int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); 458int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
598int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); 459int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
599 460
600void fx_init(struct kvm_vcpu *vcpu); 461void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
462void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
463void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
464 u32 error_code);
601 465
602void kvm_resched(struct kvm_vcpu *vcpu); 466void fx_init(struct kvm_vcpu *vcpu);
603void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
604void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
605void kvm_flush_remote_tlbs(struct kvm *kvm);
606 467
607int emulator_read_std(unsigned long addr, 468int emulator_read_std(unsigned long addr,
608 void *val, 469 void *val,
609 unsigned int bytes, 470 unsigned int bytes,
610 struct kvm_vcpu *vcpu); 471 struct kvm_vcpu *vcpu);
611int emulator_write_emulated(unsigned long addr, 472int emulator_write_emulated(unsigned long addr,
@@ -615,6 +476,7 @@ int emulator_write_emulated(unsigned long addr,
615 476
616unsigned long segment_base(u16 selector); 477unsigned long segment_base(u16 selector);
617 478
479void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
618void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 480void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
619 const u8 *new, int bytes); 481 const u8 *new, int bytes);
620int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); 482int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
@@ -622,66 +484,14 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
622int kvm_mmu_load(struct kvm_vcpu *vcpu); 484int kvm_mmu_load(struct kvm_vcpu *vcpu);
623void kvm_mmu_unload(struct kvm_vcpu *vcpu); 485void kvm_mmu_unload(struct kvm_vcpu *vcpu);
624 486
625int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run); 487int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
626 488
627static inline void kvm_guest_enter(void) 489int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
628{
629 current->flags |= PF_VCPU;
630}
631 490
632static inline void kvm_guest_exit(void) 491int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code);
633{
634 current->flags &= ~PF_VCPU;
635}
636 492
637static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, 493int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
638 u32 error_code) 494int complete_pio(struct kvm_vcpu *vcpu);
639{
640 return vcpu->mmu.page_fault(vcpu, gva, error_code);
641}
642
643static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
644{
645 if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
646 __kvm_mmu_free_some_pages(vcpu);
647}
648
649static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
650{
651 if (likely(vcpu->mmu.root_hpa != INVALID_PAGE))
652 return 0;
653
654 return kvm_mmu_load(vcpu);
655}
656
657static inline int is_long_mode(struct kvm_vcpu *vcpu)
658{
659#ifdef CONFIG_X86_64
660 return vcpu->shadow_efer & EFER_LME;
661#else
662 return 0;
663#endif
664}
665
666static inline int is_pae(struct kvm_vcpu *vcpu)
667{
668 return vcpu->cr4 & X86_CR4_PAE;
669}
670
671static inline int is_pse(struct kvm_vcpu *vcpu)
672{
673 return vcpu->cr4 & X86_CR4_PSE;
674}
675
676static inline int is_paging(struct kvm_vcpu *vcpu)
677{
678 return vcpu->cr0 & X86_CR0_PG;
679}
680
681static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot)
682{
683 return slot - kvm->memslots;
684}
685 495
686static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) 496static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
687{ 497{
@@ -693,55 +503,55 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
693static inline u16 read_fs(void) 503static inline u16 read_fs(void)
694{ 504{
695 u16 seg; 505 u16 seg;
696 asm ("mov %%fs, %0" : "=g"(seg)); 506 asm("mov %%fs, %0" : "=g"(seg));
697 return seg; 507 return seg;
698} 508}
699 509
700static inline u16 read_gs(void) 510static inline u16 read_gs(void)
701{ 511{
702 u16 seg; 512 u16 seg;
703 asm ("mov %%gs, %0" : "=g"(seg)); 513 asm("mov %%gs, %0" : "=g"(seg));
704 return seg; 514 return seg;
705} 515}
706 516
707static inline u16 read_ldt(void) 517static inline u16 read_ldt(void)
708{ 518{
709 u16 ldt; 519 u16 ldt;
710 asm ("sldt %0" : "=g"(ldt)); 520 asm("sldt %0" : "=g"(ldt));
711 return ldt; 521 return ldt;
712} 522}
713 523
714static inline void load_fs(u16 sel) 524static inline void load_fs(u16 sel)
715{ 525{
716 asm ("mov %0, %%fs" : : "rm"(sel)); 526 asm("mov %0, %%fs" : : "rm"(sel));
717} 527}
718 528
719static inline void load_gs(u16 sel) 529static inline void load_gs(u16 sel)
720{ 530{
721 asm ("mov %0, %%gs" : : "rm"(sel)); 531 asm("mov %0, %%gs" : : "rm"(sel));
722} 532}
723 533
724#ifndef load_ldt 534#ifndef load_ldt
725static inline void load_ldt(u16 sel) 535static inline void load_ldt(u16 sel)
726{ 536{
727 asm ("lldt %0" : : "rm"(sel)); 537 asm("lldt %0" : : "rm"(sel));
728} 538}
729#endif 539#endif
730 540
731static inline void get_idt(struct descriptor_table *table) 541static inline void get_idt(struct descriptor_table *table)
732{ 542{
733 asm ("sidt %0" : "=m"(*table)); 543 asm("sidt %0" : "=m"(*table));
734} 544}
735 545
736static inline void get_gdt(struct descriptor_table *table) 546static inline void get_gdt(struct descriptor_table *table)
737{ 547{
738 asm ("sgdt %0" : "=m"(*table)); 548 asm("sgdt %0" : "=m"(*table));
739} 549}
740 550
741static inline unsigned long read_tr_base(void) 551static inline unsigned long read_tr_base(void)
742{ 552{
743 u16 tr; 553 u16 tr;
744 asm ("str %0" : "=g"(tr)); 554 asm("str %0" : "=g"(tr));
745 return segment_base(tr); 555 return segment_base(tr);
746} 556}
747 557
@@ -757,17 +567,17 @@ static inline unsigned long read_msr(unsigned long msr)
757 567
758static inline void fx_save(struct i387_fxsave_struct *image) 568static inline void fx_save(struct i387_fxsave_struct *image)
759{ 569{
760 asm ("fxsave (%0)":: "r" (image)); 570 asm("fxsave (%0)":: "r" (image));
761} 571}
762 572
763static inline void fx_restore(struct i387_fxsave_struct *image) 573static inline void fx_restore(struct i387_fxsave_struct *image)
764{ 574{
765 asm ("fxrstor (%0)":: "r" (image)); 575 asm("fxrstor (%0)":: "r" (image));
766} 576}
767 577
768static inline void fpu_init(void) 578static inline void fpu_init(void)
769{ 579{
770 asm ("finit"); 580 asm("finit");
771} 581}
772 582
773static inline u32 get_rdx_init_val(void) 583static inline u32 get_rdx_init_val(void)
@@ -775,6 +585,11 @@ static inline u32 get_rdx_init_val(void)
775 return 0x600; /* P6 family */ 585 return 0x600; /* P6 family */
776} 586}
777 587
588static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
589{
590 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
591}
592
778#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30" 593#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30"
779#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2" 594#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2"
780#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3" 595#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3"
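The hunks above drop the per-vendor inject_gp/inject_page_fault callbacks in favor of a common exception queue (kvm_queue_exception, kvm_queue_exception_e, kvm_inject_page_fault) plus the kvm_inject_gp() convenience wrapper. A minimal sketch of how common code might use it; the function name and the "allowed" condition are invented for illustration and are not part of the patch:

	static int example_handle_privileged_op(struct kvm_vcpu *vcpu, int allowed)
	{
		if (!allowed) {
			/* Queued now, injected by the vendor module on the next
			 * guest entry; equivalent to
			 * kvm_queue_exception_e(vcpu, GP_VECTOR, 0). */
			kvm_inject_gp(vcpu, 0);
			return 1;
		}
		return 0;
	}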
diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h
new file mode 100644
index 000000000000..c6f3fd8d8c53
--- /dev/null
+++ b/include/asm-x86/kvm_para.h
@@ -0,0 +1,105 @@
1#ifndef __X86_KVM_PARA_H
2#define __X86_KVM_PARA_H
3
4/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
5 * should be used to determine that a VM is running under KVM.
6 */
7#define KVM_CPUID_SIGNATURE 0x40000000
8
9/* This CPUID returns a feature bitmap in eax. Before enabling a particular
10 * paravirtualization, the appropriate feature bit should be checked.
11 */
12#define KVM_CPUID_FEATURES 0x40000001
13
14#ifdef __KERNEL__
15#include <asm/processor.h>
16
17/* This instruction is vmcall. On non-VT architectures, it will generate a
18 * trap that we will then rewrite to the appropriate instruction.
19 */
20#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
21
22/* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall
23 * instruction. The hypervisor may replace it with something else but only these
24 * instructions are guaranteed to be supported.
25 *
26 * Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively.
27 * The hypercall number should be placed in rax and the return value will be
28 * placed in rax. No other registers will be clobbered unless explicitly
29 * noted by the particular hypercall.
30 */
31
32static inline long kvm_hypercall0(unsigned int nr)
33{
34 long ret;
35 asm volatile(KVM_HYPERCALL
36 : "=a"(ret)
37 : "a"(nr));
38 return ret;
39}
40
41static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
42{
43 long ret;
44 asm volatile(KVM_HYPERCALL
45 : "=a"(ret)
46 : "a"(nr), "b"(p1));
47 return ret;
48}
49
50static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
51 unsigned long p2)
52{
53 long ret;
54 asm volatile(KVM_HYPERCALL
55 : "=a"(ret)
56 : "a"(nr), "b"(p1), "c"(p2));
57 return ret;
58}
59
60static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
61 unsigned long p2, unsigned long p3)
62{
63 long ret;
64 asm volatile(KVM_HYPERCALL
65 : "=a"(ret)
66 : "a"(nr), "b"(p1), "c"(p2), "d"(p3));
67 return ret;
68}
69
70static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
71 unsigned long p2, unsigned long p3,
72 unsigned long p4)
73{
74 long ret;
75 asm volatile(KVM_HYPERCALL
76 : "=a"(ret)
77 : "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4));
78 return ret;
79}
80
81static inline int kvm_para_available(void)
82{
83 unsigned int eax, ebx, ecx, edx;
84 char signature[13];
85
86 cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx);
87 memcpy(signature + 0, &ebx, 4);
88 memcpy(signature + 4, &ecx, 4);
89 memcpy(signature + 8, &edx, 4);
90 signature[12] = 0;
91
92 if (strcmp(signature, "KVMKVMKVM") == 0)
93 return 1;
94
95 return 0;
96}
97
98static inline unsigned int kvm_arch_para_features(void)
99{
100 return cpuid_eax(KVM_CPUID_FEATURES);
101}
102
103#endif
104
105#endif
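The new kvm_para.h is meant to be used from guest code: check for the KVM CPUID signature, optionally look at the feature bitmap, then issue hypercalls. A hedged usage sketch follows; the hypercall number 42 and its two arguments are made up, only the helpers themselves come from the header above:

	#include <linux/kernel.h>
	#include <linux/errno.h>
	#include <asm/kvm_para.h>

	static long example_hypercall(void)
	{
		if (!kvm_para_available())	/* not running under KVM at all */
			return -ENODEV;

		printk(KERN_INFO "kvm features: %08x\n", kvm_arch_para_features());

		/* Hypercall nr 42 and its arguments are hypothetical; the number
		 * goes in rax, p1/p2 in rbx/rcx, the result comes back in rax. */
		return kvm_hypercall2(42, 0x1000, 0x2000);
	}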
diff --git a/drivers/kvm/x86_emulate.h b/include/asm-x86/kvm_x86_emulate.h
index 92c73aa7f9ac..7db91b9bdcd4 100644
--- a/drivers/kvm/x86_emulate.h
+++ b/include/asm-x86/kvm_x86_emulate.h
@@ -63,17 +63,6 @@ struct x86_emulate_ops {
63 unsigned int bytes, struct kvm_vcpu *vcpu); 63 unsigned int bytes, struct kvm_vcpu *vcpu);
64 64
65 /* 65 /*
66 * write_std: Write bytes of standard (non-emulated/special) memory.
67 * Used for stack operations, and others.
68 * @addr: [IN ] Linear address to which to write.
69 * @val: [IN ] Value to write to memory (low-order bytes used as
70 * required).
71 * @bytes: [IN ] Number of bytes to write to memory.
72 */
73 int (*write_std)(unsigned long addr, const void *val,
74 unsigned int bytes, struct kvm_vcpu *vcpu);
75
76 /*
77 * read_emulated: Read bytes from emulated/special memory area. 66 * read_emulated: Read bytes from emulated/special memory area.
78 * @addr: [IN ] Linear address from which to read. 67 * @addr: [IN ] Linear address from which to read.
79 * @val: [OUT] Value read from memory, zero-extended to 'u_long'. 68 * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
@@ -112,13 +101,50 @@ struct x86_emulate_ops {
112 101
113}; 102};
114 103
104/* Type, address-of, and value of an instruction's operand. */
105struct operand {
106 enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
107 unsigned int bytes;
108 unsigned long val, orig_val, *ptr;
109};
110
111struct fetch_cache {
112 u8 data[15];
113 unsigned long start;
114 unsigned long end;
115};
116
117struct decode_cache {
118 u8 twobyte;
119 u8 b;
120 u8 lock_prefix;
121 u8 rep_prefix;
122 u8 op_bytes;
123 u8 ad_bytes;
124 u8 rex_prefix;
125 struct operand src;
126 struct operand dst;
127 unsigned long *override_base;
128 unsigned int d;
129 unsigned long regs[NR_VCPU_REGS];
130 unsigned long eip;
131 /* modrm */
132 u8 modrm;
133 u8 modrm_mod;
134 u8 modrm_reg;
135 u8 modrm_rm;
136 u8 use_modrm_ea;
137 unsigned long modrm_ea;
138 unsigned long modrm_val;
139 struct fetch_cache fetch;
140};
141
115struct x86_emulate_ctxt { 142struct x86_emulate_ctxt {
116 /* Register state before/after emulation. */ 143 /* Register state before/after emulation. */
117 struct kvm_vcpu *vcpu; 144 struct kvm_vcpu *vcpu;
118 145
119 /* Linear faulting address (if emulating a page-faulting instruction). */ 146 /* Linear faulting address (if emulating a page-faulting instruction). */
120 unsigned long eflags; 147 unsigned long eflags;
121 unsigned long cr2;
122 148
123 /* Emulated execution mode, represented by an X86EMUL_MODE value. */ 149 /* Emulated execution mode, represented by an X86EMUL_MODE value. */
124 int mode; 150 int mode;
@@ -129,8 +155,16 @@ struct x86_emulate_ctxt {
129 unsigned long ss_base; 155 unsigned long ss_base;
130 unsigned long gs_base; 156 unsigned long gs_base;
131 unsigned long fs_base; 157 unsigned long fs_base;
158
159 /* decode cache */
160
161 struct decode_cache decode;
132}; 162};
133 163
164/* Repeat String Operation Prefix */
165#define REPE_PREFIX 1
166#define REPNE_PREFIX 2
167
134/* Execution mode, passed to the emulator. */ 168/* Execution mode, passed to the emulator. */
135#define X86EMUL_MODE_REAL 0 /* Real mode. */ 169#define X86EMUL_MODE_REAL 0 /* Real mode. */
136#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ 170#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */
@@ -144,12 +178,9 @@ struct x86_emulate_ctxt {
144#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 178#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
145#endif 179#endif
146 180
147/* 181int x86_decode_insn(struct x86_emulate_ctxt *ctxt,
148 * x86_emulate_memop: Emulate an instruction that faulted attempting to 182 struct x86_emulate_ops *ops);
149 * read/write a 'special' memory area. 183int x86_emulate_insn(struct x86_emulate_ctxt *ctxt,
150 * Returns -1 on failure, 0 on success. 184 struct x86_emulate_ops *ops);
151 */
152int x86_emulate_memop(struct x86_emulate_ctxt *ctxt,
153 struct x86_emulate_ops *ops);
154 185
155#endif /* __X86_EMULATE_H__ */ 186#endif /* __X86_EMULATE_H__ */
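The single x86_emulate_memop() entry point is split into a decode pass and an execute pass, with the decode result cached in ctxt->decode (which is what EMULTYPE_NO_DECODE in the kvm.h hunk skips). A sketch of the calling convention the new header implies; the wrapper function is invented, and the 0-on-success return convention is assumed from the old x86_emulate_memop() comment:

	static int example_emulate_one(struct x86_emulate_ctxt *ctxt,
				       struct x86_emulate_ops *ops)
	{
		int rc;

		/* ctxt->vcpu, ctxt->mode and the segment bases are assumed to
		 * have been filled in by the caller, as emulate_instruction()
		 * does on the kvm side. */
		rc = x86_decode_insn(ctxt, ops);	/* populate ctxt->decode */
		if (rc)
			return rc;			/* decode failed */

		return x86_emulate_insn(ctxt, ops);	/* run the decoded insn */
	}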
diff --git a/include/asm-x86/lguest.h b/include/asm-x86/lguest.h
index b9d003b8005e..4d9367b72976 100644
--- a/include/asm-x86/lguest.h
+++ b/include/asm-x86/lguest.h
@@ -44,14 +44,14 @@ struct lguest_ro_state
44{ 44{
45 /* Host information we need to restore when we switch back. */ 45 /* Host information we need to restore when we switch back. */
46 u32 host_cr3; 46 u32 host_cr3;
47 struct Xgt_desc_struct host_idt_desc; 47 struct desc_ptr host_idt_desc;
48 struct Xgt_desc_struct host_gdt_desc; 48 struct desc_ptr host_gdt_desc;
49 u32 host_sp; 49 u32 host_sp;
50 50
51 /* Fields which are used when guest is running. */ 51 /* Fields which are used when guest is running. */
52 struct Xgt_desc_struct guest_idt_desc; 52 struct desc_ptr guest_idt_desc;
53 struct Xgt_desc_struct guest_gdt_desc; 53 struct desc_ptr guest_gdt_desc;
54 struct i386_hw_tss guest_tss; 54 struct x86_hw_tss guest_tss;
55 struct desc_struct guest_idt[IDT_ENTRIES]; 55 struct desc_struct guest_idt[IDT_ENTRIES];
56 struct desc_struct guest_gdt[GDT_ENTRIES]; 56 struct desc_struct guest_gdt[GDT_ENTRIES];
57}; 57};
@@ -78,8 +78,8 @@ static inline void lguest_set_ts(void)
78} 78}
79 79
80/* Full 4G segment descriptors, suitable for CS and DS. */ 80/* Full 4G segment descriptors, suitable for CS and DS. */
81#define FULL_EXEC_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9b00}) 81#define FULL_EXEC_SEGMENT ((struct desc_struct){ { {0x0000ffff, 0x00cf9b00} } })
82#define FULL_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9300}) 82#define FULL_SEGMENT ((struct desc_struct){ { {0x0000ffff, 0x00cf9300} } })
83 83
84#endif /* __ASSEMBLY__ */ 84#endif /* __ASSEMBLY__ */
85 85
diff --git a/include/asm-x86/linkage.h b/include/asm-x86/linkage.h
index 94b257fa8701..31739c7d66a9 100644
--- a/include/asm-x86/linkage.h
+++ b/include/asm-x86/linkage.h
@@ -1,5 +1,25 @@
1#ifndef __ASM_LINKAGE_H
2#define __ASM_LINKAGE_H
3
4#ifdef CONFIG_X86_64
5#define __ALIGN .p2align 4,,15
6#define __ALIGN_STR ".p2align 4,,15"
7#endif
8
1#ifdef CONFIG_X86_32 9#ifdef CONFIG_X86_32
2# include "linkage_32.h" 10#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
3#else 11#define prevent_tail_call(ret) __asm__ ("" : "=r" (ret) : "0" (ret))
4# include "linkage_64.h" 12/*
13 * For 32-bit UML - mark functions implemented in assembly that use
14 * regparm input parameters:
15 */
16#define asmregparm __attribute__((regparm(3)))
17#endif
18
19#ifdef CONFIG_X86_ALIGNMENT_16
20#define __ALIGN .align 16,0x90
21#define __ALIGN_STR ".align 16,0x90"
22#endif
23
5#endif 24#endif
25
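The unified linkage.h keeps the 32-bit calling-convention annotations in one place: asmlinkage forces stack-passed arguments (regparm(0)), asmregparm marks assembly-implemented helpers that expect register arguments (regparm(3)). A small sketch of how they are typically applied; both function names are made up:

	/* A syscall-style C entry point that assembly stubs call with all
	 * arguments on the stack on 32-bit: */
	asmlinkage long sys_example(int fd, unsigned long arg);

	/* A helper implemented in assembly that takes its first arguments in
	 * registers on 32-bit: */
	asmregparm void example_asm_helper(unsigned long a, unsigned long b);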
diff --git a/include/asm-x86/linkage_32.h b/include/asm-x86/linkage_32.h
deleted file mode 100644
index f4a6ebac0247..000000000000
--- a/include/asm-x86/linkage_32.h
+++ /dev/null
@@ -1,15 +0,0 @@
1#ifndef __ASM_LINKAGE_H
2#define __ASM_LINKAGE_H
3
4#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
5#define FASTCALL(x) x __attribute__((regparm(3)))
6#define fastcall __attribute__((regparm(3)))
7
8#define prevent_tail_call(ret) __asm__ ("" : "=r" (ret) : "0" (ret))
9
10#ifdef CONFIG_X86_ALIGNMENT_16
11#define __ALIGN .align 16,0x90
12#define __ALIGN_STR ".align 16,0x90"
13#endif
14
15#endif
diff --git a/include/asm-x86/linkage_64.h b/include/asm-x86/linkage_64.h
deleted file mode 100644
index b5f39d0189ce..000000000000
--- a/include/asm-x86/linkage_64.h
+++ /dev/null
@@ -1,6 +0,0 @@
1#ifndef __ASM_LINKAGE_H
2#define __ASM_LINKAGE_H
3
4#define __ALIGN .p2align 4,,15
5
6#endif
diff --git a/include/asm-x86/local.h b/include/asm-x86/local.h
index c7a1b1c66c96..f852c62b3319 100644
--- a/include/asm-x86/local.h
+++ b/include/asm-x86/local.h
@@ -1,5 +1,240 @@
1#ifdef CONFIG_X86_32 1#ifndef _ARCH_LOCAL_H
2# include "local_32.h" 2#define _ARCH_LOCAL_H
3#else 3
4# include "local_64.h" 4#include <linux/percpu.h>
5
6#include <asm/system.h>
7#include <asm/atomic.h>
8#include <asm/asm.h>
9
10typedef struct {
11 atomic_long_t a;
12} local_t;
13
14#define LOCAL_INIT(i) { ATOMIC_LONG_INIT(i) }
15
16#define local_read(l) atomic_long_read(&(l)->a)
17#define local_set(l, i) atomic_long_set(&(l)->a, (i))
18
19static inline void local_inc(local_t *l)
20{
21 __asm__ __volatile__(
22 _ASM_INC "%0"
23 :"+m" (l->a.counter));
24}
25
26static inline void local_dec(local_t *l)
27{
28 __asm__ __volatile__(
29 _ASM_DEC "%0"
30 :"+m" (l->a.counter));
31}
32
33static inline void local_add(long i, local_t *l)
34{
35 __asm__ __volatile__(
36 _ASM_ADD "%1,%0"
37 :"+m" (l->a.counter)
38 :"ir" (i));
39}
40
41static inline void local_sub(long i, local_t *l)
42{
43 __asm__ __volatile__(
44 _ASM_SUB "%1,%0"
45 :"+m" (l->a.counter)
46 :"ir" (i));
47}
48
49/**
50 * local_sub_and_test - subtract value from variable and test result
51 * @i: integer value to subtract
52 * @l: pointer to type local_t
53 *
54 * Atomically subtracts @i from @l and returns
55 * true if the result is zero, or false for all
56 * other cases.
57 */
58static inline int local_sub_and_test(long i, local_t *l)
59{
60 unsigned char c;
61
62 __asm__ __volatile__(
63 _ASM_SUB "%2,%0; sete %1"
64 :"+m" (l->a.counter), "=qm" (c)
65 :"ir" (i) : "memory");
66 return c;
67}
68
69/**
70 * local_dec_and_test - decrement and test
71 * @l: pointer to type local_t
72 *
73 * Atomically decrements @l by 1 and
74 * returns true if the result is 0, or false for all other
75 * cases.
76 */
77static inline int local_dec_and_test(local_t *l)
78{
79 unsigned char c;
80
81 __asm__ __volatile__(
82 _ASM_DEC "%0; sete %1"
83 :"+m" (l->a.counter), "=qm" (c)
84 : : "memory");
85 return c != 0;
86}
87
88/**
89 * local_inc_and_test - increment and test
90 * @l: pointer to type local_t
91 *
92 * Atomically increments @l by 1
93 * and returns true if the result is zero, or false for all
94 * other cases.
95 */
96static inline int local_inc_and_test(local_t *l)
97{
98 unsigned char c;
99
100 __asm__ __volatile__(
101 _ASM_INC "%0; sete %1"
102 :"+m" (l->a.counter), "=qm" (c)
103 : : "memory");
104 return c != 0;
105}
106
107/**
108 * local_add_negative - add and test if negative
109 * @i: integer value to add
110 * @l: pointer to type local_t
111 *
112 * Atomically adds @i to @l and returns true
113 * if the result is negative, or false when
114 * result is greater than or equal to zero.
115 */
116static inline int local_add_negative(long i, local_t *l)
117{
118 unsigned char c;
119
120 __asm__ __volatile__(
121 _ASM_ADD "%2,%0; sets %1"
122 :"+m" (l->a.counter), "=qm" (c)
123 :"ir" (i) : "memory");
124 return c;
125}
126
127/**
128 * local_add_return - add and return
129 * @i: integer value to add
130 * @l: pointer to type local_t
131 *
132 * Atomically adds @i to @l and returns @i + @l
133 */
134static inline long local_add_return(long i, local_t *l)
135{
136 long __i;
137#ifdef CONFIG_M386
138 unsigned long flags;
139 if (unlikely(boot_cpu_data.x86 <= 3))
140 goto no_xadd;
5#endif 141#endif
142 /* Modern 486+ processor */
143 __i = i;
144 __asm__ __volatile__(
145 _ASM_XADD "%0, %1;"
146 :"+r" (i), "+m" (l->a.counter)
147 : : "memory");
148 return i + __i;
149
150#ifdef CONFIG_M386
151no_xadd: /* Legacy 386 processor */
152 local_irq_save(flags);
153 __i = local_read(l);
154 local_set(l, i + __i);
155 local_irq_restore(flags);
156 return i + __i;
157#endif
158}
159
160static inline long local_sub_return(long i, local_t *l)
161{
162 return local_add_return(-i, l);
163}
164
165#define local_inc_return(l) (local_add_return(1, l))
166#define local_dec_return(l) (local_sub_return(1, l))
167
168#define local_cmpxchg(l, o, n) \
169 (cmpxchg_local(&((l)->a.counter), (o), (n)))
170/* Always has a lock prefix */
171#define local_xchg(l, n) (xchg(&((l)->a.counter), (n)))
172
173/**
174 * local_add_unless - add unless the number is a given value
175 * @l: pointer of type local_t
176 * @a: the amount to add to l...
177 * @u: ...unless l is equal to u.
178 *
179 * Atomically adds @a to @l, so long as it was not @u.
180 * Returns non-zero if @l was not @u, and zero otherwise.
181 */
182#define local_add_unless(l, a, u) \
183({ \
184 long c, old; \
185 c = local_read(l); \
186 for (;;) { \
187 if (unlikely(c == (u))) \
188 break; \
189 old = local_cmpxchg((l), c, c + (a)); \
190 if (likely(old == c)) \
191 break; \
192 c = old; \
193 } \
194 c != (u); \
195})
196#define local_inc_not_zero(l) local_add_unless((l), 1, 0)
197
198/* On x86_32, these are no better than the atomic variants.
199 * On x86-64 these are better than the atomic variants on SMP kernels
200 * because they don't use a lock prefix.
201 */
202#define __local_inc(l) local_inc(l)
203#define __local_dec(l) local_dec(l)
204#define __local_add(i, l) local_add((i), (l))
205#define __local_sub(i, l) local_sub((i), (l))
206
207/* Use these for per-cpu local_t variables: on some archs they are
208 * much more efficient than these naive implementations. Note they take
209 * a variable, not an address.
210 *
211 * X86_64: This could be done better if we moved the per cpu data directly
212 * after GS.
213 */
214
215/* Need to disable preemption for the cpu local counters otherwise we could
216 still access a variable of a previous CPU in a non atomic way. */
217#define cpu_local_wrap_v(l) \
218 ({ local_t res__; \
219 preempt_disable(); \
220 res__ = (l); \
221 preempt_enable(); \
222 res__; })
223#define cpu_local_wrap(l) \
224 ({ preempt_disable(); \
225 l; \
226 preempt_enable(); }) \
227
228#define cpu_local_read(l) cpu_local_wrap_v(local_read(&__get_cpu_var(l)))
229#define cpu_local_set(l, i) cpu_local_wrap(local_set(&__get_cpu_var(l), (i)))
230#define cpu_local_inc(l) cpu_local_wrap(local_inc(&__get_cpu_var(l)))
231#define cpu_local_dec(l) cpu_local_wrap(local_dec(&__get_cpu_var(l)))
232#define cpu_local_add(i, l) cpu_local_wrap(local_add((i), &__get_cpu_var(l)))
233#define cpu_local_sub(i, l) cpu_local_wrap(local_sub((i), &__get_cpu_var(l)))
234
235#define __cpu_local_inc(l) cpu_local_inc(l)
236#define __cpu_local_dec(l) cpu_local_dec(l)
237#define __cpu_local_add(i, l) cpu_local_add((i), (l))
238#define __cpu_local_sub(i, l) cpu_local_sub((i), (l))
239
240#endif /* _ARCH_LOCAL_H */
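local_t is intended for counters that are only ever modified by the CPU that owns them, which is why the ops above can skip the lock prefix. A usage sketch under that assumption; the counter name and both helper functions are invented:

	#include <linux/percpu.h>
	#include <asm/local.h>

	static DEFINE_PER_CPU(local_t, example_events) = LOCAL_INIT(0);

	/* Called from interrupt context on the owning CPU, where preemption is
	 * already impossible, so the plain local_* ops are safe: */
	static void example_count_irq_event(void)
	{
		local_inc(&__get_cpu_var(example_events));
	}

	/* Called from preemptible context: the cpu_local_* wrappers disable
	 * preemption around the access for us: */
	static void example_count_process_event(void)
	{
		cpu_local_add(3, example_events);
	}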
diff --git a/include/asm-x86/local_32.h b/include/asm-x86/local_32.h
deleted file mode 100644
index 6e85975b9ed2..000000000000
--- a/include/asm-x86/local_32.h
+++ /dev/null
@@ -1,233 +0,0 @@
1#ifndef _ARCH_I386_LOCAL_H
2#define _ARCH_I386_LOCAL_H
3
4#include <linux/percpu.h>
5#include <asm/system.h>
6#include <asm/atomic.h>
7
8typedef struct
9{
10 atomic_long_t a;
11} local_t;
12
13#define LOCAL_INIT(i) { ATOMIC_LONG_INIT(i) }
14
15#define local_read(l) atomic_long_read(&(l)->a)
16#define local_set(l,i) atomic_long_set(&(l)->a, (i))
17
18static __inline__ void local_inc(local_t *l)
19{
20 __asm__ __volatile__(
21 "incl %0"
22 :"+m" (l->a.counter));
23}
24
25static __inline__ void local_dec(local_t *l)
26{
27 __asm__ __volatile__(
28 "decl %0"
29 :"+m" (l->a.counter));
30}
31
32static __inline__ void local_add(long i, local_t *l)
33{
34 __asm__ __volatile__(
35 "addl %1,%0"
36 :"+m" (l->a.counter)
37 :"ir" (i));
38}
39
40static __inline__ void local_sub(long i, local_t *l)
41{
42 __asm__ __volatile__(
43 "subl %1,%0"
44 :"+m" (l->a.counter)
45 :"ir" (i));
46}
47
48/**
49 * local_sub_and_test - subtract value from variable and test result
50 * @i: integer value to subtract
51 * @l: pointer of type local_t
52 *
53 * Atomically subtracts @i from @l and returns
54 * true if the result is zero, or false for all
55 * other cases.
56 */
57static __inline__ int local_sub_and_test(long i, local_t *l)
58{
59 unsigned char c;
60
61 __asm__ __volatile__(
62 "subl %2,%0; sete %1"
63 :"+m" (l->a.counter), "=qm" (c)
64 :"ir" (i) : "memory");
65 return c;
66}
67
68/**
69 * local_dec_and_test - decrement and test
70 * @l: pointer of type local_t
71 *
72 * Atomically decrements @l by 1 and
73 * returns true if the result is 0, or false for all other
74 * cases.
75 */
76static __inline__ int local_dec_and_test(local_t *l)
77{
78 unsigned char c;
79
80 __asm__ __volatile__(
81 "decl %0; sete %1"
82 :"+m" (l->a.counter), "=qm" (c)
83 : : "memory");
84 return c != 0;
85}
86
87/**
88 * local_inc_and_test - increment and test
89 * @l: pointer of type local_t
90 *
91 * Atomically increments @l by 1
92 * and returns true if the result is zero, or false for all
93 * other cases.
94 */
95static __inline__ int local_inc_and_test(local_t *l)
96{
97 unsigned char c;
98
99 __asm__ __volatile__(
100 "incl %0; sete %1"
101 :"+m" (l->a.counter), "=qm" (c)
102 : : "memory");
103 return c != 0;
104}
105
106/**
107 * local_add_negative - add and test if negative
108 * @l: pointer of type local_t
109 * @i: integer value to add
110 *
111 * Atomically adds @i to @l and returns true
112 * if the result is negative, or false when
113 * result is greater than or equal to zero.
114 */
115static __inline__ int local_add_negative(long i, local_t *l)
116{
117 unsigned char c;
118
119 __asm__ __volatile__(
120 "addl %2,%0; sets %1"
121 :"+m" (l->a.counter), "=qm" (c)
122 :"ir" (i) : "memory");
123 return c;
124}
125
126/**
127 * local_add_return - add and return
128 * @l: pointer of type local_t
129 * @i: integer value to add
130 *
131 * Atomically adds @i to @l and returns @i + @l
132 */
133static __inline__ long local_add_return(long i, local_t *l)
134{
135 long __i;
136#ifdef CONFIG_M386
137 unsigned long flags;
138 if(unlikely(boot_cpu_data.x86 <= 3))
139 goto no_xadd;
140#endif
141 /* Modern 486+ processor */
142 __i = i;
143 __asm__ __volatile__(
144 "xaddl %0, %1;"
145 :"+r" (i), "+m" (l->a.counter)
146 : : "memory");
147 return i + __i;
148
149#ifdef CONFIG_M386
150no_xadd: /* Legacy 386 processor */
151 local_irq_save(flags);
152 __i = local_read(l);
153 local_set(l, i + __i);
154 local_irq_restore(flags);
155 return i + __i;
156#endif
157}
158
159static __inline__ long local_sub_return(long i, local_t *l)
160{
161 return local_add_return(-i,l);
162}
163
164#define local_inc_return(l) (local_add_return(1,l))
165#define local_dec_return(l) (local_sub_return(1,l))
166
167#define local_cmpxchg(l, o, n) \
168 (cmpxchg_local(&((l)->a.counter), (o), (n)))
169/* Always has a lock prefix */
170#define local_xchg(l, n) (xchg(&((l)->a.counter), (n)))
171
172/**
173 * local_add_unless - add unless the number is a given value
174 * @l: pointer of type local_t
175 * @a: the amount to add to l...
176 * @u: ...unless l is equal to u.
177 *
178 * Atomically adds @a to @l, so long as it was not @u.
179 * Returns non-zero if @l was not @u, and zero otherwise.
180 */
181#define local_add_unless(l, a, u) \
182({ \
183 long c, old; \
184 c = local_read(l); \
185 for (;;) { \
186 if (unlikely(c == (u))) \
187 break; \
188 old = local_cmpxchg((l), c, c + (a)); \
189 if (likely(old == c)) \
190 break; \
191 c = old; \
192 } \
193 c != (u); \
194})
195#define local_inc_not_zero(l) local_add_unless((l), 1, 0)
196
197/* On x86, these are no better than the atomic variants. */
198#define __local_inc(l) local_inc(l)
199#define __local_dec(l) local_dec(l)
200#define __local_add(i,l) local_add((i),(l))
201#define __local_sub(i,l) local_sub((i),(l))
202
203/* Use these for per-cpu local_t variables: on some archs they are
204 * much more efficient than these naive implementations. Note they take
205 * a variable, not an address.
206 */
207
208/* Need to disable preemption for the cpu local counters otherwise we could
209 still access a variable of a previous CPU in a non atomic way. */
210#define cpu_local_wrap_v(l) \
211 ({ local_t res__; \
212 preempt_disable(); \
213 res__ = (l); \
214 preempt_enable(); \
215 res__; })
216#define cpu_local_wrap(l) \
217 ({ preempt_disable(); \
218 l; \
219 preempt_enable(); }) \
220
221#define cpu_local_read(l) cpu_local_wrap_v(local_read(&__get_cpu_var(l)))
222#define cpu_local_set(l, i) cpu_local_wrap(local_set(&__get_cpu_var(l), (i)))
223#define cpu_local_inc(l) cpu_local_wrap(local_inc(&__get_cpu_var(l)))
224#define cpu_local_dec(l) cpu_local_wrap(local_dec(&__get_cpu_var(l)))
225#define cpu_local_add(i, l) cpu_local_wrap(local_add((i), &__get_cpu_var(l)))
226#define cpu_local_sub(i, l) cpu_local_wrap(local_sub((i), &__get_cpu_var(l)))
227
228#define __cpu_local_inc(l) cpu_local_inc(l)
229#define __cpu_local_dec(l) cpu_local_dec(l)
230#define __cpu_local_add(i, l) cpu_local_add((i), (l))
231#define __cpu_local_sub(i, l) cpu_local_sub((i), (l))
232
233#endif /* _ARCH_I386_LOCAL_H */
diff --git a/include/asm-x86/local_64.h b/include/asm-x86/local_64.h
deleted file mode 100644
index e87492bb0693..000000000000
--- a/include/asm-x86/local_64.h
+++ /dev/null
@@ -1,222 +0,0 @@
1#ifndef _ARCH_X8664_LOCAL_H
2#define _ARCH_X8664_LOCAL_H
3
4#include <linux/percpu.h>
5#include <asm/atomic.h>
6
7typedef struct
8{
9 atomic_long_t a;
10} local_t;
11
12#define LOCAL_INIT(i) { ATOMIC_LONG_INIT(i) }
13
14#define local_read(l) atomic_long_read(&(l)->a)
15#define local_set(l,i) atomic_long_set(&(l)->a, (i))
16
17static inline void local_inc(local_t *l)
18{
19 __asm__ __volatile__(
20 "incq %0"
21 :"=m" (l->a.counter)
22 :"m" (l->a.counter));
23}
24
25static inline void local_dec(local_t *l)
26{
27 __asm__ __volatile__(
28 "decq %0"
29 :"=m" (l->a.counter)
30 :"m" (l->a.counter));
31}
32
33static inline void local_add(long i, local_t *l)
34{
35 __asm__ __volatile__(
36 "addq %1,%0"
37 :"=m" (l->a.counter)
38 :"ir" (i), "m" (l->a.counter));
39}
40
41static inline void local_sub(long i, local_t *l)
42{
43 __asm__ __volatile__(
44 "subq %1,%0"
45 :"=m" (l->a.counter)
46 :"ir" (i), "m" (l->a.counter));
47}
48
49/**
50 * local_sub_and_test - subtract value from variable and test result
51 * @i: integer value to subtract
52 * @l: pointer to type local_t
53 *
54 * Atomically subtracts @i from @l and returns
55 * true if the result is zero, or false for all
56 * other cases.
57 */
58static __inline__ int local_sub_and_test(long i, local_t *l)
59{
60 unsigned char c;
61
62 __asm__ __volatile__(
63 "subq %2,%0; sete %1"
64 :"=m" (l->a.counter), "=qm" (c)
65 :"ir" (i), "m" (l->a.counter) : "memory");
66 return c;
67}
68
69/**
70 * local_dec_and_test - decrement and test
71 * @l: pointer to type local_t
72 *
73 * Atomically decrements @l by 1 and
74 * returns true if the result is 0, or false for all other
75 * cases.
76 */
77static __inline__ int local_dec_and_test(local_t *l)
78{
79 unsigned char c;
80
81 __asm__ __volatile__(
82 "decq %0; sete %1"
83 :"=m" (l->a.counter), "=qm" (c)
84 :"m" (l->a.counter) : "memory");
85 return c != 0;
86}
87
88/**
89 * local_inc_and_test - increment and test
90 * @l: pointer to type local_t
91 *
92 * Atomically increments @l by 1
93 * and returns true if the result is zero, or false for all
94 * other cases.
95 */
96static __inline__ int local_inc_and_test(local_t *l)
97{
98 unsigned char c;
99
100 __asm__ __volatile__(
101 "incq %0; sete %1"
102 :"=m" (l->a.counter), "=qm" (c)
103 :"m" (l->a.counter) : "memory");
104 return c != 0;
105}
106
107/**
108 * local_add_negative - add and test if negative
109 * @i: integer value to add
110 * @l: pointer to type local_t
111 *
112 * Atomically adds @i to @l and returns true
113 * if the result is negative, or false when
114 * result is greater than or equal to zero.
115 */
116static __inline__ int local_add_negative(long i, local_t *l)
117{
118 unsigned char c;
119
120 __asm__ __volatile__(
121 "addq %2,%0; sets %1"
122 :"=m" (l->a.counter), "=qm" (c)
123 :"ir" (i), "m" (l->a.counter) : "memory");
124 return c;
125}
126
127/**
128 * local_add_return - add and return
129 * @i: integer value to add
130 * @l: pointer to type local_t
131 *
132 * Atomically adds @i to @l and returns @i + @l
133 */
134static __inline__ long local_add_return(long i, local_t *l)
135{
136 long __i = i;
137 __asm__ __volatile__(
138 "xaddq %0, %1;"
139 :"+r" (i), "+m" (l->a.counter)
140 : : "memory");
141 return i + __i;
142}
143
144static __inline__ long local_sub_return(long i, local_t *l)
145{
146 return local_add_return(-i,l);
147}
148
149#define local_inc_return(l) (local_add_return(1,l))
150#define local_dec_return(l) (local_sub_return(1,l))
151
152#define local_cmpxchg(l, o, n) \
153 (cmpxchg_local(&((l)->a.counter), (o), (n)))
154/* Always has a lock prefix */
155#define local_xchg(l, n) (xchg(&((l)->a.counter), (n)))
156
157/**
158 * atomic_up_add_unless - add unless the number is a given value
159 * @l: pointer of type local_t
160 * @a: the amount to add to l...
161 * @u: ...unless l is equal to u.
162 *
163 * Atomically adds @a to @l, so long as it was not @u.
164 * Returns non-zero if @l was not @u, and zero otherwise.
165 */
166#define local_add_unless(l, a, u) \
167({ \
168 long c, old; \
169 c = local_read(l); \
170 for (;;) { \
171 if (unlikely(c == (u))) \
172 break; \
173 old = local_cmpxchg((l), c, c + (a)); \
174 if (likely(old == c)) \
175 break; \
176 c = old; \
177 } \
178 c != (u); \
179})
180#define local_inc_not_zero(l) local_add_unless((l), 1, 0)
181
182/* On x86-64 these are better than the atomic variants on SMP kernels
183 because they dont use a lock prefix. */
184#define __local_inc(l) local_inc(l)
185#define __local_dec(l) local_dec(l)
186#define __local_add(i,l) local_add((i),(l))
187#define __local_sub(i,l) local_sub((i),(l))
188
189/* Use these for per-cpu local_t variables: on some archs they are
190 * much more efficient than these naive implementations. Note they take
191 * a variable, not an address.
192 *
193 * This could be done better if we moved the per cpu data directly
194 * after GS.
195 */
196
197/* Need to disable preemption for the cpu local counters otherwise we could
198 still access a variable of a previous CPU in a non atomic way. */
199#define cpu_local_wrap_v(l) \
200 ({ local_t res__; \
201 preempt_disable(); \
202 res__ = (l); \
203 preempt_enable(); \
204 res__; })
205#define cpu_local_wrap(l) \
206 ({ preempt_disable(); \
207 l; \
208 preempt_enable(); }) \
209
210#define cpu_local_read(l) cpu_local_wrap_v(local_read(&__get_cpu_var(l)))
211#define cpu_local_set(l, i) cpu_local_wrap(local_set(&__get_cpu_var(l), (i)))
212#define cpu_local_inc(l) cpu_local_wrap(local_inc(&__get_cpu_var(l)))
213#define cpu_local_dec(l) cpu_local_wrap(local_dec(&__get_cpu_var(l)))
214#define cpu_local_add(i, l) cpu_local_wrap(local_add((i), &__get_cpu_var(l)))
215#define cpu_local_sub(i, l) cpu_local_wrap(local_sub((i), &__get_cpu_var(l)))
216
217#define __cpu_local_inc(l) cpu_local_inc(l)
218#define __cpu_local_dec(l) cpu_local_dec(l)
219#define __cpu_local_add(i, l) cpu_local_add((i), (l))
220#define __cpu_local_sub(i, l) cpu_local_sub((i), (l))
221
222#endif /* _ARCH_X8664_LOCAL_H */
diff --git a/include/asm-x86/mach-bigsmp/mach_apic.h b/include/asm-x86/mach-bigsmp/mach_apic.h
index ebd319f838ab..6df235e8ea91 100644
--- a/include/asm-x86/mach-bigsmp/mach_apic.h
+++ b/include/asm-x86/mach-bigsmp/mach_apic.h
@@ -110,13 +110,13 @@ static inline int cpu_to_logical_apicid(int cpu)
110} 110}
111 111
112static inline int mpc_apic_id(struct mpc_config_processor *m, 112static inline int mpc_apic_id(struct mpc_config_processor *m,
113 struct mpc_config_translation *translation_record) 113 struct mpc_config_translation *translation_record)
114{ 114{
115 printk("Processor #%d %ld:%ld APIC version %d\n", 115 printk("Processor #%d %u:%u APIC version %d\n",
116 m->mpc_apicid, 116 m->mpc_apicid,
117 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8, 117 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
118 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4, 118 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
119 m->mpc_apicver); 119 m->mpc_apicver);
120 return m->mpc_apicid; 120 return m->mpc_apicid;
121} 121}
122 122
diff --git a/include/asm-x86/mach-default/apm.h b/include/asm-x86/mach-default/apm.h
index 1f730b8bd1fd..989f34c37d32 100644
--- a/include/asm-x86/mach-default/apm.h
+++ b/include/asm-x86/mach-default/apm.h
@@ -1,6 +1,4 @@
1/* 1/*
2 * include/asm-i386/mach-default/apm.h
3 *
4 * Machine specific APM BIOS functions for generic. 2 * Machine specific APM BIOS functions for generic.
5 * Split out from apm.c by Osamu Tomita <tomita@cinet.co.jp> 3 * Split out from apm.c by Osamu Tomita <tomita@cinet.co.jp>
6 */ 4 */
diff --git a/include/asm-x86/mach-default/io_ports.h b/include/asm-x86/mach-default/io_ports.h
deleted file mode 100644
index 48540ba97166..000000000000
--- a/include/asm-x86/mach-default/io_ports.h
+++ /dev/null
@@ -1,25 +0,0 @@
1/*
2 * arch/i386/mach-generic/io_ports.h
3 *
4 * Machine specific IO port address definition for generic.
5 * Written by Osamu Tomita <tomita@cinet.co.jp>
6 */
7#ifndef _MACH_IO_PORTS_H
8#define _MACH_IO_PORTS_H
9
10/* i8259A PIC registers */
11#define PIC_MASTER_CMD 0x20
12#define PIC_MASTER_IMR 0x21
13#define PIC_MASTER_ISR PIC_MASTER_CMD
14#define PIC_MASTER_POLL PIC_MASTER_ISR
15#define PIC_MASTER_OCW3 PIC_MASTER_ISR
16#define PIC_SLAVE_CMD 0xa0
17#define PIC_SLAVE_IMR 0xa1
18
19/* i8259A PIC related value */
20#define PIC_CASCADE_IR 2
21#define MASTER_ICW4_DEFAULT 0x01
22#define SLAVE_ICW4_DEFAULT 0x01
23#define PIC_ICW4_AEOI 2
24
25#endif /* !_MACH_IO_PORTS_H */
diff --git a/include/asm-x86/mach-default/mach_apic.h b/include/asm-x86/mach-default/mach_apic.h
index 6db1c3babe9a..e3c2c1012c1c 100644
--- a/include/asm-x86/mach-default/mach_apic.h
+++ b/include/asm-x86/mach-default/mach_apic.h
@@ -89,15 +89,15 @@ static inline physid_mask_t apicid_to_cpu_present(int phys_apicid)
89 return physid_mask_of_physid(phys_apicid); 89 return physid_mask_of_physid(phys_apicid);
90} 90}
91 91
92static inline int mpc_apic_id(struct mpc_config_processor *m, 92static inline int mpc_apic_id(struct mpc_config_processor *m,
93 struct mpc_config_translation *translation_record) 93 struct mpc_config_translation *translation_record)
94{ 94{
95 printk("Processor #%d %ld:%ld APIC version %d\n", 95 printk("Processor #%d %u:%u APIC version %d\n",
96 m->mpc_apicid, 96 m->mpc_apicid,
97 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8, 97 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
98 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4, 98 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
99 m->mpc_apicver); 99 m->mpc_apicver);
100 return (m->mpc_apicid); 100 return m->mpc_apicid;
101} 101}
102 102
103static inline void setup_portio_remap(void) 103static inline void setup_portio_remap(void)
diff --git a/include/asm-x86/mach-default/mach_time.h b/include/asm-x86/mach-default/mach_time.h
deleted file mode 100644
index 31eb5de6f3dc..000000000000
--- a/include/asm-x86/mach-default/mach_time.h
+++ /dev/null
@@ -1,111 +0,0 @@
1/*
2 * include/asm-i386/mach-default/mach_time.h
3 *
4 * Machine specific set RTC function for generic.
5 * Split out from time.c by Osamu Tomita <tomita@cinet.co.jp>
6 */
7#ifndef _MACH_TIME_H
8#define _MACH_TIME_H
9
10#include <linux/mc146818rtc.h>
11
12/* for check timing call set_rtc_mmss() 500ms */
13/* used in arch/i386/time.c::do_timer_interrupt() */
14#define USEC_AFTER 500000
15#define USEC_BEFORE 500000
16
17/*
18 * In order to set the CMOS clock precisely, set_rtc_mmss has to be
19 * called 500 ms after the second nowtime has started, because when
20 * nowtime is written into the registers of the CMOS clock, it will
21 * jump to the next second precisely 500 ms later. Check the Motorola
22 * MC146818A or Dallas DS12887 data sheet for details.
23 *
24 * BUG: This routine does not handle hour overflow properly; it just
25 * sets the minutes. Usually you'll only notice that after reboot!
26 */
27static inline int mach_set_rtc_mmss(unsigned long nowtime)
28{
29 int retval = 0;
30 int real_seconds, real_minutes, cmos_minutes;
31 unsigned char save_control, save_freq_select;
32
33 save_control = CMOS_READ(RTC_CONTROL); /* tell the clock it's being set */
34 CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL);
35
36 save_freq_select = CMOS_READ(RTC_FREQ_SELECT); /* stop and reset prescaler */
37 CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT);
38
39 cmos_minutes = CMOS_READ(RTC_MINUTES);
40 if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
41 BCD_TO_BIN(cmos_minutes);
42
43 /*
44 * since we're only adjusting minutes and seconds,
45 * don't interfere with hour overflow. This avoids
46 * messing with unknown time zones but requires your
47 * RTC not to be off by more than 15 minutes
48 */
49 real_seconds = nowtime % 60;
50 real_minutes = nowtime / 60;
51 if (((abs(real_minutes - cmos_minutes) + 15)/30) & 1)
52 real_minutes += 30; /* correct for half hour time zone */
53 real_minutes %= 60;
54
55 if (abs(real_minutes - cmos_minutes) < 30) {
56 if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
57 BIN_TO_BCD(real_seconds);
58 BIN_TO_BCD(real_minutes);
59 }
60 CMOS_WRITE(real_seconds,RTC_SECONDS);
61 CMOS_WRITE(real_minutes,RTC_MINUTES);
62 } else {
63 printk(KERN_WARNING
64 "set_rtc_mmss: can't update from %d to %d\n",
65 cmos_minutes, real_minutes);
66 retval = -1;
67 }
68
69 /* The following flags have to be released exactly in this order,
70 * otherwise the DS12887 (popular MC146818A clone with integrated
71 * battery and quartz) will not reset the oscillator and will not
72 * update precisely 500 ms later. You won't find this mentioned in
73 * the Dallas Semiconductor data sheets, but who believes data
74 * sheets anyway ... -- Markus Kuhn
75 */
76 CMOS_WRITE(save_control, RTC_CONTROL);
77 CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
78
79 return retval;
80}
81
82static inline unsigned long mach_get_cmos_time(void)
83{
84 unsigned int year, mon, day, hour, min, sec;
85
86 do {
87 sec = CMOS_READ(RTC_SECONDS);
88 min = CMOS_READ(RTC_MINUTES);
89 hour = CMOS_READ(RTC_HOURS);
90 day = CMOS_READ(RTC_DAY_OF_MONTH);
91 mon = CMOS_READ(RTC_MONTH);
92 year = CMOS_READ(RTC_YEAR);
93 } while (sec != CMOS_READ(RTC_SECONDS));
94
95 if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
96 BCD_TO_BIN(sec);
97 BCD_TO_BIN(min);
98 BCD_TO_BIN(hour);
99 BCD_TO_BIN(day);
100 BCD_TO_BIN(mon);
101 BCD_TO_BIN(year);
102 }
103
104 year += 1900;
105 if (year < 1970)
106 year += 100;
107
108 return mktime(year, mon, day, hour, min, sec);
109}
110
111#endif /* !_MACH_TIME_H */
diff --git a/include/asm-x86/mach-default/mach_timer.h b/include/asm-x86/mach-default/mach_timer.h
index 807992fd4171..4b76e536cd98 100644
--- a/include/asm-x86/mach-default/mach_timer.h
+++ b/include/asm-x86/mach-default/mach_timer.h
@@ -1,6 +1,4 @@
1/* 1/*
2 * include/asm-i386/mach-default/mach_timer.h
3 *
4 * Machine specific calibrate_tsc() for generic. 2 * Machine specific calibrate_tsc() for generic.
5 * Split out from timer_tsc.c by Osamu Tomita <tomita@cinet.co.jp> 3 * Split out from timer_tsc.c by Osamu Tomita <tomita@cinet.co.jp>
6 */ 4 */
diff --git a/include/asm-x86/mach-default/mach_traps.h b/include/asm-x86/mach-default/mach_traps.h
index 625438b8a6eb..2fe7705c0484 100644
--- a/include/asm-x86/mach-default/mach_traps.h
+++ b/include/asm-x86/mach-default/mach_traps.h
@@ -1,6 +1,4 @@
1/* 1/*
2 * include/asm-i386/mach-default/mach_traps.h
3 *
4 * Machine specific NMI handling for generic. 2 * Machine specific NMI handling for generic.
5 * Split out from traps.c by Osamu Tomita <tomita@cinet.co.jp> 3 * Split out from traps.c by Osamu Tomita <tomita@cinet.co.jp>
6 */ 4 */
diff --git a/include/asm-x86/mach-es7000/mach_apic.h b/include/asm-x86/mach-es7000/mach_apic.h
index caec64be516d..d23011fdf454 100644
--- a/include/asm-x86/mach-es7000/mach_apic.h
+++ b/include/asm-x86/mach-es7000/mach_apic.h
@@ -131,11 +131,11 @@ static inline int cpu_to_logical_apicid(int cpu)
131 131
132static inline int mpc_apic_id(struct mpc_config_processor *m, struct mpc_config_translation *unused) 132static inline int mpc_apic_id(struct mpc_config_processor *m, struct mpc_config_translation *unused)
133{ 133{
134 printk("Processor #%d %ld:%ld APIC version %d\n", 134 printk("Processor #%d %u:%u APIC version %d\n",
135 m->mpc_apicid, 135 m->mpc_apicid,
136 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8, 136 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
137 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4, 137 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
138 m->mpc_apicver); 138 m->mpc_apicver);
139 return (m->mpc_apicid); 139 return (m->mpc_apicid);
140} 140}
141 141
diff --git a/include/asm-x86/mach-generic/gpio.h b/include/asm-x86/mach-generic/gpio.h
new file mode 100644
index 000000000000..5305dcb96df2
--- /dev/null
+++ b/include/asm-x86/mach-generic/gpio.h
@@ -0,0 +1,15 @@
1#ifndef __ASM_MACH_GENERIC_GPIO_H
2#define __ASM_MACH_GENERIC_GPIO_H
3
4int gpio_request(unsigned gpio, const char *label);
5void gpio_free(unsigned gpio);
6int gpio_direction_input(unsigned gpio);
7int gpio_direction_output(unsigned gpio, int value);
8int gpio_get_value(unsigned gpio);
9void gpio_set_value(unsigned gpio, int value);
10int gpio_to_irq(unsigned gpio);
11int irq_to_gpio(unsigned irq);
12
13#include <asm-generic/gpio.h> /* cansleep wrappers */
14
15#endif /* __ASM_MACH_GENERIC_GPIO_H */
diff --git a/include/asm-x86/mach-numaq/mach_apic.h b/include/asm-x86/mach-numaq/mach_apic.h
index 5e5e7dd2692e..17e183bd39c1 100644
--- a/include/asm-x86/mach-numaq/mach_apic.h
+++ b/include/asm-x86/mach-numaq/mach_apic.h
@@ -101,11 +101,11 @@ static inline int mpc_apic_id(struct mpc_config_processor *m,
101 int quad = translation_record->trans_quad; 101 int quad = translation_record->trans_quad;
102 int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid); 102 int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
103 103
104 printk("Processor #%d %ld:%ld APIC version %d (quad %d, apic %d)\n", 104 printk("Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
105 m->mpc_apicid, 105 m->mpc_apicid,
106 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8, 106 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
107 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4, 107 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
108 m->mpc_apicver, quad, logical_apicid); 108 m->mpc_apicver, quad, logical_apicid);
109 return logical_apicid; 109 return logical_apicid;
110} 110}
111 111
diff --git a/include/asm-x86/mach-rdc321x/gpio.h b/include/asm-x86/mach-rdc321x/gpio.h
new file mode 100644
index 000000000000..db31b929b990
--- /dev/null
+++ b/include/asm-x86/mach-rdc321x/gpio.h
@@ -0,0 +1,56 @@
1#ifndef _RDC321X_GPIO_H
2#define _RDC321X_GPIO_H
3
4extern int rdc_gpio_get_value(unsigned gpio);
5extern void rdc_gpio_set_value(unsigned gpio, int value);
6extern int rdc_gpio_direction_input(unsigned gpio);
7extern int rdc_gpio_direction_output(unsigned gpio, int value);
8
9
10/* Wrappers for the arch-neutral GPIO API */
11
12static inline int gpio_request(unsigned gpio, const char *label)
13{
14 /* Not yet implemented */
15 return 0;
16}
17
18static inline void gpio_free(unsigned gpio)
19{
20 /* Not yet implemented */
21}
22
23static inline int gpio_direction_input(unsigned gpio)
24{
25 return rdc_gpio_direction_input(gpio);
26}
27
28static inline int gpio_direction_output(unsigned gpio, int value)
29{
30 return rdc_gpio_direction_output(gpio, value);
31}
32
33static inline int gpio_get_value(unsigned gpio)
34{
35 return rdc_gpio_get_value(gpio);
36}
37
38static inline void gpio_set_value(unsigned gpio, int value)
39{
40 rdc_gpio_set_value(gpio, value);
41}
42
43static inline int gpio_to_irq(unsigned gpio)
44{
45 return gpio;
46}
47
48static inline int irq_to_gpio(unsigned irq)
49{
50 return irq;
51}
52
53/* For cansleep */
54#include <asm-generic/gpio.h>
55
56#endif /* _RDC321X_GPIO_H_ */
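Both gpio.h variants above expose the arch-neutral GPIO calls, so a driver using them stays machine-independent. A hedged usage sketch; GPIO number 7, the label, and the include path are assumptions for illustration:

	#include <asm/gpio.h>		/* assumed include path for these headers */

	static int example_led_on(void)
	{
		int err = gpio_request(7, "example-led");  /* no-op on RDC321x for now */
		if (err)
			return err;

		gpio_direction_output(7, 0);	/* configure as output, initially low */
		gpio_set_value(7, 1);		/* drive the line high */
		gpio_free(7);
		return 0;
	}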
diff --git a/include/asm-x86/mach-rdc321x/rdc321x_defs.h b/include/asm-x86/mach-rdc321x/rdc321x_defs.h
new file mode 100644
index 000000000000..838ba8f64fd3
--- /dev/null
+++ b/include/asm-x86/mach-rdc321x/rdc321x_defs.h
@@ -0,0 +1,6 @@
1#define PFX "rdc321x: "
2
3/* General purpose configuration and data registers */
4#define RDC3210_CFGREG_ADDR 0x0CF8
5#define RDC3210_CFGREG_DATA 0x0CFC
6#define RDC_MAX_GPIO 0x3A
diff --git a/include/asm-x86/mach-summit/mach_apic.h b/include/asm-x86/mach-summit/mach_apic.h
index 732f776aab8e..062c97f6100b 100644
--- a/include/asm-x86/mach-summit/mach_apic.h
+++ b/include/asm-x86/mach-summit/mach_apic.h
@@ -126,15 +126,15 @@ static inline physid_mask_t apicid_to_cpu_present(int apicid)
126 return physid_mask_of_physid(0); 126 return physid_mask_of_physid(0);
127} 127}
128 128
129static inline int mpc_apic_id(struct mpc_config_processor *m, 129static inline int mpc_apic_id(struct mpc_config_processor *m,
130 struct mpc_config_translation *translation_record) 130 struct mpc_config_translation *translation_record)
131{ 131{
132 printk("Processor #%d %ld:%ld APIC version %d\n", 132 printk("Processor #%d %u:%u APIC version %d\n",
133 m->mpc_apicid, 133 m->mpc_apicid,
134 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8, 134 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
135 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4, 135 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
136 m->mpc_apicver); 136 m->mpc_apicver);
137 return (m->mpc_apicid); 137 return m->mpc_apicid;
138} 138}
139 139
140static inline void setup_portio_remap(void) 140static inline void setup_portio_remap(void)
diff --git a/include/asm-x86/math_emu.h b/include/asm-x86/math_emu.h
index a4b0aa3320e6..9bf4ae93ab10 100644
--- a/include/asm-x86/math_emu.h
+++ b/include/asm-x86/math_emu.h
@@ -1,11 +1,6 @@
1#ifndef _I386_MATH_EMU_H 1#ifndef _I386_MATH_EMU_H
2#define _I386_MATH_EMU_H 2#define _I386_MATH_EMU_H
3 3
4#include <asm/sigcontext.h>
5
6int restore_i387_soft(void *s387, struct _fpstate __user *buf);
7int save_i387_soft(void *s387, struct _fpstate __user *buf);
8
9/* This structure matches the layout of the data saved to the stack 4/* This structure matches the layout of the data saved to the stack
10 following a device-not-present interrupt, part of it saved 5 following a device-not-present interrupt, part of it saved
11 automatically by the 80386/80486. 6 automatically by the 80386/80486.
diff --git a/include/asm-x86/mc146818rtc.h b/include/asm-x86/mc146818rtc.h
index 5c2bb66caf17..cdd9f965835a 100644
--- a/include/asm-x86/mc146818rtc.h
+++ b/include/asm-x86/mc146818rtc.h
@@ -1,5 +1,100 @@
1#ifdef CONFIG_X86_32 1/*
2# include "mc146818rtc_32.h" 2 * Machine dependent access functions for RTC registers.
3 */
4#ifndef _ASM_MC146818RTC_H
5#define _ASM_MC146818RTC_H
6
7#include <asm/io.h>
8#include <asm/system.h>
9#include <asm/processor.h>
10#include <linux/mc146818rtc.h>
11
12#ifndef RTC_PORT
13#define RTC_PORT(x) (0x70 + (x))
14#define RTC_ALWAYS_BCD 1 /* RTC operates in binary mode */
15#endif
16
17#if defined(CONFIG_X86_32) && defined(__HAVE_ARCH_CMPXCHG)
18/*
19 * This lock provides nmi access to the CMOS/RTC registers. It has some
20 * special properties. It is owned by a CPU and stores the index register
21 * currently being accessed (if owned). The idea here is that it works
22 * like a normal lock (normally). However, in an NMI, the NMI code will
23 * first check to see if its CPU owns the lock, meaning that the NMI
24 * interrupted during the read/write of the device. If it does, it goes ahead
25 * and performs the access and then restores the index register. If it does
26 * not, it locks normally.
27 *
28 * Note that since we are working with NMIs, we need this lock even in
29 * a non-SMP machine just to mark that the lock is owned.
30 *
31 * This only works with compare-and-swap. There is no other way to
32 * atomically claim the lock and set the owner.
33 */
34#include <linux/smp.h>
35extern volatile unsigned long cmos_lock;
36
37/*
38 * All of these below must be called with interrupts off, preempt
39 * disabled, etc.
40 */
41
42static inline void lock_cmos(unsigned char reg)
43{
44 unsigned long new;
45 new = ((smp_processor_id()+1) << 8) | reg;
46 for (;;) {
47 if (cmos_lock) {
48 cpu_relax();
49 continue;
50 }
51 if (__cmpxchg(&cmos_lock, 0, new, sizeof(cmos_lock)) == 0)
52 return;
53 }
54}
55
56static inline void unlock_cmos(void)
57{
58 cmos_lock = 0;
59}
60static inline int do_i_have_lock_cmos(void)
61{
62 return (cmos_lock >> 8) == (smp_processor_id()+1);
63}
64static inline unsigned char current_lock_cmos_reg(void)
65{
66 return cmos_lock & 0xff;
67}
68#define lock_cmos_prefix(reg) \
69 do { \
70 unsigned long cmos_flags; \
71 local_irq_save(cmos_flags); \
72 lock_cmos(reg)
73#define lock_cmos_suffix(reg) \
74 unlock_cmos(); \
75 local_irq_restore(cmos_flags); \
76 } while (0)
3#else 77#else
4# include "mc146818rtc_64.h" 78#define lock_cmos_prefix(reg) do {} while (0)
79#define lock_cmos_suffix(reg) do {} while (0)
80#define lock_cmos(reg)
81#define unlock_cmos()
82#define do_i_have_lock_cmos() 0
83#define current_lock_cmos_reg() 0
5#endif 84#endif
85
86/*
87 * The yet supported machines all access the RTC index register via
88 * an ISA port access but the way to access the date register differs ...
89 */
90#define CMOS_READ(addr) rtc_cmos_read(addr)
91#define CMOS_WRITE(val, addr) rtc_cmos_write(val, addr)
92unsigned char rtc_cmos_read(unsigned char addr);
93void rtc_cmos_write(unsigned char val, unsigned char addr);
94
95extern int mach_set_rtc_mmss(unsigned long nowtime);
96extern unsigned long mach_get_cmos_time(void);
97
98#define RTC_IRQ 8
99
100#endif /* _ASM_MC146818RTC_H */
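[Editor's note: to illustrate the NMI-safe locking described in the comment above, this is roughly how an arch implementation of rtc_cmos_read() could be written against these macros; it is a sketch, not code added by this patch.]

	unsigned char rtc_cmos_read(unsigned char addr)
	{
		unsigned char val;

		lock_cmos_prefix(addr);		/* irqs off + claim cmos_lock for this CPU */
		outb_p(addr, RTC_PORT(0));	/* select index register */
		val = inb_p(RTC_PORT(1));	/* read data register */
		lock_cmos_suffix(addr);		/* release lock, restore irqs */
		return val;
	}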
diff --git a/include/asm-x86/mc146818rtc_32.h b/include/asm-x86/mc146818rtc_32.h
deleted file mode 100644
index 1613b42eaf58..000000000000
--- a/include/asm-x86/mc146818rtc_32.h
+++ /dev/null
@@ -1,97 +0,0 @@
1/*
2 * Machine dependent access functions for RTC registers.
3 */
4#ifndef _ASM_MC146818RTC_H
5#define _ASM_MC146818RTC_H
6
7#include <asm/io.h>
8#include <asm/system.h>
9#include <asm/processor.h>
10#include <linux/mc146818rtc.h>
11
12#ifndef RTC_PORT
13#define RTC_PORT(x) (0x70 + (x))
14#define RTC_ALWAYS_BCD 1 /* RTC operates in binary mode */
15#endif
16
17#ifdef __HAVE_ARCH_CMPXCHG
18/*
19 * This lock provides nmi access to the CMOS/RTC registers. It has some
20 * special properties. It is owned by a CPU and stores the index register
21 * currently being accessed (if owned). The idea here is that it works
22 * like a normal lock (normally). However, in an NMI, the NMI code will
23 * first check to see if its CPU owns the lock, meaning that the NMI
24 * interrupted during the read/write of the device. If it does, it goes ahead
25 * and performs the access and then restores the index register. If it does
26 * not, it locks normally.
27 *
28 * Note that since we are working with NMIs, we need this lock even in
29 * a non-SMP machine just to mark that the lock is owned.
30 *
31 * This only works with compare-and-swap. There is no other way to
32 * atomically claim the lock and set the owner.
33 */
34#include <linux/smp.h>
35extern volatile unsigned long cmos_lock;
36
37/*
38 * All of these below must be called with interrupts off, preempt
39 * disabled, etc.
40 */
41
42static inline void lock_cmos(unsigned char reg)
43{
44 unsigned long new;
45 new = ((smp_processor_id()+1) << 8) | reg;
46 for (;;) {
47 if (cmos_lock) {
48 cpu_relax();
49 continue;
50 }
51 if (__cmpxchg(&cmos_lock, 0, new, sizeof(cmos_lock)) == 0)
52 return;
53 }
54}
55
56static inline void unlock_cmos(void)
57{
58 cmos_lock = 0;
59}
60static inline int do_i_have_lock_cmos(void)
61{
62 return (cmos_lock >> 8) == (smp_processor_id()+1);
63}
64static inline unsigned char current_lock_cmos_reg(void)
65{
66 return cmos_lock & 0xff;
67}
68#define lock_cmos_prefix(reg) \
69 do { \
70 unsigned long cmos_flags; \
71 local_irq_save(cmos_flags); \
72 lock_cmos(reg)
73#define lock_cmos_suffix(reg) \
74 unlock_cmos(); \
75 local_irq_restore(cmos_flags); \
76 } while (0)
77#else
78#define lock_cmos_prefix(reg) do {} while (0)
79#define lock_cmos_suffix(reg) do {} while (0)
80#define lock_cmos(reg)
81#define unlock_cmos()
82#define do_i_have_lock_cmos() 0
83#define current_lock_cmos_reg() 0
84#endif
85
86/*
87 * The yet supported machines all access the RTC index register via
88 * an ISA port access but the way to access the date register differs ...
89 */
90#define CMOS_READ(addr) rtc_cmos_read(addr)
91#define CMOS_WRITE(val, addr) rtc_cmos_write(val, addr)
92unsigned char rtc_cmos_read(unsigned char addr);
93void rtc_cmos_write(unsigned char val, unsigned char addr);
94
95#define RTC_IRQ 8
96
97#endif /* _ASM_MC146818RTC_H */
diff --git a/include/asm-x86/mc146818rtc_64.h b/include/asm-x86/mc146818rtc_64.h
deleted file mode 100644
index d6e3009430c1..000000000000
--- a/include/asm-x86/mc146818rtc_64.h
+++ /dev/null
@@ -1,29 +0,0 @@
1/*
2 * Machine dependent access functions for RTC registers.
3 */
4#ifndef _ASM_MC146818RTC_H
5#define _ASM_MC146818RTC_H
6
7#include <asm/io.h>
8
9#ifndef RTC_PORT
10#define RTC_PORT(x) (0x70 + (x))
11#define RTC_ALWAYS_BCD 1 /* RTC operates in binary mode */
12#endif
13
14/*
15 * The yet supported machines all access the RTC index register via
16 * an ISA port access but the way to access the date register differs ...
17 */
18#define CMOS_READ(addr) ({ \
19outb_p((addr),RTC_PORT(0)); \
20inb_p(RTC_PORT(1)); \
21})
22#define CMOS_WRITE(val, addr) ({ \
23outb_p((addr),RTC_PORT(0)); \
24outb_p((val),RTC_PORT(1)); \
25})
26
27#define RTC_IRQ 8
28
29#endif /* _ASM_MC146818RTC_H */
diff --git a/include/asm-x86/mce.h b/include/asm-x86/mce.h
index df304fd89c27..94f1fd79e22a 100644
--- a/include/asm-x86/mce.h
+++ b/include/asm-x86/mce.h
@@ -13,7 +13,7 @@
13#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */ 13#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */
14 14
15#define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */ 15#define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */
16#define MCG_STATUS_EIPV (1UL<<1) /* eip points to correct instruction */ 16#define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */
17#define MCG_STATUS_MCIP (1UL<<2) /* machine check in progress */ 17#define MCG_STATUS_MCIP (1UL<<2) /* machine check in progress */
18 18
19#define MCI_STATUS_VAL (1UL<<63) /* valid error */ 19#define MCI_STATUS_VAL (1UL<<63) /* valid error */
@@ -30,7 +30,7 @@ struct mce {
30 __u64 misc; 30 __u64 misc;
31 __u64 addr; 31 __u64 addr;
32 __u64 mcgstatus; 32 __u64 mcgstatus;
33 __u64 rip; 33 __u64 ip;
34 __u64 tsc; /* cpu time stamp counter */ 34 __u64 tsc; /* cpu time stamp counter */
35 __u64 res1; /* for future extension */ 35 __u64 res1; /* for future extension */
36 __u64 res2; /* dito. */ 36 __u64 res2; /* dito. */
@@ -85,14 +85,7 @@ struct mce_log {
85#ifdef __KERNEL__ 85#ifdef __KERNEL__
86 86
87#ifdef CONFIG_X86_32 87#ifdef CONFIG_X86_32
88#ifdef CONFIG_X86_MCE
89extern void mcheck_init(struct cpuinfo_x86 *c);
90#else
91#define mcheck_init(c) do {} while(0)
92#endif
93
94extern int mce_disabled; 88extern int mce_disabled;
95
96#else /* CONFIG_X86_32 */ 89#else /* CONFIG_X86_32 */
97 90
98#include <asm/atomic.h> 91#include <asm/atomic.h>
@@ -121,6 +114,13 @@ extern int mce_notify_user(void);
121 114
122#endif /* !CONFIG_X86_32 */ 115#endif /* !CONFIG_X86_32 */
123 116
117
118
119#ifdef CONFIG_X86_MCE
120extern void mcheck_init(struct cpuinfo_x86 *c);
121#else
122#define mcheck_init(c) do { } while (0)
123#endif
124extern void stop_mce(void); 124extern void stop_mce(void);
125extern void restart_mce(void); 125extern void restart_mce(void);
126 126
diff --git a/include/asm-x86/mmsegment.h b/include/asm-x86/mmsegment.h
deleted file mode 100644
index d3f80c996330..000000000000
--- a/include/asm-x86/mmsegment.h
+++ /dev/null
@@ -1,8 +0,0 @@
1#ifndef _ASM_MMSEGMENT_H
2#define _ASM_MMSEGMENT_H 1
3
4typedef struct {
5 unsigned long seg;
6} mm_segment_t;
7
8#endif
diff --git a/include/asm-x86/mmu.h b/include/asm-x86/mmu.h
index 3f922c8e1c88..efa962c38897 100644
--- a/include/asm-x86/mmu.h
+++ b/include/asm-x86/mmu.h
@@ -20,4 +20,12 @@ typedef struct {
20 void *vdso; 20 void *vdso;
21} mm_context_t; 21} mm_context_t;
22 22
23#ifdef CONFIG_SMP
24void leave_mm(int cpu);
25#else
26static inline void leave_mm(int cpu)
27{
28}
29#endif
30
23#endif /* _ASM_X86_MMU_H */ 31#endif /* _ASM_X86_MMU_H */
diff --git a/include/asm-x86/mmu_context_32.h b/include/asm-x86/mmu_context_32.h
index 7eb0b0b1fb3c..8198d1cca1f3 100644
--- a/include/asm-x86/mmu_context_32.h
+++ b/include/asm-x86/mmu_context_32.h
@@ -32,8 +32,6 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
32#endif 32#endif
33} 33}
34 34
35void leave_mm(unsigned long cpu);
36
37static inline void switch_mm(struct mm_struct *prev, 35static inline void switch_mm(struct mm_struct *prev,
38 struct mm_struct *next, 36 struct mm_struct *next,
39 struct task_struct *tsk) 37 struct task_struct *tsk)
diff --git a/include/asm-x86/mmu_context_64.h b/include/asm-x86/mmu_context_64.h
index 0cce83a78378..ad6dc821ef9e 100644
--- a/include/asm-x86/mmu_context_64.h
+++ b/include/asm-x86/mmu_context_64.h
@@ -7,7 +7,9 @@
7#include <asm/pda.h> 7#include <asm/pda.h>
8#include <asm/pgtable.h> 8#include <asm/pgtable.h>
9#include <asm/tlbflush.h> 9#include <asm/tlbflush.h>
10#ifndef CONFIG_PARAVIRT
10#include <asm-generic/mm_hooks.h> 11#include <asm-generic/mm_hooks.h>
12#endif
11 13
12/* 14/*
13 * possibly do the LDT unload here? 15 * possibly do the LDT unload here?
@@ -23,11 +25,6 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
23#endif 25#endif
24} 26}
25 27
26static inline void load_cr3(pgd_t *pgd)
27{
28 asm volatile("movq %0,%%cr3" :: "r" (__pa(pgd)) : "memory");
29}
30
31static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, 28static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
32 struct task_struct *tsk) 29 struct task_struct *tsk)
33{ 30{
@@ -43,20 +40,20 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
43 load_cr3(next->pgd); 40 load_cr3(next->pgd);
44 41
45 if (unlikely(next->context.ldt != prev->context.ldt)) 42 if (unlikely(next->context.ldt != prev->context.ldt))
46 load_LDT_nolock(&next->context, cpu); 43 load_LDT_nolock(&next->context);
47 } 44 }
48#ifdef CONFIG_SMP 45#ifdef CONFIG_SMP
49 else { 46 else {
50 write_pda(mmu_state, TLBSTATE_OK); 47 write_pda(mmu_state, TLBSTATE_OK);
51 if (read_pda(active_mm) != next) 48 if (read_pda(active_mm) != next)
52 out_of_line_bug(); 49 BUG();
53 if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { 50 if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
54 /* We were in lazy tlb mode and leave_mm disabled 51 /* We were in lazy tlb mode and leave_mm disabled
55 * tlb flush IPI delivery. We must reload CR3 52 * tlb flush IPI delivery. We must reload CR3
56 * to make sure to use no freed page tables. 53 * to make sure to use no freed page tables.
57 */ 54 */
58 load_cr3(next->pgd); 55 load_cr3(next->pgd);
59 load_LDT_nolock(&next->context, cpu); 56 load_LDT_nolock(&next->context);
60 } 57 }
61 } 58 }
62#endif 59#endif
diff --git a/include/asm-x86/mmzone_32.h b/include/asm-x86/mmzone_32.h
index 118e9812778f..5d6f4ce6e6d6 100644
--- a/include/asm-x86/mmzone_32.h
+++ b/include/asm-x86/mmzone_32.h
@@ -87,9 +87,6 @@ static inline int pfn_to_nid(unsigned long pfn)
87 __pgdat->node_start_pfn + __pgdat->node_spanned_pages; \ 87 __pgdat->node_start_pfn + __pgdat->node_spanned_pages; \
88}) 88})
89 89
90/* XXX: FIXME -- wli */
91#define kern_addr_valid(kaddr) (0)
92
93#ifdef CONFIG_X86_NUMAQ /* we have contiguous memory on NUMA-Q */ 90#ifdef CONFIG_X86_NUMAQ /* we have contiguous memory on NUMA-Q */
94#define pfn_valid(pfn) ((pfn) < num_physpages) 91#define pfn_valid(pfn) ((pfn) < num_physpages)
95#else 92#else
diff --git a/include/asm-x86/mmzone_64.h b/include/asm-x86/mmzone_64.h
index 19a89377b123..ebaf9663aa8a 100644
--- a/include/asm-x86/mmzone_64.h
+++ b/include/asm-x86/mmzone_64.h
@@ -15,9 +15,9 @@
15struct memnode { 15struct memnode {
16 int shift; 16 int shift;
17 unsigned int mapsize; 17 unsigned int mapsize;
18 u8 *map; 18 s16 *map;
19 u8 embedded_map[64-16]; 19 s16 embedded_map[64-8];
20} ____cacheline_aligned; /* total size = 64 bytes */ 20} ____cacheline_aligned; /* total size = 128 bytes */
21extern struct memnode memnode; 21extern struct memnode memnode;
22#define memnode_shift memnode.shift 22#define memnode_shift memnode.shift
23#define memnodemap memnode.map 23#define memnodemap memnode.map
@@ -41,11 +41,7 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
41#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ 41#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \
42 NODE_DATA(nid)->node_spanned_pages) 42 NODE_DATA(nid)->node_spanned_pages)
43 43
44#ifdef CONFIG_DISCONTIGMEM 44extern int early_pfn_to_nid(unsigned long pfn);
45#define pfn_to_nid(pfn) phys_to_nid((unsigned long)(pfn) << PAGE_SHIFT)
46
47extern int pfn_valid(unsigned long pfn);
48#endif
49 45
50#ifdef CONFIG_NUMA_EMU 46#ifdef CONFIG_NUMA_EMU
51#define FAKE_NODE_MIN_SIZE (64*1024*1024) 47#define FAKE_NODE_MIN_SIZE (64*1024*1024)
diff --git a/include/asm-x86/module.h b/include/asm-x86/module.h
index 2b2f18d8a531..bfedb247871c 100644
--- a/include/asm-x86/module.h
+++ b/include/asm-x86/module.h
@@ -1,5 +1,82 @@
1#ifndef _ASM_MODULE_H
2#define _ASM_MODULE_H
3
4/* x86_32/64 are simple */
5struct mod_arch_specific {};
6
1#ifdef CONFIG_X86_32 7#ifdef CONFIG_X86_32
2# include "module_32.h" 8# define Elf_Shdr Elf32_Shdr
9# define Elf_Sym Elf32_Sym
10# define Elf_Ehdr Elf32_Ehdr
3#else 11#else
4# include "module_64.h" 12# define Elf_Shdr Elf64_Shdr
13# define Elf_Sym Elf64_Sym
14# define Elf_Ehdr Elf64_Ehdr
5#endif 15#endif
16
17#ifdef CONFIG_X86_64
18/* X86_64 does not define MODULE_PROC_FAMILY */
19#elif defined CONFIG_M386
20#define MODULE_PROC_FAMILY "386 "
21#elif defined CONFIG_M486
22#define MODULE_PROC_FAMILY "486 "
23#elif defined CONFIG_M586
24#define MODULE_PROC_FAMILY "586 "
25#elif defined CONFIG_M586TSC
26#define MODULE_PROC_FAMILY "586TSC "
27#elif defined CONFIG_M586MMX
28#define MODULE_PROC_FAMILY "586MMX "
29#elif defined CONFIG_MCORE2
30#define MODULE_PROC_FAMILY "CORE2 "
31#elif defined CONFIG_M686
32#define MODULE_PROC_FAMILY "686 "
33#elif defined CONFIG_MPENTIUMII
34#define MODULE_PROC_FAMILY "PENTIUMII "
35#elif defined CONFIG_MPENTIUMIII
36#define MODULE_PROC_FAMILY "PENTIUMIII "
37#elif defined CONFIG_MPENTIUMM
38#define MODULE_PROC_FAMILY "PENTIUMM "
39#elif defined CONFIG_MPENTIUM4
40#define MODULE_PROC_FAMILY "PENTIUM4 "
41#elif defined CONFIG_MK6
42#define MODULE_PROC_FAMILY "K6 "
43#elif defined CONFIG_MK7
44#define MODULE_PROC_FAMILY "K7 "
45#elif defined CONFIG_MK8
46#define MODULE_PROC_FAMILY "K8 "
47#elif defined CONFIG_X86_ELAN
48#define MODULE_PROC_FAMILY "ELAN "
49#elif defined CONFIG_MCRUSOE
50#define MODULE_PROC_FAMILY "CRUSOE "
51#elif defined CONFIG_MEFFICEON
52#define MODULE_PROC_FAMILY "EFFICEON "
53#elif defined CONFIG_MWINCHIPC6
54#define MODULE_PROC_FAMILY "WINCHIPC6 "
55#elif defined CONFIG_MWINCHIP2
56#define MODULE_PROC_FAMILY "WINCHIP2 "
57#elif defined CONFIG_MWINCHIP3D
58#define MODULE_PROC_FAMILY "WINCHIP3D "
59#elif defined CONFIG_MCYRIXIII
60#define MODULE_PROC_FAMILY "CYRIXIII "
61#elif defined CONFIG_MVIAC3_2
62#define MODULE_PROC_FAMILY "VIAC3-2 "
63#elif defined CONFIG_MVIAC7
64#define MODULE_PROC_FAMILY "VIAC7 "
65#elif defined CONFIG_MGEODEGX1
66#define MODULE_PROC_FAMILY "GEODEGX1 "
67#elif defined CONFIG_MGEODE_LX
68#define MODULE_PROC_FAMILY "GEODE "
69#else
70#error unknown processor family
71#endif
72
73#ifdef CONFIG_X86_32
74# ifdef CONFIG_4KSTACKS
75# define MODULE_STACKSIZE "4KSTACKS "
76# else
77# define MODULE_STACKSIZE ""
78# endif
79# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_STACKSIZE
80#endif
81
82#endif /* _ASM_MODULE_H */
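[Editor's note, worked example not in the patch: with CONFIG_M686 and CONFIG_4KSTACKS set on a 32-bit build, MODULE_PROC_FAMILY is "686 " and MODULE_ARCH_VERMAGIC expands to "686 4KSTACKS ", which becomes the arch component of each module's vermagic string checked at load time; on x86_64 no MODULE_ARCH_VERMAGIC is defined here, so the generic (empty) arch component is used.]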
diff --git a/include/asm-x86/module_32.h b/include/asm-x86/module_32.h
deleted file mode 100644
index 7e5fda6c3976..000000000000
--- a/include/asm-x86/module_32.h
+++ /dev/null
@@ -1,75 +0,0 @@
1#ifndef _ASM_I386_MODULE_H
2#define _ASM_I386_MODULE_H
3
4/* x86 is simple */
5struct mod_arch_specific
6{
7};
8
9#define Elf_Shdr Elf32_Shdr
10#define Elf_Sym Elf32_Sym
11#define Elf_Ehdr Elf32_Ehdr
12
13#ifdef CONFIG_M386
14#define MODULE_PROC_FAMILY "386 "
15#elif defined CONFIG_M486
16#define MODULE_PROC_FAMILY "486 "
17#elif defined CONFIG_M586
18#define MODULE_PROC_FAMILY "586 "
19#elif defined CONFIG_M586TSC
20#define MODULE_PROC_FAMILY "586TSC "
21#elif defined CONFIG_M586MMX
22#define MODULE_PROC_FAMILY "586MMX "
23#elif defined CONFIG_MCORE2
24#define MODULE_PROC_FAMILY "CORE2 "
25#elif defined CONFIG_M686
26#define MODULE_PROC_FAMILY "686 "
27#elif defined CONFIG_MPENTIUMII
28#define MODULE_PROC_FAMILY "PENTIUMII "
29#elif defined CONFIG_MPENTIUMIII
30#define MODULE_PROC_FAMILY "PENTIUMIII "
31#elif defined CONFIG_MPENTIUMM
32#define MODULE_PROC_FAMILY "PENTIUMM "
33#elif defined CONFIG_MPENTIUM4
34#define MODULE_PROC_FAMILY "PENTIUM4 "
35#elif defined CONFIG_MK6
36#define MODULE_PROC_FAMILY "K6 "
37#elif defined CONFIG_MK7
38#define MODULE_PROC_FAMILY "K7 "
39#elif defined CONFIG_MK8
40#define MODULE_PROC_FAMILY "K8 "
41#elif defined CONFIG_X86_ELAN
42#define MODULE_PROC_FAMILY "ELAN "
43#elif defined CONFIG_MCRUSOE
44#define MODULE_PROC_FAMILY "CRUSOE "
45#elif defined CONFIG_MEFFICEON
46#define MODULE_PROC_FAMILY "EFFICEON "
47#elif defined CONFIG_MWINCHIPC6
48#define MODULE_PROC_FAMILY "WINCHIPC6 "
49#elif defined CONFIG_MWINCHIP2
50#define MODULE_PROC_FAMILY "WINCHIP2 "
51#elif defined CONFIG_MWINCHIP3D
52#define MODULE_PROC_FAMILY "WINCHIP3D "
53#elif defined CONFIG_MCYRIXIII
54#define MODULE_PROC_FAMILY "CYRIXIII "
55#elif defined CONFIG_MVIAC3_2
56#define MODULE_PROC_FAMILY "VIAC3-2 "
57#elif defined CONFIG_MVIAC7
58#define MODULE_PROC_FAMILY "VIAC7 "
59#elif defined CONFIG_MGEODEGX1
60#define MODULE_PROC_FAMILY "GEODEGX1 "
61#elif defined CONFIG_MGEODE_LX
62#define MODULE_PROC_FAMILY "GEODE "
63#else
64#error unknown processor family
65#endif
66
67#ifdef CONFIG_4KSTACKS
68#define MODULE_STACKSIZE "4KSTACKS "
69#else
70#define MODULE_STACKSIZE ""
71#endif
72
73#define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_STACKSIZE
74
75#endif /* _ASM_I386_MODULE_H */
diff --git a/include/asm-x86/module_64.h b/include/asm-x86/module_64.h
deleted file mode 100644
index 67f8f69fa7b1..000000000000
--- a/include/asm-x86/module_64.h
+++ /dev/null
@@ -1,10 +0,0 @@
1#ifndef _ASM_X8664_MODULE_H
2#define _ASM_X8664_MODULE_H
3
4struct mod_arch_specific {};
5
6#define Elf_Shdr Elf64_Shdr
7#define Elf_Sym Elf64_Sym
8#define Elf_Ehdr Elf64_Ehdr
9
10#endif
diff --git a/include/asm-x86/mpspec.h b/include/asm-x86/mpspec.h
index 8f268e8fd2e9..781ad74ab9e9 100644
--- a/include/asm-x86/mpspec.h
+++ b/include/asm-x86/mpspec.h
@@ -1,5 +1,117 @@
1#ifndef _AM_X86_MPSPEC_H
2#define _AM_X86_MPSPEC_H
3
4#include <asm/mpspec_def.h>
5
1#ifdef CONFIG_X86_32 6#ifdef CONFIG_X86_32
2# include "mpspec_32.h" 7#include <mach_mpspec.h>
8
9extern int mp_bus_id_to_type[MAX_MP_BUSSES];
10extern int mp_bus_id_to_node[MAX_MP_BUSSES];
11extern int mp_bus_id_to_local[MAX_MP_BUSSES];
12extern int quad_local_to_mp_bus_id[NR_CPUS/4][4];
13
14extern unsigned int def_to_bigsmp;
15extern int apic_version[MAX_APICS];
16extern u8 apicid_2_node[];
17extern int pic_mode;
18
19#define MAX_APICID 256
20
3#else 21#else
4# include "mpspec_64.h" 22
23#define MAX_MP_BUSSES 256
24/* Each PCI slot may be a combo card with its own bus. 4 IRQ pins per slot. */
25#define MAX_IRQ_SOURCES (MAX_MP_BUSSES * 4)
26
27extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
28
29#endif
30
31extern int mp_bus_id_to_pci_bus[MAX_MP_BUSSES];
32
33extern unsigned int boot_cpu_physical_apicid;
34extern int smp_found_config;
35extern int nr_ioapics;
36extern int mp_irq_entries;
37extern struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
38extern int mpc_default_type;
39extern unsigned long mp_lapic_addr;
40
41extern void find_smp_config(void);
42extern void get_smp_config(void);
43
44#ifdef CONFIG_ACPI
45extern void mp_register_lapic(u8 id, u8 enabled);
46extern void mp_register_lapic_address(u64 address);
47extern void mp_register_ioapic(u8 id, u32 address, u32 gsi_base);
48extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
49 u32 gsi);
50extern void mp_config_acpi_legacy_irqs(void);
51extern int mp_register_gsi(u32 gsi, int edge_level, int active_high_low);
52#endif /* CONFIG_ACPI */
53
54#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_APICS)
55
56struct physid_mask
57{
58 unsigned long mask[PHYSID_ARRAY_SIZE];
59};
60
61typedef struct physid_mask physid_mask_t;
62
63#define physid_set(physid, map) set_bit(physid, (map).mask)
64#define physid_clear(physid, map) clear_bit(physid, (map).mask)
65#define physid_isset(physid, map) test_bit(physid, (map).mask)
66#define physid_test_and_set(physid, map) \
67 test_and_set_bit(physid, (map).mask)
68
69#define physids_and(dst, src1, src2) \
70 bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
71
72#define physids_or(dst, src1, src2) \
73 bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
74
75#define physids_clear(map) \
76 bitmap_zero((map).mask, MAX_APICS)
77
78#define physids_complement(dst, src) \
79 bitmap_complement((dst).mask, (src).mask, MAX_APICS)
80
81#define physids_empty(map) \
82 bitmap_empty((map).mask, MAX_APICS)
83
84#define physids_equal(map1, map2) \
85 bitmap_equal((map1).mask, (map2).mask, MAX_APICS)
86
87#define physids_weight(map) \
88 bitmap_weight((map).mask, MAX_APICS)
89
90#define physids_shift_right(d, s, n) \
91 bitmap_shift_right((d).mask, (s).mask, n, MAX_APICS)
92
93#define physids_shift_left(d, s, n) \
94 bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS)
95
96#define physids_coerce(map) ((map).mask[0])
97
98#define physids_promote(physids) \
99 ({ \
100 physid_mask_t __physid_mask = PHYSID_MASK_NONE; \
101 __physid_mask.mask[0] = physids; \
102 __physid_mask; \
103 })
104
105#define physid_mask_of_physid(physid) \
106 ({ \
107 physid_mask_t __physid_mask = PHYSID_MASK_NONE; \
108 physid_set(physid, __physid_mask); \
109 __physid_mask; \
110 })
111
112#define PHYSID_MASK_ALL { {[0 ... PHYSID_ARRAY_SIZE-1] = ~0UL} }
113#define PHYSID_MASK_NONE { {[0 ... PHYSID_ARRAY_SIZE-1] = 0UL} }
114
115extern physid_mask_t phys_cpu_present_map;
116
5#endif 117#endif
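[Editor's note: a short sketch of how the physid_mask helpers above are typically used; the APIC ID value is made up for illustration and this code is not part of the patch.]

	int apicid = 3;				/* hypothetical local APIC ID */
	physid_mask_t one = physid_mask_of_physid(apicid);

	if (!physid_isset(apicid, phys_cpu_present_map))
		physids_or(phys_cpu_present_map,
			   phys_cpu_present_map, one);	/* record it as present */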
diff --git a/include/asm-x86/mpspec_32.h b/include/asm-x86/mpspec_32.h
deleted file mode 100644
index f21349399d14..000000000000
--- a/include/asm-x86/mpspec_32.h
+++ /dev/null
@@ -1,81 +0,0 @@
1#ifndef __ASM_MPSPEC_H
2#define __ASM_MPSPEC_H
3
4#include <linux/cpumask.h>
5#include <asm/mpspec_def.h>
6#include <mach_mpspec.h>
7
8extern int mp_bus_id_to_type [MAX_MP_BUSSES];
9extern int mp_bus_id_to_node [MAX_MP_BUSSES];
10extern int mp_bus_id_to_local [MAX_MP_BUSSES];
11extern int quad_local_to_mp_bus_id [NR_CPUS/4][4];
12extern int mp_bus_id_to_pci_bus [MAX_MP_BUSSES];
13
14extern unsigned int def_to_bigsmp;
15extern unsigned int boot_cpu_physical_apicid;
16extern int smp_found_config;
17extern void find_smp_config (void);
18extern void get_smp_config (void);
19extern int nr_ioapics;
20extern int apic_version [MAX_APICS];
21extern int mp_irq_entries;
22extern struct mpc_config_intsrc mp_irqs [MAX_IRQ_SOURCES];
23extern int mpc_default_type;
24extern unsigned long mp_lapic_addr;
25extern int pic_mode;
26
27#ifdef CONFIG_ACPI
28extern void mp_register_lapic (u8 id, u8 enabled);
29extern void mp_register_lapic_address (u64 address);
30extern void mp_register_ioapic (u8 id, u32 address, u32 gsi_base);
31extern void mp_override_legacy_irq (u8 bus_irq, u8 polarity, u8 trigger, u32 gsi);
32extern void mp_config_acpi_legacy_irqs (void);
33extern int mp_register_gsi (u32 gsi, int edge_level, int active_high_low);
34#endif /* CONFIG_ACPI */
35
36#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_APICS)
37
38struct physid_mask
39{
40 unsigned long mask[PHYSID_ARRAY_SIZE];
41};
42
43typedef struct physid_mask physid_mask_t;
44
45#define physid_set(physid, map) set_bit(physid, (map).mask)
46#define physid_clear(physid, map) clear_bit(physid, (map).mask)
47#define physid_isset(physid, map) test_bit(physid, (map).mask)
48#define physid_test_and_set(physid, map) test_and_set_bit(physid, (map).mask)
49
50#define physids_and(dst, src1, src2) bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
51#define physids_or(dst, src1, src2) bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
52#define physids_clear(map) bitmap_zero((map).mask, MAX_APICS)
53#define physids_complement(dst, src) bitmap_complement((dst).mask,(src).mask, MAX_APICS)
54#define physids_empty(map) bitmap_empty((map).mask, MAX_APICS)
55#define physids_equal(map1, map2) bitmap_equal((map1).mask, (map2).mask, MAX_APICS)
56#define physids_weight(map) bitmap_weight((map).mask, MAX_APICS)
57#define physids_shift_right(d, s, n) bitmap_shift_right((d).mask, (s).mask, n, MAX_APICS)
58#define physids_shift_left(d, s, n) bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS)
59#define physids_coerce(map) ((map).mask[0])
60
61#define physids_promote(physids) \
62 ({ \
63 physid_mask_t __physid_mask = PHYSID_MASK_NONE; \
64 __physid_mask.mask[0] = physids; \
65 __physid_mask; \
66 })
67
68#define physid_mask_of_physid(physid) \
69 ({ \
70 physid_mask_t __physid_mask = PHYSID_MASK_NONE; \
71 physid_set(physid, __physid_mask); \
72 __physid_mask; \
73 })
74
75#define PHYSID_MASK_ALL { {[0 ... PHYSID_ARRAY_SIZE-1] = ~0UL} }
76#define PHYSID_MASK_NONE { {[0 ... PHYSID_ARRAY_SIZE-1] = 0UL} }
77
78extern physid_mask_t phys_cpu_present_map;
79
80#endif
81
diff --git a/include/asm-x86/mpspec_64.h b/include/asm-x86/mpspec_64.h
deleted file mode 100644
index 017fddb61dc5..000000000000
--- a/include/asm-x86/mpspec_64.h
+++ /dev/null
@@ -1,233 +0,0 @@
1#ifndef __ASM_MPSPEC_H
2#define __ASM_MPSPEC_H
3
4/*
5 * Structure definitions for SMP machines following the
6 * Intel Multiprocessing Specification 1.1 and 1.4.
7 */
8
9/*
10 * This tag identifies where the SMP configuration
11 * information is.
12 */
13
14#define SMP_MAGIC_IDENT (('_'<<24)|('P'<<16)|('M'<<8)|'_')
15
16/*
17 * A maximum of 255 APICs with the current APIC ID architecture.
18 */
19#define MAX_APICS 255
20
21struct intel_mp_floating
22{
23 char mpf_signature[4]; /* "_MP_" */
24 unsigned int mpf_physptr; /* Configuration table address */
25 unsigned char mpf_length; /* Our length (paragraphs) */
26 unsigned char mpf_specification;/* Specification version */
27 unsigned char mpf_checksum; /* Checksum (makes sum 0) */
28 unsigned char mpf_feature1; /* Standard or configuration ? */
29 unsigned char mpf_feature2; /* Bit7 set for IMCR|PIC */
30 unsigned char mpf_feature3; /* Unused (0) */
31 unsigned char mpf_feature4; /* Unused (0) */
32 unsigned char mpf_feature5; /* Unused (0) */
33};
34
35struct mp_config_table
36{
37 char mpc_signature[4];
38#define MPC_SIGNATURE "PCMP"
39 unsigned short mpc_length; /* Size of table */
40 char mpc_spec; /* 0x01 */
41 char mpc_checksum;
42 char mpc_oem[8];
43 char mpc_productid[12];
44 unsigned int mpc_oemptr; /* 0 if not present */
45 unsigned short mpc_oemsize; /* 0 if not present */
46 unsigned short mpc_oemcount;
47 unsigned int mpc_lapic; /* APIC address */
48 unsigned int reserved;
49};
50
51/* Followed by entries */
52
53#define MP_PROCESSOR 0
54#define MP_BUS 1
55#define MP_IOAPIC 2
56#define MP_INTSRC 3
57#define MP_LINTSRC 4
58
59struct mpc_config_processor
60{
61 unsigned char mpc_type;
62 unsigned char mpc_apicid; /* Local APIC number */
63 unsigned char mpc_apicver; /* Its versions */
64 unsigned char mpc_cpuflag;
65#define CPU_ENABLED 1 /* Processor is available */
66#define CPU_BOOTPROCESSOR 2 /* Processor is the BP */
67 unsigned int mpc_cpufeature;
68#define CPU_STEPPING_MASK 0x0F
69#define CPU_MODEL_MASK 0xF0
70#define CPU_FAMILY_MASK 0xF00
71 unsigned int mpc_featureflag; /* CPUID feature value */
72 unsigned int mpc_reserved[2];
73};
74
75struct mpc_config_bus
76{
77 unsigned char mpc_type;
78 unsigned char mpc_busid;
79 unsigned char mpc_bustype[6];
80};
81
82/* List of Bus Type string values, Intel MP Spec. */
83#define BUSTYPE_EISA "EISA"
84#define BUSTYPE_ISA "ISA"
85#define BUSTYPE_INTERN "INTERN" /* Internal BUS */
86#define BUSTYPE_MCA "MCA"
87#define BUSTYPE_VL "VL" /* Local bus */
88#define BUSTYPE_PCI "PCI"
89#define BUSTYPE_PCMCIA "PCMCIA"
90#define BUSTYPE_CBUS "CBUS"
91#define BUSTYPE_CBUSII "CBUSII"
92#define BUSTYPE_FUTURE "FUTURE"
93#define BUSTYPE_MBI "MBI"
94#define BUSTYPE_MBII "MBII"
95#define BUSTYPE_MPI "MPI"
96#define BUSTYPE_MPSA "MPSA"
97#define BUSTYPE_NUBUS "NUBUS"
98#define BUSTYPE_TC "TC"
99#define BUSTYPE_VME "VME"
100#define BUSTYPE_XPRESS "XPRESS"
101
102struct mpc_config_ioapic
103{
104 unsigned char mpc_type;
105 unsigned char mpc_apicid;
106 unsigned char mpc_apicver;
107 unsigned char mpc_flags;
108#define MPC_APIC_USABLE 0x01
109 unsigned int mpc_apicaddr;
110};
111
112struct mpc_config_intsrc
113{
114 unsigned char mpc_type;
115 unsigned char mpc_irqtype;
116 unsigned short mpc_irqflag;
117 unsigned char mpc_srcbus;
118 unsigned char mpc_srcbusirq;
119 unsigned char mpc_dstapic;
120 unsigned char mpc_dstirq;
121};
122
123enum mp_irq_source_types {
124 mp_INT = 0,
125 mp_NMI = 1,
126 mp_SMI = 2,
127 mp_ExtINT = 3
128};
129
130#define MP_IRQDIR_DEFAULT 0
131#define MP_IRQDIR_HIGH 1
132#define MP_IRQDIR_LOW 3
133
134
135struct mpc_config_lintsrc
136{
137 unsigned char mpc_type;
138 unsigned char mpc_irqtype;
139 unsigned short mpc_irqflag;
140 unsigned char mpc_srcbusid;
141 unsigned char mpc_srcbusirq;
142 unsigned char mpc_destapic;
143#define MP_APIC_ALL 0xFF
144 unsigned char mpc_destapiclint;
145};
146
147/*
148 * Default configurations
149 *
150 * 1 2 CPU ISA 82489DX
151 * 2 2 CPU EISA 82489DX neither IRQ 0 timer nor IRQ 13 DMA chaining
152 * 3 2 CPU EISA 82489DX
153 * 4 2 CPU MCA 82489DX
154 * 5 2 CPU ISA+PCI
155 * 6 2 CPU EISA+PCI
156 * 7 2 CPU MCA+PCI
157 */
158
159#define MAX_MP_BUSSES 256
160/* Each PCI slot may be a combo card with its own bus. 4 IRQ pins per slot. */
161#define MAX_IRQ_SOURCES (MAX_MP_BUSSES * 4)
162extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
163extern int mp_bus_id_to_pci_bus [MAX_MP_BUSSES];
164
165extern unsigned int boot_cpu_physical_apicid;
166extern int smp_found_config;
167extern void find_smp_config (void);
168extern void get_smp_config (void);
169extern int nr_ioapics;
170extern unsigned char apic_version [MAX_APICS];
171extern int mp_irq_entries;
172extern struct mpc_config_intsrc mp_irqs [MAX_IRQ_SOURCES];
173extern int mpc_default_type;
174extern unsigned long mp_lapic_addr;
175
176#ifdef CONFIG_ACPI
177extern void mp_register_lapic (u8 id, u8 enabled);
178extern void mp_register_lapic_address (u64 address);
179
180extern void mp_register_ioapic (u8 id, u32 address, u32 gsi_base);
181extern void mp_override_legacy_irq (u8 bus_irq, u8 polarity, u8 trigger, u32 gsi);
182extern void mp_config_acpi_legacy_irqs (void);
183extern int mp_register_gsi (u32 gsi, int triggering, int polarity);
184#endif
185
186extern int using_apic_timer;
187
188#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_APICS)
189
190struct physid_mask
191{
192 unsigned long mask[PHYSID_ARRAY_SIZE];
193};
194
195typedef struct physid_mask physid_mask_t;
196
197#define physid_set(physid, map) set_bit(physid, (map).mask)
198#define physid_clear(physid, map) clear_bit(physid, (map).mask)
199#define physid_isset(physid, map) test_bit(physid, (map).mask)
200#define physid_test_and_set(physid, map) test_and_set_bit(physid, (map).mask)
201
202#define physids_and(dst, src1, src2) bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
203#define physids_or(dst, src1, src2) bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
204#define physids_clear(map) bitmap_zero((map).mask, MAX_APICS)
205#define physids_complement(dst, src) bitmap_complement((dst).mask, (src).mask, MAX_APICS)
206#define physids_empty(map) bitmap_empty((map).mask, MAX_APICS)
207#define physids_equal(map1, map2) bitmap_equal((map1).mask, (map2).mask, MAX_APICS)
208#define physids_weight(map) bitmap_weight((map).mask, MAX_APICS)
209#define physids_shift_right(d, s, n) bitmap_shift_right((d).mask, (s).mask, n, MAX_APICS)
210#define physids_shift_left(d, s, n) bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS)
211#define physids_coerce(map) ((map).mask[0])
212
213#define physids_promote(physids) \
214 ({ \
215 physid_mask_t __physid_mask = PHYSID_MASK_NONE; \
216 __physid_mask.mask[0] = physids; \
217 __physid_mask; \
218 })
219
220#define physid_mask_of_physid(physid) \
221 ({ \
222 physid_mask_t __physid_mask = PHYSID_MASK_NONE; \
223 physid_set(physid, __physid_mask); \
224 __physid_mask; \
225 })
226
227#define PHYSID_MASK_ALL { {[0 ... PHYSID_ARRAY_SIZE-1] = ~0UL} }
228#define PHYSID_MASK_NONE { {[0 ... PHYSID_ARRAY_SIZE-1] = 0UL} }
229
230extern physid_mask_t phys_cpu_present_map;
231
232#endif
233
diff --git a/include/asm-x86/mpspec_def.h b/include/asm-x86/mpspec_def.h
index 13bafb16e7af..3504617fe648 100644
--- a/include/asm-x86/mpspec_def.h
+++ b/include/asm-x86/mpspec_def.h
@@ -8,52 +8,68 @@
8 8
9/* 9/*
10 * This tag identifies where the SMP configuration 10 * This tag identifies where the SMP configuration
11 * information is. 11 * information is.
12 */ 12 */
13 13
14#define SMP_MAGIC_IDENT (('_'<<24)|('P'<<16)|('M'<<8)|'_') 14#define SMP_MAGIC_IDENT (('_'<<24)|('P'<<16)|('M'<<8)|'_')
15 15
16#define MAX_MPC_ENTRY 1024 16#ifdef CONFIG_X86_32
17#define MAX_APICS 256 17# define MAX_MPC_ENTRY 1024
18# define MAX_APICS 256
19#else
20/*
21 * A maximum of 255 APICs with the current APIC ID architecture.
22 */
23# define MAX_APICS 255
24#endif
18 25
19struct intel_mp_floating 26struct intel_mp_floating
20{ 27{
21 char mpf_signature[4]; /* "_MP_" */ 28 char mpf_signature[4]; /* "_MP_" */
22 unsigned long mpf_physptr; /* Configuration table address */ 29 unsigned int mpf_physptr; /* Configuration table address */
23 unsigned char mpf_length; /* Our length (paragraphs) */ 30 unsigned char mpf_length; /* Our length (paragraphs) */
24 unsigned char mpf_specification;/* Specification version */ 31 unsigned char mpf_specification;/* Specification version */
25 unsigned char mpf_checksum; /* Checksum (makes sum 0) */ 32 unsigned char mpf_checksum; /* Checksum (makes sum 0) */
26 unsigned char mpf_feature1; /* Standard or configuration ? */ 33 unsigned char mpf_feature1; /* Standard or configuration ? */
27 unsigned char mpf_feature2; /* Bit7 set for IMCR|PIC */ 34 unsigned char mpf_feature2; /* Bit7 set for IMCR|PIC */
28 unsigned char mpf_feature3; /* Unused (0) */ 35 unsigned char mpf_feature3; /* Unused (0) */
29 unsigned char mpf_feature4; /* Unused (0) */ 36 unsigned char mpf_feature4; /* Unused (0) */
30 unsigned char mpf_feature5; /* Unused (0) */ 37 unsigned char mpf_feature5; /* Unused (0) */
31}; 38};
32 39
40#define MPC_SIGNATURE "PCMP"
41
33struct mp_config_table 42struct mp_config_table
34{ 43{
35 char mpc_signature[4]; 44 char mpc_signature[4];
36#define MPC_SIGNATURE "PCMP"
37 unsigned short mpc_length; /* Size of table */ 45 unsigned short mpc_length; /* Size of table */
38 char mpc_spec; /* 0x01 */ 46 char mpc_spec; /* 0x01 */
39 char mpc_checksum; 47 char mpc_checksum;
40 char mpc_oem[8]; 48 char mpc_oem[8];
41 char mpc_productid[12]; 49 char mpc_productid[12];
42 unsigned long mpc_oemptr; /* 0 if not present */ 50 unsigned int mpc_oemptr; /* 0 if not present */
43 unsigned short mpc_oemsize; /* 0 if not present */ 51 unsigned short mpc_oemsize; /* 0 if not present */
44 unsigned short mpc_oemcount; 52 unsigned short mpc_oemcount;
45 unsigned long mpc_lapic; /* APIC address */ 53 unsigned int mpc_lapic; /* APIC address */
46 unsigned long reserved; 54 unsigned int reserved;
47}; 55};
48 56
49/* Followed by entries */ 57/* Followed by entries */
50 58
51#define MP_PROCESSOR 0 59#define MP_PROCESSOR 0
52#define MP_BUS 1 60#define MP_BUS 1
53#define MP_IOAPIC 2 61#define MP_IOAPIC 2
54#define MP_INTSRC 3 62#define MP_INTSRC 3
55#define MP_LINTSRC 4 63#define MP_LINTSRC 4
56#define MP_TRANSLATION 192 /* Used by IBM NUMA-Q to describe node locality */ 64/* Used by IBM NUMA-Q to describe node locality */
65#define MP_TRANSLATION 192
66
67#define CPU_ENABLED 1 /* Processor is available */
68#define CPU_BOOTPROCESSOR 2 /* Processor is the BP */
69
70#define CPU_STEPPING_MASK 0x000F
71#define CPU_MODEL_MASK 0x00F0
72#define CPU_FAMILY_MASK 0x0F00
57 73
58struct mpc_config_processor 74struct mpc_config_processor
59{ 75{
@@ -61,14 +77,9 @@ struct mpc_config_processor
61 unsigned char mpc_apicid; /* Local APIC number */ 77 unsigned char mpc_apicid; /* Local APIC number */
62 unsigned char mpc_apicver; /* Its versions */ 78 unsigned char mpc_apicver; /* Its versions */
63 unsigned char mpc_cpuflag; 79 unsigned char mpc_cpuflag;
64#define CPU_ENABLED 1 /* Processor is available */ 80 unsigned int mpc_cpufeature;
65#define CPU_BOOTPROCESSOR 2 /* Processor is the BP */ 81 unsigned int mpc_featureflag; /* CPUID feature value */
66 unsigned long mpc_cpufeature; 82 unsigned int mpc_reserved[2];
67#define CPU_STEPPING_MASK 0x0F
68#define CPU_MODEL_MASK 0xF0
69#define CPU_FAMILY_MASK 0xF00
70 unsigned long mpc_featureflag; /* CPUID feature value */
71 unsigned long mpc_reserved[2];
72}; 83};
73 84
74struct mpc_config_bus 85struct mpc_config_bus
@@ -98,14 +109,15 @@ struct mpc_config_bus
98#define BUSTYPE_VME "VME" 109#define BUSTYPE_VME "VME"
99#define BUSTYPE_XPRESS "XPRESS" 110#define BUSTYPE_XPRESS "XPRESS"
100 111
112#define MPC_APIC_USABLE 0x01
113
101struct mpc_config_ioapic 114struct mpc_config_ioapic
102{ 115{
103 unsigned char mpc_type; 116 unsigned char mpc_type;
104 unsigned char mpc_apicid; 117 unsigned char mpc_apicid;
105 unsigned char mpc_apicver; 118 unsigned char mpc_apicver;
106 unsigned char mpc_flags; 119 unsigned char mpc_flags;
107#define MPC_APIC_USABLE 0x01 120 unsigned int mpc_apicaddr;
108 unsigned long mpc_apicaddr;
109}; 121};
110 122
111struct mpc_config_intsrc 123struct mpc_config_intsrc
@@ -130,6 +142,7 @@ enum mp_irq_source_types {
130#define MP_IRQDIR_HIGH 1 142#define MP_IRQDIR_HIGH 1
131#define MP_IRQDIR_LOW 3 143#define MP_IRQDIR_LOW 3
132 144
145#define MP_APIC_ALL 0xFF
133 146
134struct mpc_config_lintsrc 147struct mpc_config_lintsrc
135{ 148{
@@ -138,15 +151,15 @@ struct mpc_config_lintsrc
138 unsigned short mpc_irqflag; 151 unsigned short mpc_irqflag;
139 unsigned char mpc_srcbusid; 152 unsigned char mpc_srcbusid;
140 unsigned char mpc_srcbusirq; 153 unsigned char mpc_srcbusirq;
141 unsigned char mpc_destapic; 154 unsigned char mpc_destapic;
142#define MP_APIC_ALL 0xFF
143 unsigned char mpc_destapiclint; 155 unsigned char mpc_destapiclint;
144}; 156};
145 157
158#define MPC_OEM_SIGNATURE "_OEM"
159
146struct mp_config_oemtable 160struct mp_config_oemtable
147{ 161{
148 char oem_signature[4]; 162 char oem_signature[4];
149#define MPC_OEM_SIGNATURE "_OEM"
150 unsigned short oem_length; /* Size of table */ 163 unsigned short oem_length; /* Size of table */
151 char oem_rev; /* 0x01 */ 164 char oem_rev; /* 0x01 */
152 char oem_checksum; 165 char oem_checksum;
@@ -155,13 +168,13 @@ struct mp_config_oemtable
155 168
156struct mpc_config_translation 169struct mpc_config_translation
157{ 170{
158 unsigned char mpc_type; 171 unsigned char mpc_type;
159 unsigned char trans_len; 172 unsigned char trans_len;
160 unsigned char trans_type; 173 unsigned char trans_type;
161 unsigned char trans_quad; 174 unsigned char trans_quad;
162 unsigned char trans_global; 175 unsigned char trans_global;
163 unsigned char trans_local; 176 unsigned char trans_local;
164 unsigned short trans_reserved; 177 unsigned short trans_reserved;
165}; 178};
166 179
167/* 180/*
diff --git a/include/asm-x86/msr-index.h b/include/asm-x86/msr-index.h
index a4944732be04..fae118a25278 100644
--- a/include/asm-x86/msr-index.h
+++ b/include/asm-x86/msr-index.h
@@ -63,6 +63,13 @@
63#define MSR_IA32_LASTINTFROMIP 0x000001dd 63#define MSR_IA32_LASTINTFROMIP 0x000001dd
64#define MSR_IA32_LASTINTTOIP 0x000001de 64#define MSR_IA32_LASTINTTOIP 0x000001de
65 65
66/* DEBUGCTLMSR bits (others vary by model): */
67#define _DEBUGCTLMSR_LBR 0 /* last branch recording */
68#define _DEBUGCTLMSR_BTF 1 /* single-step on branches */
69
70#define DEBUGCTLMSR_LBR (1UL << _DEBUGCTLMSR_LBR)
71#define DEBUGCTLMSR_BTF (1UL << _DEBUGCTLMSR_BTF)
72
66#define MSR_IA32_MC0_CTL 0x00000400 73#define MSR_IA32_MC0_CTL 0x00000400
67#define MSR_IA32_MC0_STATUS 0x00000401 74#define MSR_IA32_MC0_STATUS 0x00000401
68#define MSR_IA32_MC0_ADDR 0x00000402 75#define MSR_IA32_MC0_ADDR 0x00000402
@@ -88,6 +95,14 @@
88#define MSR_AMD64_IBSDCPHYSAD 0xc0011039 95#define MSR_AMD64_IBSDCPHYSAD 0xc0011039
89#define MSR_AMD64_IBSCTL 0xc001103a 96#define MSR_AMD64_IBSCTL 0xc001103a
90 97
98/* Fam 10h MSRs */
99#define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058
100#define FAM10H_MMIO_CONF_ENABLE (1<<0)
101#define FAM10H_MMIO_CONF_BUSRANGE_MASK 0xf
102#define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2
103#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffff
104#define FAM10H_MMIO_CONF_BASE_SHIFT 20
105
91/* K8 MSRs */ 106/* K8 MSRs */
92#define MSR_K8_TOP_MEM1 0xc001001a 107#define MSR_K8_TOP_MEM1 0xc001001a
93#define MSR_K8_TOP_MEM2 0xc001001d 108#define MSR_K8_TOP_MEM2 0xc001001d
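[Editor's note: an illustration of how the new Fam10h constants fit together; a sketch, not code from this patch. rdmsrl() comes from <asm/msr.h>.]

	u64 val, mmconf_base;
	unsigned int busrange;

	rdmsrl(MSR_FAM10H_MMIO_CONF_BASE, val);
	if (val & FAM10H_MMIO_CONF_ENABLE) {
		busrange = (val >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) &
			   FAM10H_MMIO_CONF_BUSRANGE_MASK;
		mmconf_base = ((val >> FAM10H_MMIO_CONF_BASE_SHIFT) &
			       FAM10H_MMIO_CONF_BASE_MASK)
			      << FAM10H_MMIO_CONF_BASE_SHIFT;
		/* mmconf_base is the physical base of the MMCONFIG aperture */
	}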
diff --git a/include/asm-x86/msr.h b/include/asm-x86/msr.h
index 80b027081b3c..204a8a30fecf 100644
--- a/include/asm-x86/msr.h
+++ b/include/asm-x86/msr.h
@@ -7,77 +7,109 @@
7# include <linux/types.h> 7# include <linux/types.h>
8#endif 8#endif
9 9
10#ifdef __i386__
11
12#ifdef __KERNEL__ 10#ifdef __KERNEL__
13#ifndef __ASSEMBLY__ 11#ifndef __ASSEMBLY__
14 12
13#include <asm/asm.h>
15#include <asm/errno.h> 14#include <asm/errno.h>
16 15
16static inline unsigned long long native_read_tscp(unsigned int *aux)
17{
18 unsigned long low, high;
19 asm volatile (".byte 0x0f,0x01,0xf9"
20 : "=a" (low), "=d" (high), "=c" (*aux));
21 return low | ((u64)high << 32);
22}
23
24/*
25 * i386 calling convention returns 64-bit value in edx:eax, while
26 * x86_64 returns at rax. Also, the "A" constraint does not really
27 * mean rdx:rax in x86_64, so we need specialized behaviour for each
28 * architecture
29 */
30#ifdef CONFIG_X86_64
31#define DECLARE_ARGS(val, low, high) unsigned low, high
32#define EAX_EDX_VAL(val, low, high) (low | ((u64)(high) << 32))
33#define EAX_EDX_ARGS(val, low, high) "a" (low), "d" (high)
34#define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high)
35#else
36#define DECLARE_ARGS(val, low, high) unsigned long long val
37#define EAX_EDX_VAL(val, low, high) (val)
38#define EAX_EDX_ARGS(val, low, high) "A" (val)
39#define EAX_EDX_RET(val, low, high) "=A" (val)
40#endif
41
17static inline unsigned long long native_read_msr(unsigned int msr) 42static inline unsigned long long native_read_msr(unsigned int msr)
18{ 43{
19 unsigned long long val; 44 DECLARE_ARGS(val, low, high);
20 45
21 asm volatile("rdmsr" : "=A" (val) : "c" (msr)); 46 asm volatile("rdmsr" : EAX_EDX_RET(val, low, high) : "c" (msr));
22 return val; 47 return EAX_EDX_VAL(val, low, high);
23} 48}
24 49
25static inline unsigned long long native_read_msr_safe(unsigned int msr, 50static inline unsigned long long native_read_msr_safe(unsigned int msr,
26 int *err) 51 int *err)
27{ 52{
28 unsigned long long val; 53 DECLARE_ARGS(val, low, high);
29 54
30 asm volatile("2: rdmsr ; xorl %0,%0\n" 55 asm volatile("2: rdmsr ; xor %0,%0\n"
31 "1:\n\t" 56 "1:\n\t"
32 ".section .fixup,\"ax\"\n\t" 57 ".section .fixup,\"ax\"\n\t"
33 "3: movl %3,%0 ; jmp 1b\n\t" 58 "3: mov %3,%0 ; jmp 1b\n\t"
34 ".previous\n\t" 59 ".previous\n\t"
35 ".section __ex_table,\"a\"\n" 60 ".section __ex_table,\"a\"\n"
36 " .align 4\n\t" 61 _ASM_ALIGN "\n\t"
37 " .long 2b,3b\n\t" 62 _ASM_PTR " 2b,3b\n\t"
38 ".previous" 63 ".previous"
39 : "=r" (*err), "=A" (val) 64 : "=r" (*err), EAX_EDX_RET(val, low, high)
40 : "c" (msr), "i" (-EFAULT)); 65 : "c" (msr), "i" (-EFAULT));
41 66 return EAX_EDX_VAL(val, low, high);
42 return val;
43} 67}
44 68
45static inline void native_write_msr(unsigned int msr, unsigned long long val) 69static inline void native_write_msr(unsigned int msr,
70 unsigned low, unsigned high)
46{ 71{
47 asm volatile("wrmsr" : : "c" (msr), "A"(val)); 72 asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high));
48} 73}
49 74
50static inline int native_write_msr_safe(unsigned int msr, 75static inline int native_write_msr_safe(unsigned int msr,
51 unsigned long long val) 76 unsigned low, unsigned high)
52{ 77{
53 int err; 78 int err;
54 asm volatile("2: wrmsr ; xorl %0,%0\n" 79 asm volatile("2: wrmsr ; xor %0,%0\n"
55 "1:\n\t" 80 "1:\n\t"
56 ".section .fixup,\"ax\"\n\t" 81 ".section .fixup,\"ax\"\n\t"
57 "3: movl %4,%0 ; jmp 1b\n\t" 82 "3: mov %4,%0 ; jmp 1b\n\t"
58 ".previous\n\t" 83 ".previous\n\t"
59 ".section __ex_table,\"a\"\n" 84 ".section __ex_table,\"a\"\n"
60 " .align 4\n\t" 85 _ASM_ALIGN "\n\t"
61 " .long 2b,3b\n\t" 86 _ASM_PTR " 2b,3b\n\t"
62 ".previous" 87 ".previous"
63 : "=a" (err) 88 : "=a" (err)
64 : "c" (msr), "0" ((u32)val), "d" ((u32)(val>>32)), 89 : "c" (msr), "0" (low), "d" (high),
65 "i" (-EFAULT)); 90 "i" (-EFAULT));
66 return err; 91 return err;
67} 92}
68 93
69static inline unsigned long long native_read_tsc(void) 94extern unsigned long long native_read_tsc(void);
95
96static __always_inline unsigned long long __native_read_tsc(void)
70{ 97{
71 unsigned long long val; 98 DECLARE_ARGS(val, low, high);
72 asm volatile("rdtsc" : "=A" (val)); 99
73 return val; 100 rdtsc_barrier();
101 asm volatile("rdtsc" : EAX_EDX_RET(val, low, high));
102 rdtsc_barrier();
103
104 return EAX_EDX_VAL(val, low, high);
74} 105}
75 106
76static inline unsigned long long native_read_pmc(void) 107static inline unsigned long long native_read_pmc(int counter)
77{ 108{
78 unsigned long long val; 109 DECLARE_ARGS(val, low, high);
79 asm volatile("rdpmc" : "=A" (val)); 110
80 return val; 111 asm volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter));
112 return EAX_EDX_VAL(val, low, high);
81} 113}
82 114
83#ifdef CONFIG_PARAVIRT 115#ifdef CONFIG_PARAVIRT
@@ -97,20 +129,21 @@ static inline unsigned long long native_read_pmc(void)
97 (val2) = (u32)(__val >> 32); \ 129 (val2) = (u32)(__val >> 32); \
98 } while(0) 130 } while(0)
99 131
100static inline void wrmsr(u32 __msr, u32 __low, u32 __high) 132static inline void wrmsr(unsigned msr, unsigned low, unsigned high)
101{ 133{
102 native_write_msr(__msr, ((u64)__high << 32) | __low); 134 native_write_msr(msr, low, high);
103} 135}
104 136
105#define rdmsrl(msr,val) \ 137#define rdmsrl(msr,val) \
106 ((val) = native_read_msr(msr)) 138 ((val) = native_read_msr(msr))
107 139
108#define wrmsrl(msr,val) native_write_msr(msr, val) 140#define wrmsrl(msr, val) \
141 native_write_msr(msr, (u32)((u64)(val)), (u32)((u64)(val) >> 32))
109 142
110/* wrmsr with exception handling */ 143/* wrmsr with exception handling */
111static inline int wrmsr_safe(u32 __msr, u32 __low, u32 __high) 144static inline int wrmsr_safe(unsigned msr, unsigned low, unsigned high)
112{ 145{
113 return native_write_msr_safe(__msr, ((u64)__high << 32) | __low); 146 return native_write_msr_safe(msr, low, high);
114} 147}
115 148
116/* rdmsr with exception handling */ 149/* rdmsr with exception handling */
@@ -129,204 +162,31 @@ static inline int wrmsr_safe(u32 __msr, u32 __low, u32 __high)
129#define rdtscll(val) \ 162#define rdtscll(val) \
130 ((val) = native_read_tsc()) 163 ((val) = native_read_tsc())
131 164
132#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
133
134#define rdpmc(counter,low,high) \ 165#define rdpmc(counter,low,high) \
135 do { \ 166 do { \
136 u64 _l = native_read_pmc(); \ 167 u64 _l = native_read_pmc(counter); \
137 (low) = (u32)_l; \ 168 (low) = (u32)_l; \
138 (high) = (u32)(_l >> 32); \ 169 (high) = (u32)(_l >> 32); \
139 } while(0) 170 } while(0)
140#endif /* !CONFIG_PARAVIRT */
141
142#ifdef CONFIG_SMP
143void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
144void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
145int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
146int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
147#else /* CONFIG_SMP */
148static inline void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
149{
150 rdmsr(msr_no, *l, *h);
151}
152static inline void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
153{
154 wrmsr(msr_no, l, h);
155}
156static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
157{
158 return rdmsr_safe(msr_no, l, h);
159}
160static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
161{
162 return wrmsr_safe(msr_no, l, h);
163}
164#endif /* CONFIG_SMP */
165#endif /* ! __ASSEMBLY__ */
166#endif /* __KERNEL__ */
167
168#else /* __i386__ */
169
170#ifndef __ASSEMBLY__
171#include <linux/errno.h>
172/*
173 * Access to machine-specific registers (available on 586 and better only)
174 * Note: the rd* operations modify the parameters directly (without using
175 * pointer indirection), this allows gcc to optimize better
176 */
177
178#define rdmsr(msr,val1,val2) \
179 __asm__ __volatile__("rdmsr" \
180 : "=a" (val1), "=d" (val2) \
181 : "c" (msr))
182
183
184#define rdmsrl(msr,val) do { unsigned long a__,b__; \
185 __asm__ __volatile__("rdmsr" \
186 : "=a" (a__), "=d" (b__) \
187 : "c" (msr)); \
188 val = a__ | (b__<<32); \
189} while(0)
190
191#define wrmsr(msr,val1,val2) \
192 __asm__ __volatile__("wrmsr" \
193 : /* no outputs */ \
194 : "c" (msr), "a" (val1), "d" (val2))
195
196#define wrmsrl(msr,val) wrmsr(msr,(__u32)((__u64)(val)),((__u64)(val))>>32)
197 171
198#define rdtsc(low,high) \ 172#define rdtscp(low, high, aux) \
199 __asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high)) 173 do { \
174 unsigned long long _val = native_read_tscp(&(aux)); \
175 (low) = (u32)_val; \
176 (high) = (u32)(_val >> 32); \
177 } while (0)
200 178
201#define rdtscl(low) \ 179#define rdtscpll(val, aux) (val) = native_read_tscp(&(aux))
202 __asm__ __volatile__ ("rdtsc" : "=a" (low) : : "edx")
203 180
204#define rdtscp(low,high,aux) \ 181#endif /* !CONFIG_PARAVIRT */
205 __asm__ __volatile__ (".byte 0x0f,0x01,0xf9" : "=a" (low), "=d" (high), "=c" (aux))
206 182
207#define rdtscll(val) do { \
208 unsigned int __a,__d; \
209 __asm__ __volatile__("rdtsc" : "=a" (__a), "=d" (__d)); \
210 (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \
211} while(0)
212 183
213#define rdtscpll(val, aux) do { \ 184#define checking_wrmsrl(msr,val) wrmsr_safe(msr,(u32)(val),(u32)((val)>>32))
214 unsigned long __a, __d; \
215 __asm__ __volatile__ (".byte 0x0f,0x01,0xf9" : "=a" (__a), "=d" (__d), "=c" (aux)); \
216 (val) = (__d << 32) | __a; \
217} while (0)
218 185
219#define write_tsc(val1,val2) wrmsr(0x10, val1, val2) 186#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
220 187
221#define write_rdtscp_aux(val) wrmsr(0xc0000103, val, 0) 188#define write_rdtscp_aux(val) wrmsr(0xc0000103, val, 0)
222 189
223#define rdpmc(counter,low,high) \
224 __asm__ __volatile__("rdpmc" \
225 : "=a" (low), "=d" (high) \
226 : "c" (counter))
227
228
229static inline void cpuid(int op, unsigned int *eax, unsigned int *ebx,
230 unsigned int *ecx, unsigned int *edx)
231{
232 __asm__("cpuid"
233 : "=a" (*eax),
234 "=b" (*ebx),
235 "=c" (*ecx),
236 "=d" (*edx)
237 : "0" (op));
238}
239
240/* Some CPUID calls want 'count' to be placed in ecx */
241static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
242 int *edx)
243{
244 __asm__("cpuid"
245 : "=a" (*eax),
246 "=b" (*ebx),
247 "=c" (*ecx),
248 "=d" (*edx)
249 : "0" (op), "c" (count));
250}
251
252/*
253 * CPUID functions returning a single datum
254 */
255static inline unsigned int cpuid_eax(unsigned int op)
256{
257 unsigned int eax;
258
259 __asm__("cpuid"
260 : "=a" (eax)
261 : "0" (op)
262 : "bx", "cx", "dx");
263 return eax;
264}
265static inline unsigned int cpuid_ebx(unsigned int op)
266{
267 unsigned int eax, ebx;
268
269 __asm__("cpuid"
270 : "=a" (eax), "=b" (ebx)
271 : "0" (op)
272 : "cx", "dx" );
273 return ebx;
274}
275static inline unsigned int cpuid_ecx(unsigned int op)
276{
277 unsigned int eax, ecx;
278
279 __asm__("cpuid"
280 : "=a" (eax), "=c" (ecx)
281 : "0" (op)
282 : "bx", "dx" );
283 return ecx;
284}
285static inline unsigned int cpuid_edx(unsigned int op)
286{
287 unsigned int eax, edx;
288
289 __asm__("cpuid"
290 : "=a" (eax), "=d" (edx)
291 : "0" (op)
292 : "bx", "cx");
293 return edx;
294}
295
296#ifdef __KERNEL__
297
298/* wrmsr with exception handling */
299#define wrmsr_safe(msr,a,b) ({ int ret__; \
300 asm volatile("2: wrmsr ; xorl %0,%0\n" \
301 "1:\n\t" \
302 ".section .fixup,\"ax\"\n\t" \
303 "3: movl %4,%0 ; jmp 1b\n\t" \
304 ".previous\n\t" \
305 ".section __ex_table,\"a\"\n" \
306 " .align 8\n\t" \
307 " .quad 2b,3b\n\t" \
308 ".previous" \
309 : "=a" (ret__) \
310 : "c" (msr), "0" (a), "d" (b), "i" (-EFAULT)); \
311 ret__; })
312
313#define checking_wrmsrl(msr,val) wrmsr_safe(msr,(u32)(val),(u32)((val)>>32))
314
315#define rdmsr_safe(msr,a,b) \
316 ({ int ret__; \
317 asm volatile ("1: rdmsr\n" \
318 "2:\n" \
319 ".section .fixup,\"ax\"\n" \
320 "3: movl %4,%0\n" \
321 " jmp 2b\n" \
322 ".previous\n" \
323 ".section __ex_table,\"a\"\n" \
324 " .align 8\n" \
325 " .quad 1b,3b\n" \
326 ".previous":"=&bDS" (ret__), "=a"(*(a)), "=d"(*(b)) \
327 :"c"(msr), "i"(-EIO), "0"(0)); \
328 ret__; })
329
330#ifdef CONFIG_SMP 190#ifdef CONFIG_SMP
331void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); 191void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
332void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); 192void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
@@ -350,9 +210,8 @@ static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
350 return wrmsr_safe(msr_no, l, h); 210 return wrmsr_safe(msr_no, l, h);
351} 211}
352#endif /* CONFIG_SMP */ 212#endif /* CONFIG_SMP */
353#endif /* __KERNEL__ */ 213#endif /* __ASSEMBLY__ */
354#endif /* __ASSEMBLY__ */ 214#endif /* __KERNEL__ */
355 215
356#endif /* !__i386__ */
357 216
358#endif 217#endif
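[Editor's note: a brief usage sketch of the now-unified accessors; illustrative only, MSR_K8_TOP_MEM1 is just a convenient example register from <asm/msr-index.h>.]

	u32 lo, hi;
	u64 top_mem, tsc;
	int err;

	rdmsr(MSR_K8_TOP_MEM1, lo, hi);		/* value split into 32-bit halves */
	rdmsrl(MSR_K8_TOP_MEM1, top_mem);	/* same value as one 64-bit word */
	err = wrmsr_safe(MSR_K8_TOP_MEM1, lo, hi);	/* write back, surviving a #GP */
	rdtscll(tsc);				/* read the time stamp counter */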
diff --git a/include/asm-x86/mtrr.h b/include/asm-x86/mtrr.h
index e8320e4e6ca2..319d065800be 100644
--- a/include/asm-x86/mtrr.h
+++ b/include/asm-x86/mtrr.h
@@ -89,24 +89,25 @@ struct mtrr_gentry
89extern void mtrr_save_fixed_ranges(void *); 89extern void mtrr_save_fixed_ranges(void *);
90extern void mtrr_save_state(void); 90extern void mtrr_save_state(void);
91extern int mtrr_add (unsigned long base, unsigned long size, 91extern int mtrr_add (unsigned long base, unsigned long size,
92 unsigned int type, char increment); 92 unsigned int type, bool increment);
93extern int mtrr_add_page (unsigned long base, unsigned long size, 93extern int mtrr_add_page (unsigned long base, unsigned long size,
94 unsigned int type, char increment); 94 unsigned int type, bool increment);
95extern int mtrr_del (int reg, unsigned long base, unsigned long size); 95extern int mtrr_del (int reg, unsigned long base, unsigned long size);
96extern int mtrr_del_page (int reg, unsigned long base, unsigned long size); 96extern int mtrr_del_page (int reg, unsigned long base, unsigned long size);
97extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi); 97extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi);
98extern void mtrr_ap_init(void); 98extern void mtrr_ap_init(void);
99extern void mtrr_bp_init(void); 99extern void mtrr_bp_init(void);
100extern int mtrr_trim_uncached_memory(unsigned long end_pfn);
100# else 101# else
101#define mtrr_save_fixed_ranges(arg) do {} while (0) 102#define mtrr_save_fixed_ranges(arg) do {} while (0)
102#define mtrr_save_state() do {} while (0) 103#define mtrr_save_state() do {} while (0)
103static __inline__ int mtrr_add (unsigned long base, unsigned long size, 104static __inline__ int mtrr_add (unsigned long base, unsigned long size,
104 unsigned int type, char increment) 105 unsigned int type, bool increment)
105{ 106{
106 return -ENODEV; 107 return -ENODEV;
107} 108}
108static __inline__ int mtrr_add_page (unsigned long base, unsigned long size, 109static __inline__ int mtrr_add_page (unsigned long base, unsigned long size,
109 unsigned int type, char increment) 110 unsigned int type, bool increment)
110{ 111{
111 return -ENODEV; 112 return -ENODEV;
112} 113}
@@ -120,7 +121,10 @@ static __inline__ int mtrr_del_page (int reg, unsigned long base,
120{ 121{
121 return -ENODEV; 122 return -ENODEV;
122} 123}
123 124static inline int mtrr_trim_uncached_memory(unsigned long end_pfn)
125{
126 return 0;
127}
124static __inline__ void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) {;} 128static __inline__ void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) {;}
125 129
126#define mtrr_ap_init() do {} while (0) 130#define mtrr_ap_init() do {} while (0)
diff --git a/include/asm-x86/mutex_32.h b/include/asm-x86/mutex_32.h
index 7a17d9e58ad6..bbeefb96ddfd 100644
--- a/include/asm-x86/mutex_32.h
+++ b/include/asm-x86/mutex_32.h
@@ -26,7 +26,7 @@ do { \
26 unsigned int dummy; \ 26 unsigned int dummy; \
27 \ 27 \
28 typecheck(atomic_t *, count); \ 28 typecheck(atomic_t *, count); \
29 typecheck_fn(fastcall void (*)(atomic_t *), fail_fn); \ 29 typecheck_fn(void (*)(atomic_t *), fail_fn); \
30 \ 30 \
31 __asm__ __volatile__( \ 31 __asm__ __volatile__( \
32 LOCK_PREFIX " decl (%%eax) \n" \ 32 LOCK_PREFIX " decl (%%eax) \n" \
@@ -51,8 +51,7 @@ do { \
51 * or anything the slow path function returns 51 * or anything the slow path function returns
52 */ 52 */
53static inline int 53static inline int
54__mutex_fastpath_lock_retval(atomic_t *count, 54__mutex_fastpath_lock_retval(atomic_t *count, int (*fail_fn)(atomic_t *))
55 int fastcall (*fail_fn)(atomic_t *))
56{ 55{
57 if (unlikely(atomic_dec_return(count) < 0)) 56 if (unlikely(atomic_dec_return(count) < 0))
58 return fail_fn(count); 57 return fail_fn(count);
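
__mutex_fastpath_lock_retval() only drops into the slow path when the atomic decrement takes the count negative. A rough user-space analogy with C11 atomics, assuming that contract but not the kernel implementation:

	#include <stdatomic.h>
	#include <stdio.h>

	static int slow_path(atomic_int *count)
	{
		(void)count;
		return 1;	/* stand-in for the contended case */
	}

	/* Fastpath: decrement; call fail_fn only if the new value went negative. */
	static int fastpath_lock_retval(atomic_int *count, int (*fail_fn)(atomic_int *))
	{
		if (atomic_fetch_sub(count, 1) - 1 < 0)
			return fail_fn(count);
		return 0;
	}

	int main(void)
	{
		atomic_int count = 1;
		printf("first lock:  %d\n", fastpath_lock_retval(&count, slow_path));	/* 0 */
		printf("second lock: %d\n", fastpath_lock_retval(&count, slow_path));	/* 1 */
		return 0;
	}
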
@@ -78,7 +77,7 @@ do { \
78 unsigned int dummy; \ 77 unsigned int dummy; \
79 \ 78 \
80 typecheck(atomic_t *, count); \ 79 typecheck(atomic_t *, count); \
81 typecheck_fn(fastcall void (*)(atomic_t *), fail_fn); \ 80 typecheck_fn(void (*)(atomic_t *), fail_fn); \
82 \ 81 \
83 __asm__ __volatile__( \ 82 __asm__ __volatile__( \
84 LOCK_PREFIX " incl (%%eax) \n" \ 83 LOCK_PREFIX " incl (%%eax) \n" \
diff --git a/include/asm-x86/nmi_32.h b/include/asm-x86/nmi_32.h
index 70a958a8e381..7206c7e8a388 100644
--- a/include/asm-x86/nmi_32.h
+++ b/include/asm-x86/nmi_32.h
@@ -1,6 +1,3 @@
1/*
2 * linux/include/asm-i386/nmi.h
3 */
4#ifndef ASM_NMI_H 1#ifndef ASM_NMI_H
5#define ASM_NMI_H 2#define ASM_NMI_H
6 3
diff --git a/include/asm-x86/nmi_64.h b/include/asm-x86/nmi_64.h
index 65b6acf3bb59..2eeb74e5f3ff 100644
--- a/include/asm-x86/nmi_64.h
+++ b/include/asm-x86/nmi_64.h
@@ -1,6 +1,3 @@
1/*
2 * linux/include/asm-i386/nmi.h
3 */
4#ifndef ASM_NMI_H 1#ifndef ASM_NMI_H
5#define ASM_NMI_H 2#define ASM_NMI_H
6 3
@@ -41,7 +38,6 @@ extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
41 38
42#define get_nmi_reason() inb(0x61) 39#define get_nmi_reason() inb(0x61)
43 40
44extern int panic_on_timeout;
45extern int unknown_nmi_panic; 41extern int unknown_nmi_panic;
46extern int nmi_watchdog_enabled; 42extern int nmi_watchdog_enabled;
47 43
@@ -60,7 +56,6 @@ extern void enable_timer_nmi_watchdog(void);
60extern int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason); 56extern int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason);
61 57
62extern void nmi_watchdog_default(void); 58extern void nmi_watchdog_default(void);
63extern int setup_nmi_watchdog(char *);
64 59
65extern atomic_t nmi_active; 60extern atomic_t nmi_active;
66extern unsigned int nmi_watchdog; 61extern unsigned int nmi_watchdog;
diff --git a/include/asm-x86/nops.h b/include/asm-x86/nops.h
new file mode 100644
index 000000000000..fec025c7f58c
--- /dev/null
+++ b/include/asm-x86/nops.h
@@ -0,0 +1,90 @@
1#ifndef _ASM_NOPS_H
2#define _ASM_NOPS_H 1
3
4/* Define nops for use with alternative() */
5
6/* generic versions from gas */
7#define GENERIC_NOP1 ".byte 0x90\n"
8#define GENERIC_NOP2 ".byte 0x89,0xf6\n"
9#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n"
10#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n"
11#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4
12#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n"
13#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n"
14#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7
15
16/* Opteron 64bit nops */
17#define K8_NOP1 GENERIC_NOP1
18#define K8_NOP2 ".byte 0x66,0x90\n"
19#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
20#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
21#define K8_NOP5 K8_NOP3 K8_NOP2
22#define K8_NOP6 K8_NOP3 K8_NOP3
23#define K8_NOP7 K8_NOP4 K8_NOP3
24#define K8_NOP8 K8_NOP4 K8_NOP4
25
26/* K7 nops */
27/* uses eax dependencies (arbitrary choice) */
28#define K7_NOP1 GENERIC_NOP1
29#define K7_NOP2 ".byte 0x8b,0xc0\n"
30#define K7_NOP3 ".byte 0x8d,0x04,0x20\n"
31#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n"
32#define K7_NOP5 K7_NOP4 ASM_NOP1
33#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n"
34#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n"
35#define K7_NOP8 K7_NOP7 ASM_NOP1
36
37/* P6 nops */
38/* uses eax dependencies (Intel-recommended choice) */
39#define P6_NOP1 GENERIC_NOP1
40#define P6_NOP2 ".byte 0x66,0x90\n"
41#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n"
42#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n"
43#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n"
44#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n"
45#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n"
46#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n"
47
48#if defined(CONFIG_MK8)
49#define ASM_NOP1 K8_NOP1
50#define ASM_NOP2 K8_NOP2
51#define ASM_NOP3 K8_NOP3
52#define ASM_NOP4 K8_NOP4
53#define ASM_NOP5 K8_NOP5
54#define ASM_NOP6 K8_NOP6
55#define ASM_NOP7 K8_NOP7
56#define ASM_NOP8 K8_NOP8
57#elif defined(CONFIG_MK7)
58#define ASM_NOP1 K7_NOP1
59#define ASM_NOP2 K7_NOP2
60#define ASM_NOP3 K7_NOP3
61#define ASM_NOP4 K7_NOP4
62#define ASM_NOP5 K7_NOP5
63#define ASM_NOP6 K7_NOP6
64#define ASM_NOP7 K7_NOP7
65#define ASM_NOP8 K7_NOP8
66#elif defined(CONFIG_M686) || defined(CONFIG_MPENTIUMII) || \
67 defined(CONFIG_MPENTIUMIII) || defined(CONFIG_MPENTIUMM) || \
68 defined(CONFIG_MCORE2) || defined(CONFIG_PENTIUM4)
69#define ASM_NOP1 P6_NOP1
70#define ASM_NOP2 P6_NOP2
71#define ASM_NOP3 P6_NOP3
72#define ASM_NOP4 P6_NOP4
73#define ASM_NOP5 P6_NOP5
74#define ASM_NOP6 P6_NOP6
75#define ASM_NOP7 P6_NOP7
76#define ASM_NOP8 P6_NOP8
77#else
78#define ASM_NOP1 GENERIC_NOP1
79#define ASM_NOP2 GENERIC_NOP2
80#define ASM_NOP3 GENERIC_NOP3
81#define ASM_NOP4 GENERIC_NOP4
82#define ASM_NOP5 GENERIC_NOP5
83#define ASM_NOP6 GENERIC_NOP6
84#define ASM_NOP7 GENERIC_NOP7
85#define ASM_NOP8 GENERIC_NOP8
86#endif
87
88#define ASM_NOP_MAX 8
89
90#endif
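
These byte strings are meant to be pasted directly into inline-asm templates, typically as alternative() padding. A small sketch, assuming GCC inline asm on an x86 target, showing how the 5-byte K8 form expands:

	/* K8_NOP5 is K8_NOP3 K8_NOP2: five bytes of operand-size-prefixed nops,
	 * true no-ops on any x86 CPU. */
	#define K8_NOP2 ".byte 0x66,0x90\n"
	#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
	#define K8_NOP5 K8_NOP3 K8_NOP2

	static inline void pad5(void)
	{
		asm volatile(K8_NOP5);	/* emits exactly 5 bytes of padding */
	}

	int main(void)
	{
		pad5();
		return 0;
	}
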
diff --git a/include/asm-x86/numa_32.h b/include/asm-x86/numa_32.h
index 96fcb157db1d..03d0f7a9bf02 100644
--- a/include/asm-x86/numa_32.h
+++ b/include/asm-x86/numa_32.h
@@ -1,3 +1,15 @@
1#ifndef _ASM_X86_32_NUMA_H
2#define _ASM_X86_32_NUMA_H 1
1 3
2int pxm_to_nid(int pxm); 4extern int pxm_to_nid(int pxm);
3 5
6#ifdef CONFIG_NUMA
7extern void __init remap_numa_kva(void);
8extern void set_highmem_pages_init(int);
9#else
10static inline void remap_numa_kva(void)
11{
12}
13#endif
14
15#endif /* _ASM_X86_32_NUMA_H */
diff --git a/include/asm-x86/numa_64.h b/include/asm-x86/numa_64.h
index 0cc5c97a7fc9..15fe07cde586 100644
--- a/include/asm-x86/numa_64.h
+++ b/include/asm-x86/numa_64.h
@@ -20,13 +20,19 @@ extern void numa_set_node(int cpu, int node);
20extern void srat_reserve_add_area(int nodeid); 20extern void srat_reserve_add_area(int nodeid);
21extern int hotadd_percent; 21extern int hotadd_percent;
22 22
23extern unsigned char apicid_to_node[MAX_LOCAL_APIC]; 23extern s16 apicid_to_node[MAX_LOCAL_APIC];
24
25extern void numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn);
26extern unsigned long numa_free_all_bootmem(void);
27extern void setup_node_bootmem(int nodeid, unsigned long start,
28 unsigned long end);
29
24#ifdef CONFIG_NUMA 30#ifdef CONFIG_NUMA
25extern void __init init_cpu_to_node(void); 31extern void __init init_cpu_to_node(void);
26 32
27static inline void clear_node_cpumask(int cpu) 33static inline void clear_node_cpumask(int cpu)
28{ 34{
29 clear_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]); 35 clear_bit(cpu, (unsigned long *)&node_to_cpumask_map[cpu_to_node(cpu)]);
30} 36}
31 37
32#else 38#else
@@ -34,6 +40,4 @@ static inline void clear_node_cpumask(int cpu)
34#define clear_node_cpumask(cpu) do {} while (0) 40#define clear_node_cpumask(cpu) do {} while (0)
35#endif 41#endif
36 42
37#define NUMA_NO_NODE 0xff
38
39#endif 43#endif
diff --git a/include/asm-x86/page.h b/include/asm-x86/page.h
index a757eb26141d..c8b30efeed85 100644
--- a/include/asm-x86/page.h
+++ b/include/asm-x86/page.h
@@ -1,13 +1,183 @@
1#ifndef _ASM_X86_PAGE_H
2#define _ASM_X86_PAGE_H
3
4#include <linux/const.h>
5
6/* PAGE_SHIFT determines the page size */
7#define PAGE_SHIFT 12
8#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
9#define PAGE_MASK (~(PAGE_SIZE-1))
10
1#ifdef __KERNEL__ 11#ifdef __KERNEL__
2# ifdef CONFIG_X86_32 12
3# include "page_32.h" 13#define PHYSICAL_PAGE_MASK (PAGE_MASK & __PHYSICAL_MASK)
4# else 14#define PTE_MASK (_AT(long, PHYSICAL_PAGE_MASK))
5# include "page_64.h" 15
6# endif 16#define LARGE_PAGE_SIZE (_AC(1,UL) << PMD_SHIFT)
17#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
18
19#define HPAGE_SHIFT PMD_SHIFT
20#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)
21#define HPAGE_MASK (~(HPAGE_SIZE - 1))
22#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
23
24/* to align the pointer to the (next) page boundary */
25#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
26
27#define __PHYSICAL_MASK _AT(phys_addr_t, (_AC(1,ULL) << __PHYSICAL_MASK_SHIFT) - 1)
28#define __VIRTUAL_MASK ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1)
29
30#ifndef __ASSEMBLY__
31#include <linux/types.h>
32#endif
33
34#ifdef CONFIG_X86_64
35#include <asm/page_64.h>
36#define max_pfn_mapped end_pfn_map
7#else 37#else
8# ifdef __i386__ 38#include <asm/page_32.h>
9# include "page_32.h" 39#define max_pfn_mapped max_low_pfn
10# else 40#endif /* CONFIG_X86_64 */
11# include "page_64.h" 41
12# endif 42#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
43
44#define VM_DATA_DEFAULT_FLAGS \
45 (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
46 VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
47
48
49#ifndef __ASSEMBLY__
50
51extern int page_is_ram(unsigned long pagenr);
52
53struct page;
54
55static void inline clear_user_page(void *page, unsigned long vaddr,
56 struct page *pg)
57{
58 clear_page(page);
59}
60
61static void inline copy_user_page(void *to, void *from, unsigned long vaddr,
62 struct page *topage)
63{
64 copy_page(to, from);
65}
66
67#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
68 alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
69#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
70
71typedef struct { pgdval_t pgd; } pgd_t;
72typedef struct { pgprotval_t pgprot; } pgprot_t;
73
74static inline pgd_t native_make_pgd(pgdval_t val)
75{
76 return (pgd_t) { val };
77}
78
79static inline pgdval_t native_pgd_val(pgd_t pgd)
80{
81 return pgd.pgd;
82}
83
84#if PAGETABLE_LEVELS >= 3
85#if PAGETABLE_LEVELS == 4
86typedef struct { pudval_t pud; } pud_t;
87
88static inline pud_t native_make_pud(pmdval_t val)
89{
90 return (pud_t) { val };
91}
92
93static inline pudval_t native_pud_val(pud_t pud)
94{
95 return pud.pud;
96}
97#else /* PAGETABLE_LEVELS == 3 */
98#include <asm-generic/pgtable-nopud.h>
99
100static inline pudval_t native_pud_val(pud_t pud)
101{
102 return native_pgd_val(pud.pgd);
103}
104#endif /* PAGETABLE_LEVELS == 4 */
105
106typedef struct { pmdval_t pmd; } pmd_t;
107
108static inline pmd_t native_make_pmd(pmdval_t val)
109{
110 return (pmd_t) { val };
111}
112
113static inline pmdval_t native_pmd_val(pmd_t pmd)
114{
115 return pmd.pmd;
116}
117#else /* PAGETABLE_LEVELS == 2 */
118#include <asm-generic/pgtable-nopmd.h>
119
120static inline pmdval_t native_pmd_val(pmd_t pmd)
121{
122 return native_pgd_val(pmd.pud.pgd);
123}
124#endif /* PAGETABLE_LEVELS >= 3 */
125
126static inline pte_t native_make_pte(pteval_t val)
127{
128 return (pte_t) { .pte = val };
129}
130
131static inline pteval_t native_pte_val(pte_t pte)
132{
133 return pte.pte;
134}
135
136#define pgprot_val(x) ((x).pgprot)
137#define __pgprot(x) ((pgprot_t) { (x) } )
138
139#ifdef CONFIG_PARAVIRT
140#include <asm/paravirt.h>
141#else /* !CONFIG_PARAVIRT */
142
143#define pgd_val(x) native_pgd_val(x)
144#define __pgd(x) native_make_pgd(x)
145
146#ifndef __PAGETABLE_PUD_FOLDED
147#define pud_val(x) native_pud_val(x)
148#define __pud(x) native_make_pud(x)
149#endif
150
151#ifndef __PAGETABLE_PMD_FOLDED
152#define pmd_val(x) native_pmd_val(x)
153#define __pmd(x) native_make_pmd(x)
13#endif 154#endif
155
156#define pte_val(x) native_pte_val(x)
157#define __pte(x) native_make_pte(x)
158
159#endif /* CONFIG_PARAVIRT */
160
161#define __pa(x) __phys_addr((unsigned long)(x))
162/* __pa_symbol should be used for C visible symbols.
163 This seems to be the official gcc blessed way to do such arithmetic. */
164#define __pa_symbol(x) __pa(__phys_reloc_hide((unsigned long)(x)))
165
166#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
167
168#define __boot_va(x) __va(x)
169#define __boot_pa(x) __pa(x)
170
171#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
172#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
173#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
174
175#endif /* __ASSEMBLY__ */
176
177#include <asm-generic/memory_model.h>
178#include <asm-generic/page.h>
179
180#define __HAVE_ARCH_GATE_AREA 1
181
182#endif /* __KERNEL__ */
183#endif /* _ASM_X86_PAGE_H */
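
PAGE_ALIGN() above rounds an address up to the next page boundary via PAGE_MASK. The same arithmetic, reproduced stand-alone with the 4 KiB page size used here:

	#include <stdio.h>

	#define PAGE_SHIFT	12
	#define PAGE_SIZE	(1UL << PAGE_SHIFT)
	#define PAGE_MASK	(~(PAGE_SIZE - 1))
	/* same rounding as the kernel's PAGE_ALIGN() */
	#define PAGE_ALIGN(addr)	(((addr) + PAGE_SIZE - 1) & PAGE_MASK)

	int main(void)
	{
		unsigned long addr = 0x12345;	/* arbitrary example address */
		printf("%#lx -> %#lx\n", addr, PAGE_ALIGN(addr));	/* 0x12345 -> 0x13000 */
		return 0;
	}
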
diff --git a/include/asm-x86/page_32.h b/include/asm-x86/page_32.h
index 80ecc66b6d86..a6fd10f230d2 100644
--- a/include/asm-x86/page_32.h
+++ b/include/asm-x86/page_32.h
@@ -1,206 +1,107 @@
1#ifndef _I386_PAGE_H 1#ifndef _ASM_X86_PAGE_32_H
2#define _I386_PAGE_H 2#define _ASM_X86_PAGE_32_H
3
4/* PAGE_SHIFT determines the page size */
5#define PAGE_SHIFT 12
6#define PAGE_SIZE (1UL << PAGE_SHIFT)
7#define PAGE_MASK (~(PAGE_SIZE-1))
8
9#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
10#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
11
12#ifdef __KERNEL__
13#ifndef __ASSEMBLY__
14
15#ifdef CONFIG_X86_USE_3DNOW
16
17#include <asm/mmx.h>
18
19#define clear_page(page) mmx_clear_page((void *)(page))
20#define copy_page(to,from) mmx_copy_page(to,from)
21
22#else
23 3
24/* 4/*
25 * On older X86 processors it's not a win to use MMX here it seems. 5 * This handles the memory map.
26 * Maybe the K6-III ? 6 *
27 */ 7 * A __PAGE_OFFSET of 0xC0000000 means that the kernel has
28 8 * a virtual address space of one gigabyte, which limits the
29#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE) 9 * amount of physical memory you can use to about 950MB.
30#define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE) 10 *
31 11 * If you want more physical memory than this then see the CONFIG_HIGHMEM4G
32#endif 12 * and CONFIG_HIGHMEM64G options in the kernel configuration.
33
34#define clear_user_page(page, vaddr, pg) clear_page(page)
35#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
36
37#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
38 alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
39#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
40
41/*
42 * These are used to make use of C type-checking..
43 */ 13 */
44extern int nx_enabled; 14#define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL)
45 15
46#ifdef CONFIG_X86_PAE 16#ifdef CONFIG_X86_PAE
47typedef struct { unsigned long pte_low, pte_high; } pte_t; 17#define __PHYSICAL_MASK_SHIFT 36
48typedef struct { unsigned long long pmd; } pmd_t; 18#define __VIRTUAL_MASK_SHIFT 32
49typedef struct { unsigned long long pgd; } pgd_t; 19#define PAGETABLE_LEVELS 3
50typedef struct { unsigned long long pgprot; } pgprot_t;
51 20
52static inline unsigned long long native_pgd_val(pgd_t pgd) 21#ifndef __ASSEMBLY__
53{ 22typedef u64 pteval_t;
54 return pgd.pgd; 23typedef u64 pmdval_t;
55} 24typedef u64 pudval_t;
56 25typedef u64 pgdval_t;
57static inline unsigned long long native_pmd_val(pmd_t pmd) 26typedef u64 pgprotval_t;
58{ 27typedef u64 phys_addr_t;
59 return pmd.pmd; 28
60} 29typedef union {
61 30 struct {
62static inline unsigned long long native_pte_val(pte_t pte) 31 unsigned long pte_low, pte_high;
63{ 32 };
64 return pte.pte_low | ((unsigned long long)pte.pte_high << 32); 33 pteval_t pte;
65} 34} pte_t;
66 35#endif /* __ASSEMBLY__
67static inline pgd_t native_make_pgd(unsigned long long val) 36 */
68{
69 return (pgd_t) { val };
70}
71
72static inline pmd_t native_make_pmd(unsigned long long val)
73{
74 return (pmd_t) { val };
75}
76
77static inline pte_t native_make_pte(unsigned long long val)
78{
79 return (pte_t) { .pte_low = val, .pte_high = (val >> 32) } ;
80}
81
82#ifndef CONFIG_PARAVIRT
83#define pmd_val(x) native_pmd_val(x)
84#define __pmd(x) native_make_pmd(x)
85#endif
86
87#define HPAGE_SHIFT 21
88#include <asm-generic/pgtable-nopud.h>
89#else /* !CONFIG_X86_PAE */ 37#else /* !CONFIG_X86_PAE */
90typedef struct { unsigned long pte_low; } pte_t; 38#define __PHYSICAL_MASK_SHIFT 32
91typedef struct { unsigned long pgd; } pgd_t; 39#define __VIRTUAL_MASK_SHIFT 32
92typedef struct { unsigned long pgprot; } pgprot_t; 40#define PAGETABLE_LEVELS 2
93#define boot_pte_t pte_t /* or would you rather have a typedef */
94
95static inline unsigned long native_pgd_val(pgd_t pgd)
96{
97 return pgd.pgd;
98}
99 41
100static inline unsigned long native_pte_val(pte_t pte) 42#ifndef __ASSEMBLY__
101{ 43typedef unsigned long pteval_t;
102 return pte.pte_low; 44typedef unsigned long pmdval_t;
103} 45typedef unsigned long pudval_t;
104 46typedef unsigned long pgdval_t;
105static inline pgd_t native_make_pgd(unsigned long val) 47typedef unsigned long pgprotval_t;
106{ 48typedef unsigned long phys_addr_t;
107 return (pgd_t) { val };
108}
109 49
110static inline pte_t native_make_pte(unsigned long val) 50typedef union { pteval_t pte, pte_low; } pte_t;
111{ 51typedef pte_t boot_pte_t;
112 return (pte_t) { .pte_low = val };
113}
114 52
115#define HPAGE_SHIFT 22 53#endif /* __ASSEMBLY__ */
116#include <asm-generic/pgtable-nopmd.h>
117#endif /* CONFIG_X86_PAE */ 54#endif /* CONFIG_X86_PAE */
118 55
119#define PTE_MASK PAGE_MASK
120
121#ifdef CONFIG_HUGETLB_PAGE 56#ifdef CONFIG_HUGETLB_PAGE
122#define HPAGE_SIZE ((1UL) << HPAGE_SHIFT)
123#define HPAGE_MASK (~(HPAGE_SIZE - 1))
124#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
125#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA 57#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
126#endif 58#endif
127 59
128#define pgprot_val(x) ((x).pgprot)
129#define __pgprot(x) ((pgprot_t) { (x) } )
130
131#ifndef CONFIG_PARAVIRT
132#define pgd_val(x) native_pgd_val(x)
133#define __pgd(x) native_make_pgd(x)
134#define pte_val(x) native_pte_val(x)
135#define __pte(x) native_make_pte(x)
136#endif
137
138#endif /* !__ASSEMBLY__ */
139
140/* to align the pointer to the (next) page boundary */
141#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
142
143/*
144 * This handles the memory map.. We could make this a config
145 * option, but too many people screw it up, and too few need
146 * it.
147 *
148 * A __PAGE_OFFSET of 0xC0000000 means that the kernel has
149 * a virtual address space of one gigabyte, which limits the
150 * amount of physical memory you can use to about 950MB.
151 *
152 * If you want more physical memory than this then see the CONFIG_HIGHMEM4G
153 * and CONFIG_HIGHMEM64G options in the kernel configuration.
154 */
155
156#ifndef __ASSEMBLY__ 60#ifndef __ASSEMBLY__
61#define __phys_addr(x) ((x)-PAGE_OFFSET)
62#define __phys_reloc_hide(x) RELOC_HIDE((x), 0)
63
64#ifdef CONFIG_FLATMEM
65#define pfn_valid(pfn) ((pfn) < max_mapnr)
66#endif /* CONFIG_FLATMEM */
157 67
158struct vm_area_struct; 68extern int nx_enabled;
159 69
160/* 70/*
161 * This much address space is reserved for vmalloc() and iomap() 71 * This much address space is reserved for vmalloc() and iomap()
162 * as well as fixmap mappings. 72 * as well as fixmap mappings.
163 */ 73 */
164extern unsigned int __VMALLOC_RESERVE; 74extern unsigned int __VMALLOC_RESERVE;
165
166extern int sysctl_legacy_va_layout; 75extern int sysctl_legacy_va_layout;
167 76
168extern int page_is_ram(unsigned long pagenr);
169
170#endif /* __ASSEMBLY__ */
171
172#ifdef __ASSEMBLY__
173#define __PAGE_OFFSET CONFIG_PAGE_OFFSET
174#else
175#define __PAGE_OFFSET ((unsigned long)CONFIG_PAGE_OFFSET)
176#endif
177
178
179#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
180#define VMALLOC_RESERVE ((unsigned long)__VMALLOC_RESERVE) 77#define VMALLOC_RESERVE ((unsigned long)__VMALLOC_RESERVE)
181#define MAXMEM (-__PAGE_OFFSET-__VMALLOC_RESERVE) 78#define MAXMEM (-__PAGE_OFFSET-__VMALLOC_RESERVE)
182#define __pa(x) ((unsigned long)(x)-PAGE_OFFSET)
183/* __pa_symbol should be used for C visible symbols.
184 This seems to be the official gcc blessed way to do such arithmetic. */
185#define __pa_symbol(x) __pa(RELOC_HIDE((unsigned long)(x),0))
186#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
187#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
188#ifdef CONFIG_FLATMEM
189#define pfn_valid(pfn) ((pfn) < max_mapnr)
190#endif /* CONFIG_FLATMEM */
191#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
192 79
193#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) 80#ifdef CONFIG_X86_USE_3DNOW
81#include <asm/mmx.h>
82
83static inline void clear_page(void *page)
84{
85 mmx_clear_page(page);
86}
194 87
195#define VM_DATA_DEFAULT_FLAGS \ 88static inline void copy_page(void *to, void *from)
196 (VM_READ | VM_WRITE | \ 89{
197 ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \ 90 mmx_copy_page(to, from);
198 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) 91}
92#else /* !CONFIG_X86_USE_3DNOW */
93#include <linux/string.h>
199 94
200#include <asm-generic/memory_model.h> 95static inline void clear_page(void *page)
201#include <asm-generic/page.h> 96{
97 memset(page, 0, PAGE_SIZE);
98}
202 99
203#define __HAVE_ARCH_GATE_AREA 1 100static inline void copy_page(void *to, void *from)
204#endif /* __KERNEL__ */ 101{
102 memcpy(to, from, PAGE_SIZE);
103}
 104#endif /* CONFIG_X86_USE_3DNOW */
105#endif /* !__ASSEMBLY__ */
205 106
206#endif /* _I386_PAGE_H */ 107#endif /* _ASM_X86_PAGE_32_H */
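
On i386 the lowmem virtual/physical conversion is pure offset arithmetic around PAGE_OFFSET, which is all the new __phys_addr() definition does. A stand-alone sketch assuming the default 3G/1G split (0xC0000000 is the usual CONFIG_PAGE_OFFSET; the physical address is made up):

	#include <stdio.h>

	#define PAGE_OFFSET	0xC0000000UL	/* default i386 CONFIG_PAGE_OFFSET */
	#define __pa(x)	((unsigned long)(x) - PAGE_OFFSET)
	#define __va(x)	((void *)((unsigned long)(x) + PAGE_OFFSET))

	int main(void)
	{
		unsigned long phys = 0x00100000UL;		/* example: 1 MiB physical */
		unsigned long virt = (unsigned long)__va(phys);	/* 0xc0100000 lowmem virtual */

		printf("phys %#lx <-> virt %#lx <-> phys %#lx\n",
		       phys, virt, __pa(virt));
		return 0;
	}
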
diff --git a/include/asm-x86/page_64.h b/include/asm-x86/page_64.h
index c3b52bcb171e..c1ac42d8707f 100644
--- a/include/asm-x86/page_64.h
+++ b/include/asm-x86/page_64.h
@@ -1,15 +1,9 @@
1#ifndef _X86_64_PAGE_H 1#ifndef _X86_64_PAGE_H
2#define _X86_64_PAGE_H 2#define _X86_64_PAGE_H
3 3
4#include <linux/const.h> 4#define PAGETABLE_LEVELS 4
5 5
6/* PAGE_SHIFT determines the page size */ 6#define THREAD_ORDER 1
7#define PAGE_SHIFT 12
8#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
9#define PAGE_MASK (~(PAGE_SIZE-1))
10#define PHYSICAL_PAGE_MASK (~(PAGE_SIZE-1) & __PHYSICAL_MASK)
11
12#define THREAD_ORDER 1
13#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) 7#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
14#define CURRENT_MASK (~(THREAD_SIZE-1)) 8#define CURRENT_MASK (~(THREAD_SIZE-1))
15 9
@@ -29,54 +23,7 @@
29#define MCE_STACK 5 23#define MCE_STACK 5
30#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */ 24#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */
31 25
32#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1)) 26#define __PAGE_OFFSET _AC(0xffff810000000000, UL)
33#define LARGE_PAGE_SIZE (_AC(1,UL) << PMD_SHIFT)
34
35#define HPAGE_SHIFT PMD_SHIFT
36#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)
37#define HPAGE_MASK (~(HPAGE_SIZE - 1))
38#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
39
40#ifdef __KERNEL__
41#ifndef __ASSEMBLY__
42
43extern unsigned long end_pfn;
44
45void clear_page(void *);
46void copy_page(void *, void *);
47
48#define clear_user_page(page, vaddr, pg) clear_page(page)
49#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
50
51#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
52 alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
53#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
54/*
55 * These are used to make use of C type-checking..
56 */
57typedef struct { unsigned long pte; } pte_t;
58typedef struct { unsigned long pmd; } pmd_t;
59typedef struct { unsigned long pud; } pud_t;
60typedef struct { unsigned long pgd; } pgd_t;
61#define PTE_MASK PHYSICAL_PAGE_MASK
62
63typedef struct { unsigned long pgprot; } pgprot_t;
64
65extern unsigned long phys_base;
66
67#define pte_val(x) ((x).pte)
68#define pmd_val(x) ((x).pmd)
69#define pud_val(x) ((x).pud)
70#define pgd_val(x) ((x).pgd)
71#define pgprot_val(x) ((x).pgprot)
72
73#define __pte(x) ((pte_t) { (x) } )
74#define __pmd(x) ((pmd_t) { (x) } )
75#define __pud(x) ((pud_t) { (x) } )
76#define __pgd(x) ((pgd_t) { (x) } )
77#define __pgprot(x) ((pgprot_t) { (x) } )
78
79#endif /* !__ASSEMBLY__ */
80 27
81#define __PHYSICAL_START CONFIG_PHYSICAL_START 28#define __PHYSICAL_START CONFIG_PHYSICAL_START
82#define __KERNEL_ALIGN 0x200000 29#define __KERNEL_ALIGN 0x200000
@@ -92,53 +39,44 @@ extern unsigned long phys_base;
92 39
93#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) 40#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
94#define __START_KERNEL_map _AC(0xffffffff80000000, UL) 41#define __START_KERNEL_map _AC(0xffffffff80000000, UL)
95#define __PAGE_OFFSET _AC(0xffff810000000000, UL)
96
97/* to align the pointer to the (next) page boundary */
98#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
99 42
100/* See Documentation/x86_64/mm.txt for a description of the memory map. */ 43/* See Documentation/x86_64/mm.txt for a description of the memory map. */
101#define __PHYSICAL_MASK_SHIFT 46 44#define __PHYSICAL_MASK_SHIFT 46
102#define __PHYSICAL_MASK ((_AC(1,UL) << __PHYSICAL_MASK_SHIFT) - 1)
103#define __VIRTUAL_MASK_SHIFT 48 45#define __VIRTUAL_MASK_SHIFT 48
104#define __VIRTUAL_MASK ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1)
105 46
106#define KERNEL_TEXT_SIZE (40*1024*1024) 47#define KERNEL_TEXT_SIZE (40*1024*1024)
107#define KERNEL_TEXT_START _AC(0xffffffff80000000, UL) 48#define KERNEL_TEXT_START _AC(0xffffffff80000000, UL)
108#define PAGE_OFFSET __PAGE_OFFSET
109 49
110#ifndef __ASSEMBLY__ 50#ifndef __ASSEMBLY__
51void clear_page(void *page);
52void copy_page(void *to, void *from);
111 53
112#include <asm/bug.h> 54extern unsigned long end_pfn;
55extern unsigned long end_pfn_map;
56extern unsigned long phys_base;
113 57
114extern unsigned long __phys_addr(unsigned long); 58extern unsigned long __phys_addr(unsigned long);
59#define __phys_reloc_hide(x) (x)
115 60
116#endif /* __ASSEMBLY__ */ 61/*
117 62 * These are used to make use of C type-checking..
118#define __pa(x) __phys_addr((unsigned long)(x)) 63 */
119#define __pa_symbol(x) __phys_addr((unsigned long)(x)) 64typedef unsigned long pteval_t;
120 65typedef unsigned long pmdval_t;
121#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) 66typedef unsigned long pudval_t;
122#define __boot_va(x) __va(x) 67typedef unsigned long pgdval_t;
123#define __boot_pa(x) __pa(x) 68typedef unsigned long pgprotval_t;
124#ifdef CONFIG_FLATMEM 69typedef unsigned long phys_addr_t;
125#define pfn_valid(pfn) ((pfn) < end_pfn)
126#endif
127
128#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
129#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
130#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
131 70
132#define VM_DATA_DEFAULT_FLAGS \ 71typedef struct { pteval_t pte; } pte_t;
133 (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
134 VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
135 72
136#define __HAVE_ARCH_GATE_AREA 1
137#define vmemmap ((struct page *)VMEMMAP_START) 73#define vmemmap ((struct page *)VMEMMAP_START)
138 74
139#include <asm-generic/memory_model.h> 75#endif /* !__ASSEMBLY__ */
140#include <asm-generic/page.h> 76
77#ifdef CONFIG_FLATMEM
78#define pfn_valid(pfn) ((pfn) < end_pfn)
79#endif
141 80
142#endif /* __KERNEL__ */
143 81
144#endif /* _X86_64_PAGE_H */ 82#endif /* _X86_64_PAGE_H */
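
With the mask macros moved into the shared page.h, the x86_64 header only keeps the shift constants: 46 physical and 48 virtual address bits. The resulting masks, worked stand-alone:

	#include <stdio.h>

	#define __PHYSICAL_MASK_SHIFT	46
	#define __VIRTUAL_MASK_SHIFT	48
	#define __PHYSICAL_MASK	((1ULL << __PHYSICAL_MASK_SHIFT) - 1)
	#define __VIRTUAL_MASK	((1ULL << __VIRTUAL_MASK_SHIFT) - 1)

	int main(void)
	{
		/* 0x3fffffffffff: 46 bits of physical address space */
		printf("physical mask %#llx\n", __PHYSICAL_MASK);
		/* 0xffffffffffff: 48 bits of virtual address space */
		printf("virtual mask  %#llx\n", __VIRTUAL_MASK);
		return 0;
	}
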
diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h
index f59d370c5df4..d6236eb46466 100644
--- a/include/asm-x86/paravirt.h
+++ b/include/asm-x86/paravirt.h
@@ -5,22 +5,37 @@
5 5
6#ifdef CONFIG_PARAVIRT 6#ifdef CONFIG_PARAVIRT
7#include <asm/page.h> 7#include <asm/page.h>
8#include <asm/asm.h>
8 9
9/* Bitmask of what can be clobbered: usually at least eax. */ 10/* Bitmask of what can be clobbered: usually at least eax. */
10#define CLBR_NONE 0x0 11#define CLBR_NONE 0
11#define CLBR_EAX 0x1 12#define CLBR_EAX (1 << 0)
12#define CLBR_ECX 0x2 13#define CLBR_ECX (1 << 1)
13#define CLBR_EDX 0x4 14#define CLBR_EDX (1 << 2)
14#define CLBR_ANY 0x7 15
16#ifdef CONFIG_X86_64
17#define CLBR_RSI (1 << 3)
18#define CLBR_RDI (1 << 4)
19#define CLBR_R8 (1 << 5)
20#define CLBR_R9 (1 << 6)
21#define CLBR_R10 (1 << 7)
22#define CLBR_R11 (1 << 8)
23#define CLBR_ANY ((1 << 9) - 1)
24#include <asm/desc_defs.h>
25#else
26/* CLBR_ANY should match all regs the platform has. For i386, that's just these three */
27#define CLBR_ANY ((1 << 3) - 1)
28#endif /* X86_64 */
15 29
16#ifndef __ASSEMBLY__ 30#ifndef __ASSEMBLY__
17#include <linux/types.h> 31#include <linux/types.h>
18#include <linux/cpumask.h> 32#include <linux/cpumask.h>
19#include <asm/kmap_types.h> 33#include <asm/kmap_types.h>
34#include <asm/desc_defs.h>
20 35
21struct page; 36struct page;
22struct thread_struct; 37struct thread_struct;
23struct Xgt_desc_struct; 38struct desc_ptr;
24struct tss_struct; 39struct tss_struct;
25struct mm_struct; 40struct mm_struct;
26struct desc_struct; 41struct desc_struct;
@@ -86,22 +101,27 @@ struct pv_cpu_ops {
86 unsigned long (*read_cr4)(void); 101 unsigned long (*read_cr4)(void);
87 void (*write_cr4)(unsigned long); 102 void (*write_cr4)(unsigned long);
88 103
104#ifdef CONFIG_X86_64
105 unsigned long (*read_cr8)(void);
106 void (*write_cr8)(unsigned long);
107#endif
108
89 /* Segment descriptor handling */ 109 /* Segment descriptor handling */
90 void (*load_tr_desc)(void); 110 void (*load_tr_desc)(void);
91 void (*load_gdt)(const struct Xgt_desc_struct *); 111 void (*load_gdt)(const struct desc_ptr *);
92 void (*load_idt)(const struct Xgt_desc_struct *); 112 void (*load_idt)(const struct desc_ptr *);
93 void (*store_gdt)(struct Xgt_desc_struct *); 113 void (*store_gdt)(struct desc_ptr *);
94 void (*store_idt)(struct Xgt_desc_struct *); 114 void (*store_idt)(struct desc_ptr *);
95 void (*set_ldt)(const void *desc, unsigned entries); 115 void (*set_ldt)(const void *desc, unsigned entries);
96 unsigned long (*store_tr)(void); 116 unsigned long (*store_tr)(void);
97 void (*load_tls)(struct thread_struct *t, unsigned int cpu); 117 void (*load_tls)(struct thread_struct *t, unsigned int cpu);
98 void (*write_ldt_entry)(struct desc_struct *, 118 void (*write_ldt_entry)(struct desc_struct *ldt, int entrynum,
99 int entrynum, u32 low, u32 high); 119 const void *desc);
100 void (*write_gdt_entry)(struct desc_struct *, 120 void (*write_gdt_entry)(struct desc_struct *,
101 int entrynum, u32 low, u32 high); 121 int entrynum, const void *desc, int size);
102 void (*write_idt_entry)(struct desc_struct *, 122 void (*write_idt_entry)(gate_desc *,
103 int entrynum, u32 low, u32 high); 123 int entrynum, const gate_desc *gate);
104 void (*load_esp0)(struct tss_struct *tss, struct thread_struct *t); 124 void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t);
105 125
106 void (*set_iopl_mask)(unsigned mask); 126 void (*set_iopl_mask)(unsigned mask);
107 127
@@ -115,15 +135,18 @@ struct pv_cpu_ops {
115 /* MSR, PMC and TSR operations. 135 /* MSR, PMC and TSR operations.
116 err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ 136 err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */
117 u64 (*read_msr)(unsigned int msr, int *err); 137 u64 (*read_msr)(unsigned int msr, int *err);
118 int (*write_msr)(unsigned int msr, u64 val); 138 int (*write_msr)(unsigned int msr, unsigned low, unsigned high);
119 139
120 u64 (*read_tsc)(void); 140 u64 (*read_tsc)(void);
121 u64 (*read_pmc)(void); 141 u64 (*read_pmc)(int counter);
142 unsigned long long (*read_tscp)(unsigned int *aux);
122 143
123 /* These two are jmp to, not actually called. */ 144 /* These two are jmp to, not actually called. */
124 void (*irq_enable_sysexit)(void); 145 void (*irq_enable_syscall_ret)(void);
125 void (*iret)(void); 146 void (*iret)(void);
126 147
148 void (*swapgs)(void);
149
127 struct pv_lazy_ops lazy_mode; 150 struct pv_lazy_ops lazy_mode;
128}; 151};
129 152
@@ -150,9 +173,9 @@ struct pv_apic_ops {
150 * Direct APIC operations, principally for VMI. Ideally 173 * Direct APIC operations, principally for VMI. Ideally
151 * these shouldn't be in this interface. 174 * these shouldn't be in this interface.
152 */ 175 */
153 void (*apic_write)(unsigned long reg, unsigned long v); 176 void (*apic_write)(unsigned long reg, u32 v);
154 void (*apic_write_atomic)(unsigned long reg, unsigned long v); 177 void (*apic_write_atomic)(unsigned long reg, u32 v);
155 unsigned long (*apic_read)(unsigned long reg); 178 u32 (*apic_read)(unsigned long reg);
156 void (*setup_boot_clock)(void); 179 void (*setup_boot_clock)(void);
157 void (*setup_secondary_clock)(void); 180 void (*setup_secondary_clock)(void);
158 181
@@ -198,7 +221,7 @@ struct pv_mmu_ops {
198 221
199 /* Hooks for allocating/releasing pagetable pages */ 222 /* Hooks for allocating/releasing pagetable pages */
200 void (*alloc_pt)(struct mm_struct *mm, u32 pfn); 223 void (*alloc_pt)(struct mm_struct *mm, u32 pfn);
201 void (*alloc_pd)(u32 pfn); 224 void (*alloc_pd)(struct mm_struct *mm, u32 pfn);
202 void (*alloc_pd_clone)(u32 pfn, u32 clonepfn, u32 start, u32 count); 225 void (*alloc_pd_clone)(u32 pfn, u32 clonepfn, u32 start, u32 count);
203 void (*release_pt)(u32 pfn); 226 void (*release_pt)(u32 pfn);
204 void (*release_pd)(u32 pfn); 227 void (*release_pd)(u32 pfn);
@@ -212,28 +235,34 @@ struct pv_mmu_ops {
212 void (*pte_update_defer)(struct mm_struct *mm, 235 void (*pte_update_defer)(struct mm_struct *mm,
213 unsigned long addr, pte_t *ptep); 236 unsigned long addr, pte_t *ptep);
214 237
238 pteval_t (*pte_val)(pte_t);
239 pte_t (*make_pte)(pteval_t pte);
240
241 pgdval_t (*pgd_val)(pgd_t);
242 pgd_t (*make_pgd)(pgdval_t pgd);
243
244#if PAGETABLE_LEVELS >= 3
215#ifdef CONFIG_X86_PAE 245#ifdef CONFIG_X86_PAE
216 void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); 246 void (*set_pte_atomic)(pte_t *ptep, pte_t pteval);
217 void (*set_pte_present)(struct mm_struct *mm, unsigned long addr, 247 void (*set_pte_present)(struct mm_struct *mm, unsigned long addr,
218 pte_t *ptep, pte_t pte); 248 pte_t *ptep, pte_t pte);
219 void (*set_pud)(pud_t *pudp, pud_t pudval);
220 void (*pte_clear)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 249 void (*pte_clear)(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
221 void (*pmd_clear)(pmd_t *pmdp); 250 void (*pmd_clear)(pmd_t *pmdp);
222 251
223 unsigned long long (*pte_val)(pte_t); 252#endif /* CONFIG_X86_PAE */
224 unsigned long long (*pmd_val)(pmd_t);
225 unsigned long long (*pgd_val)(pgd_t);
226 253
227 pte_t (*make_pte)(unsigned long long pte); 254 void (*set_pud)(pud_t *pudp, pud_t pudval);
228 pmd_t (*make_pmd)(unsigned long long pmd);
229 pgd_t (*make_pgd)(unsigned long long pgd);
230#else
231 unsigned long (*pte_val)(pte_t);
232 unsigned long (*pgd_val)(pgd_t);
233 255
234 pte_t (*make_pte)(unsigned long pte); 256 pmdval_t (*pmd_val)(pmd_t);
235 pgd_t (*make_pgd)(unsigned long pgd); 257 pmd_t (*make_pmd)(pmdval_t pmd);
236#endif 258
259#if PAGETABLE_LEVELS == 4
260 pudval_t (*pud_val)(pud_t);
261 pud_t (*make_pud)(pudval_t pud);
262
263 void (*set_pgd)(pgd_t *pudp, pgd_t pgdval);
264#endif /* PAGETABLE_LEVELS == 4 */
265#endif /* PAGETABLE_LEVELS >= 3 */
237 266
238#ifdef CONFIG_HIGHPTE 267#ifdef CONFIG_HIGHPTE
239 void *(*kmap_atomic_pte)(struct page *page, enum km_type type); 268 void *(*kmap_atomic_pte)(struct page *page, enum km_type type);
@@ -279,7 +308,8 @@ extern struct pv_mmu_ops pv_mmu_ops;
279#define _paravirt_alt(insn_string, type, clobber) \ 308#define _paravirt_alt(insn_string, type, clobber) \
280 "771:\n\t" insn_string "\n" "772:\n" \ 309 "771:\n\t" insn_string "\n" "772:\n" \
281 ".pushsection .parainstructions,\"a\"\n" \ 310 ".pushsection .parainstructions,\"a\"\n" \
282 " .long 771b\n" \ 311 _ASM_ALIGN "\n" \
312 _ASM_PTR " 771b\n" \
283 " .byte " type "\n" \ 313 " .byte " type "\n" \
284 " .byte 772b-771b\n" \ 314 " .byte 772b-771b\n" \
285 " .short " clobber "\n" \ 315 " .short " clobber "\n" \
@@ -289,6 +319,11 @@ extern struct pv_mmu_ops pv_mmu_ops;
289#define paravirt_alt(insn_string) \ 319#define paravirt_alt(insn_string) \
290 _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]") 320 _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]")
291 321
322/* Simple instruction patching code. */
323#define DEF_NATIVE(ops, name, code) \
324 extern const char start_##ops##_##name[], end_##ops##_##name[]; \
325 asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
326
292unsigned paravirt_patch_nop(void); 327unsigned paravirt_patch_nop(void);
293unsigned paravirt_patch_ignore(unsigned len); 328unsigned paravirt_patch_ignore(unsigned len);
294unsigned paravirt_patch_call(void *insnbuf, 329unsigned paravirt_patch_call(void *insnbuf,
@@ -303,6 +338,9 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
303unsigned paravirt_patch_insns(void *insnbuf, unsigned len, 338unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
304 const char *start, const char *end); 339 const char *start, const char *end);
305 340
341unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
342 unsigned long addr, unsigned len);
343
306int paravirt_disable_iospace(void); 344int paravirt_disable_iospace(void);
307 345
308/* 346/*
@@ -319,7 +357,7 @@ int paravirt_disable_iospace(void);
319 * runtime. 357 * runtime.
320 * 358 *
321 * Normally, a call to a pv_op function is a simple indirect call: 359 * Normally, a call to a pv_op function is a simple indirect call:
322 * (paravirt_ops.operations)(args...). 360 * (pv_op_struct.operations)(args...).
323 * 361 *
324 * Unfortunately, this is a relatively slow operation for modern CPUs, 362 * Unfortunately, this is a relatively slow operation for modern CPUs,
325 * because it cannot necessarily determine what the destination 363 * because it cannot necessarily determine what the destination
@@ -329,11 +367,17 @@ int paravirt_disable_iospace(void);
329 * calls are essentially free, because the call and return addresses 367 * calls are essentially free, because the call and return addresses
330 * are completely predictable.) 368 * are completely predictable.)
331 * 369 *
332 * These macros rely on the standard gcc "regparm(3)" calling 370 * For i386, these macros rely on the standard gcc "regparm(3)" calling
333 * convention, in which the first three arguments are placed in %eax, 371 * convention, in which the first three arguments are placed in %eax,
334 * %edx, %ecx (in that order), and the remaining arguments are placed 372 * %edx, %ecx (in that order), and the remaining arguments are placed
335 * on the stack. All caller-save registers (eax,edx,ecx) are expected 373 * on the stack. All caller-save registers (eax,edx,ecx) are expected
336 * to be modified (either clobbered or used for return values). 374 * to be modified (either clobbered or used for return values).
375 * X86_64, on the other hand, already specifies a register-based calling
376 * convention, returning in %rax, with parameters going in %rdi, %rsi,
377 * %rdx, and %rcx. Note that for this reason, x86_64 does not need any
378 * special handling for dealing with 4 arguments, unlike i386.
379 * However, x86_64 also has to clobber all caller-saved registers, which,
380 * unfortunately, are quite a few (r8 - r11)
337 * 381 *
338 * The call instruction itself is marked by placing its start address 382 * The call instruction itself is marked by placing its start address
339 * and size into the .parainstructions section, so that 383 * and size into the .parainstructions section, so that
@@ -356,10 +400,12 @@ int paravirt_disable_iospace(void);
356 * the return type. The macro then uses sizeof() on that type to 400 * the return type. The macro then uses sizeof() on that type to
357 * determine whether it's a 32 or 64 bit value, and places the return 401 * determine whether it's a 32 or 64 bit value, and places the return
358 * in the right register(s) (just %eax for 32-bit, and %edx:%eax for 402 * in the right register(s) (just %eax for 32-bit, and %edx:%eax for
359 * 64-bit). 403 * 64-bit). For x86_64 machines, it just returns in %rax regardless of
404 * the return value size.
360 * 405 *
361 * 64-bit arguments are passed as a pair of adjacent 32-bit arguments 406 * 64-bit arguments are passed as a pair of adjacent 32-bit arguments
362 * in low,high order. 407 * i386 also passes 64-bit arguments as a pair of adjacent 32-bit arguments
408 * in low,high order
363 * 409 *
364 * Small structures are passed and returned in registers. The macro 410 * Small structures are passed and returned in registers. The macro
365 * calling convention can't directly deal with this, so the wrapper 411 * calling convention can't directly deal with this, so the wrapper
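
Stripped of the patching and register-constraint machinery described above, a pv_op call is just an indirect call through a struct of function pointers; the PVOP_* macros below wrap exactly that. A much-simplified model (hypothetical names and value, not the real pv_cpu_ops):

	#include <stdio.h>

	/* One operation, dispatched through a function pointer the same way
	 * pv_cpu_ops.read_cr0 would be before any patching happens. */
	struct cpu_ops_model {
		unsigned long (*read_cr0)(void);
	};

	static unsigned long native_read_cr0_model(void)
	{
		return 0x80050033UL;	/* made-up value standing in for %cr0 */
	}

	static struct cpu_ops_model ops = { .read_cr0 = native_read_cr0_model };

	int main(void)
	{
		printf("cr0 = %#lx\n", ops.read_cr0());	/* indirect call */
		return 0;
	}
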
@@ -369,46 +415,67 @@ int paravirt_disable_iospace(void);
369 * means that all uses must be wrapped in inline functions. This also 415 * means that all uses must be wrapped in inline functions. This also
370 * makes sure the incoming and outgoing types are always correct. 416 * makes sure the incoming and outgoing types are always correct.
371 */ 417 */
418#ifdef CONFIG_X86_32
419#define PVOP_VCALL_ARGS unsigned long __eax, __edx, __ecx
420#define PVOP_CALL_ARGS PVOP_VCALL_ARGS
421#define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \
422 "=c" (__ecx)
423#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS
424#define EXTRA_CLOBBERS
425#define VEXTRA_CLOBBERS
426#else
427#define PVOP_VCALL_ARGS unsigned long __edi, __esi, __edx, __ecx
428#define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax
429#define PVOP_VCALL_CLOBBERS "=D" (__edi), \
430 "=S" (__esi), "=d" (__edx), \
431 "=c" (__ecx)
432
433#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax)
434
435#define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11"
436#define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11"
437#endif
438
372#define __PVOP_CALL(rettype, op, pre, post, ...) \ 439#define __PVOP_CALL(rettype, op, pre, post, ...) \
373 ({ \ 440 ({ \
374 rettype __ret; \ 441 rettype __ret; \
375 unsigned long __eax, __edx, __ecx; \ 442 PVOP_CALL_ARGS; \
443 /* This is 32-bit specific, but is okay in 64-bit */ \
444 /* since this condition will never hold */ \
376 if (sizeof(rettype) > sizeof(unsigned long)) { \ 445 if (sizeof(rettype) > sizeof(unsigned long)) { \
377 asm volatile(pre \ 446 asm volatile(pre \
378 paravirt_alt(PARAVIRT_CALL) \ 447 paravirt_alt(PARAVIRT_CALL) \
379 post \ 448 post \
380 : "=a" (__eax), "=d" (__edx), \ 449 : PVOP_CALL_CLOBBERS \
381 "=c" (__ecx) \
382 : paravirt_type(op), \ 450 : paravirt_type(op), \
383 paravirt_clobber(CLBR_ANY), \ 451 paravirt_clobber(CLBR_ANY), \
384 ##__VA_ARGS__ \ 452 ##__VA_ARGS__ \
385 : "memory", "cc"); \ 453 : "memory", "cc" EXTRA_CLOBBERS); \
386 __ret = (rettype)((((u64)__edx) << 32) | __eax); \ 454 __ret = (rettype)((((u64)__edx) << 32) | __eax); \
387 } else { \ 455 } else { \
388 asm volatile(pre \ 456 asm volatile(pre \
389 paravirt_alt(PARAVIRT_CALL) \ 457 paravirt_alt(PARAVIRT_CALL) \
390 post \ 458 post \
391 : "=a" (__eax), "=d" (__edx), \ 459 : PVOP_CALL_CLOBBERS \
392 "=c" (__ecx) \
393 : paravirt_type(op), \ 460 : paravirt_type(op), \
394 paravirt_clobber(CLBR_ANY), \ 461 paravirt_clobber(CLBR_ANY), \
395 ##__VA_ARGS__ \ 462 ##__VA_ARGS__ \
396 : "memory", "cc"); \ 463 : "memory", "cc" EXTRA_CLOBBERS); \
397 __ret = (rettype)__eax; \ 464 __ret = (rettype)__eax; \
398 } \ 465 } \
399 __ret; \ 466 __ret; \
400 }) 467 })
401#define __PVOP_VCALL(op, pre, post, ...) \ 468#define __PVOP_VCALL(op, pre, post, ...) \
402 ({ \ 469 ({ \
403 unsigned long __eax, __edx, __ecx; \ 470 PVOP_VCALL_ARGS; \
404 asm volatile(pre \ 471 asm volatile(pre \
405 paravirt_alt(PARAVIRT_CALL) \ 472 paravirt_alt(PARAVIRT_CALL) \
406 post \ 473 post \
407 : "=a" (__eax), "=d" (__edx), "=c" (__ecx) \ 474 : PVOP_VCALL_CLOBBERS \
408 : paravirt_type(op), \ 475 : paravirt_type(op), \
409 paravirt_clobber(CLBR_ANY), \ 476 paravirt_clobber(CLBR_ANY), \
410 ##__VA_ARGS__ \ 477 ##__VA_ARGS__ \
411 : "memory", "cc"); \ 478 : "memory", "cc" VEXTRA_CLOBBERS); \
412 }) 479 })
413 480
414#define PVOP_CALL0(rettype, op) \ 481#define PVOP_CALL0(rettype, op) \
@@ -417,22 +484,26 @@ int paravirt_disable_iospace(void);
417 __PVOP_VCALL(op, "", "") 484 __PVOP_VCALL(op, "", "")
418 485
419#define PVOP_CALL1(rettype, op, arg1) \ 486#define PVOP_CALL1(rettype, op, arg1) \
420 __PVOP_CALL(rettype, op, "", "", "0" ((u32)(arg1))) 487 __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)))
421#define PVOP_VCALL1(op, arg1) \ 488#define PVOP_VCALL1(op, arg1) \
422 __PVOP_VCALL(op, "", "", "0" ((u32)(arg1))) 489 __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)))
423 490
424#define PVOP_CALL2(rettype, op, arg1, arg2) \ 491#define PVOP_CALL2(rettype, op, arg1, arg2) \
425 __PVOP_CALL(rettype, op, "", "", "0" ((u32)(arg1)), "1" ((u32)(arg2))) 492 __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \
493 "1" ((unsigned long)(arg2)))
426#define PVOP_VCALL2(op, arg1, arg2) \ 494#define PVOP_VCALL2(op, arg1, arg2) \
427 __PVOP_VCALL(op, "", "", "0" ((u32)(arg1)), "1" ((u32)(arg2))) 495 __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \
496 "1" ((unsigned long)(arg2)))
428 497
429#define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \ 498#define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \
430 __PVOP_CALL(rettype, op, "", "", "0" ((u32)(arg1)), \ 499 __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \
431 "1"((u32)(arg2)), "2"((u32)(arg3))) 500 "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)))
432#define PVOP_VCALL3(op, arg1, arg2, arg3) \ 501#define PVOP_VCALL3(op, arg1, arg2, arg3) \
433 __PVOP_VCALL(op, "", "", "0" ((u32)(arg1)), "1"((u32)(arg2)), \ 502 __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \
434 "2"((u32)(arg3))) 503 "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)))
435 504
505/* This is the only difference in x86_64. We can make it much simpler */
506#ifdef CONFIG_X86_32
436#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ 507#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \
437 __PVOP_CALL(rettype, op, \ 508 __PVOP_CALL(rettype, op, \
438 "push %[_arg4];", "lea 4(%%esp),%%esp;", \ 509 "push %[_arg4];", "lea 4(%%esp),%%esp;", \
@@ -443,16 +514,26 @@ int paravirt_disable_iospace(void);
443 "push %[_arg4];", "lea 4(%%esp),%%esp;", \ 514 "push %[_arg4];", "lea 4(%%esp),%%esp;", \
444 "0" ((u32)(arg1)), "1" ((u32)(arg2)), \ 515 "0" ((u32)(arg1)), "1" ((u32)(arg2)), \
445 "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4))) 516 "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
517#else
518#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \
519 __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \
520 "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)), \
521 "3"((unsigned long)(arg4)))
522#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \
523 __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \
524 "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)), \
525 "3"((unsigned long)(arg4)))
526#endif
446 527
447static inline int paravirt_enabled(void) 528static inline int paravirt_enabled(void)
448{ 529{
449 return pv_info.paravirt_enabled; 530 return pv_info.paravirt_enabled;
450} 531}
451 532
452static inline void load_esp0(struct tss_struct *tss, 533static inline void load_sp0(struct tss_struct *tss,
453 struct thread_struct *thread) 534 struct thread_struct *thread)
454{ 535{
455 PVOP_VCALL2(pv_cpu_ops.load_esp0, tss, thread); 536 PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread);
456} 537}
457 538
458#define ARCH_SETUP pv_init_ops.arch_setup(); 539#define ARCH_SETUP pv_init_ops.arch_setup();
@@ -540,6 +621,18 @@ static inline void write_cr4(unsigned long x)
540 PVOP_VCALL1(pv_cpu_ops.write_cr4, x); 621 PVOP_VCALL1(pv_cpu_ops.write_cr4, x);
541} 622}
542 623
624#ifdef CONFIG_X86_64
625static inline unsigned long read_cr8(void)
626{
627 return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr8);
628}
629
630static inline void write_cr8(unsigned long x)
631{
632 PVOP_VCALL1(pv_cpu_ops.write_cr8, x);
633}
634#endif
635
543static inline void raw_safe_halt(void) 636static inline void raw_safe_halt(void)
544{ 637{
545 PVOP_VCALL0(pv_irq_ops.safe_halt); 638 PVOP_VCALL0(pv_irq_ops.safe_halt);
@@ -613,8 +706,6 @@ static inline unsigned long long paravirt_sched_clock(void)
613} 706}
614#define calculate_cpu_khz() (pv_time_ops.get_cpu_khz()) 707#define calculate_cpu_khz() (pv_time_ops.get_cpu_khz())
615 708
616#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
617
618static inline unsigned long long paravirt_read_pmc(int counter) 709static inline unsigned long long paravirt_read_pmc(int counter)
619{ 710{
620 return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter); 711 return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter);
@@ -626,15 +717,36 @@ static inline unsigned long long paravirt_read_pmc(int counter)
626 high = _l >> 32; \ 717 high = _l >> 32; \
627} while(0) 718} while(0)
628 719
720static inline unsigned long long paravirt_rdtscp(unsigned int *aux)
721{
722 return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux);
723}
724
725#define rdtscp(low, high, aux) \
726do { \
727 int __aux; \
728 unsigned long __val = paravirt_rdtscp(&__aux); \
729 (low) = (u32)__val; \
730 (high) = (u32)(__val >> 32); \
731 (aux) = __aux; \
732} while (0)
733
734#define rdtscpll(val, aux) \
735do { \
736 unsigned long __aux; \
737 val = paravirt_rdtscp(&__aux); \
738 (aux) = __aux; \
739} while (0)
740
629static inline void load_TR_desc(void) 741static inline void load_TR_desc(void)
630{ 742{
631 PVOP_VCALL0(pv_cpu_ops.load_tr_desc); 743 PVOP_VCALL0(pv_cpu_ops.load_tr_desc);
632} 744}
633static inline void load_gdt(const struct Xgt_desc_struct *dtr) 745static inline void load_gdt(const struct desc_ptr *dtr)
634{ 746{
635 PVOP_VCALL1(pv_cpu_ops.load_gdt, dtr); 747 PVOP_VCALL1(pv_cpu_ops.load_gdt, dtr);
636} 748}
637static inline void load_idt(const struct Xgt_desc_struct *dtr) 749static inline void load_idt(const struct desc_ptr *dtr)
638{ 750{
639 PVOP_VCALL1(pv_cpu_ops.load_idt, dtr); 751 PVOP_VCALL1(pv_cpu_ops.load_idt, dtr);
640} 752}
@@ -642,11 +754,11 @@ static inline void set_ldt(const void *addr, unsigned entries)
642{ 754{
643 PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries); 755 PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries);
644} 756}
645static inline void store_gdt(struct Xgt_desc_struct *dtr) 757static inline void store_gdt(struct desc_ptr *dtr)
646{ 758{
647 PVOP_VCALL1(pv_cpu_ops.store_gdt, dtr); 759 PVOP_VCALL1(pv_cpu_ops.store_gdt, dtr);
648} 760}
649static inline void store_idt(struct Xgt_desc_struct *dtr) 761static inline void store_idt(struct desc_ptr *dtr)
650{ 762{
651 PVOP_VCALL1(pv_cpu_ops.store_idt, dtr); 763 PVOP_VCALL1(pv_cpu_ops.store_idt, dtr);
652} 764}
@@ -659,17 +771,22 @@ static inline void load_TLS(struct thread_struct *t, unsigned cpu)
659{ 771{
660 PVOP_VCALL2(pv_cpu_ops.load_tls, t, cpu); 772 PVOP_VCALL2(pv_cpu_ops.load_tls, t, cpu);
661} 773}
662static inline void write_ldt_entry(void *dt, int entry, u32 low, u32 high) 774
775static inline void write_ldt_entry(struct desc_struct *dt, int entry,
776 const void *desc)
663{ 777{
664 PVOP_VCALL4(pv_cpu_ops.write_ldt_entry, dt, entry, low, high); 778 PVOP_VCALL3(pv_cpu_ops.write_ldt_entry, dt, entry, desc);
665} 779}
666static inline void write_gdt_entry(void *dt, int entry, u32 low, u32 high) 780
781static inline void write_gdt_entry(struct desc_struct *dt, int entry,
782 void *desc, int type)
667{ 783{
668 PVOP_VCALL4(pv_cpu_ops.write_gdt_entry, dt, entry, low, high); 784 PVOP_VCALL4(pv_cpu_ops.write_gdt_entry, dt, entry, desc, type);
669} 785}
670static inline void write_idt_entry(void *dt, int entry, u32 low, u32 high) 786
787static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
671{ 788{
672 PVOP_VCALL4(pv_cpu_ops.write_idt_entry, dt, entry, low, high); 789 PVOP_VCALL3(pv_cpu_ops.write_idt_entry, dt, entry, g);
673} 790}
674static inline void set_iopl_mask(unsigned mask) 791static inline void set_iopl_mask(unsigned mask)
675{ 792{
@@ -690,17 +807,17 @@ static inline void slow_down_io(void) {
690/* 807/*
691 * Basic functions accessing APICs. 808 * Basic functions accessing APICs.
692 */ 809 */
693static inline void apic_write(unsigned long reg, unsigned long v) 810static inline void apic_write(unsigned long reg, u32 v)
694{ 811{
695 PVOP_VCALL2(pv_apic_ops.apic_write, reg, v); 812 PVOP_VCALL2(pv_apic_ops.apic_write, reg, v);
696} 813}
697 814
698static inline void apic_write_atomic(unsigned long reg, unsigned long v) 815static inline void apic_write_atomic(unsigned long reg, u32 v)
699{ 816{
700 PVOP_VCALL2(pv_apic_ops.apic_write_atomic, reg, v); 817 PVOP_VCALL2(pv_apic_ops.apic_write_atomic, reg, v);
701} 818}
702 819
703static inline unsigned long apic_read(unsigned long reg) 820static inline u32 apic_read(unsigned long reg)
704{ 821{
705 return PVOP_CALL1(unsigned long, pv_apic_ops.apic_read, reg); 822 return PVOP_CALL1(unsigned long, pv_apic_ops.apic_read, reg);
706} 823}
@@ -786,9 +903,9 @@ static inline void paravirt_release_pt(unsigned pfn)
786 PVOP_VCALL1(pv_mmu_ops.release_pt, pfn); 903 PVOP_VCALL1(pv_mmu_ops.release_pt, pfn);
787} 904}
788 905
789static inline void paravirt_alloc_pd(unsigned pfn) 906static inline void paravirt_alloc_pd(struct mm_struct *mm, unsigned pfn)
790{ 907{
791 PVOP_VCALL1(pv_mmu_ops.alloc_pd, pfn); 908 PVOP_VCALL2(pv_mmu_ops.alloc_pd, mm, pfn);
792} 909}
793 910
794static inline void paravirt_alloc_pd_clone(unsigned pfn, unsigned clonepfn, 911static inline void paravirt_alloc_pd_clone(unsigned pfn, unsigned clonepfn,
@@ -822,128 +939,236 @@ static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
822 PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep); 939 PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep);
823} 940}
824 941
825#ifdef CONFIG_X86_PAE 942static inline pte_t __pte(pteval_t val)
826static inline pte_t __pte(unsigned long long val)
827{ 943{
828 unsigned long long ret = PVOP_CALL2(unsigned long long, 944 pteval_t ret;
829 pv_mmu_ops.make_pte, 945
830 val, val >> 32); 946 if (sizeof(pteval_t) > sizeof(long))
831 return (pte_t) { ret, ret >> 32 }; 947 ret = PVOP_CALL2(pteval_t,
948 pv_mmu_ops.make_pte,
949 val, (u64)val >> 32);
950 else
951 ret = PVOP_CALL1(pteval_t,
952 pv_mmu_ops.make_pte,
953 val);
954
955 return (pte_t) { .pte = ret };
832} 956}
833 957
834static inline pmd_t __pmd(unsigned long long val) 958static inline pteval_t pte_val(pte_t pte)
835{ 959{
836 return (pmd_t) { PVOP_CALL2(unsigned long long, pv_mmu_ops.make_pmd, 960 pteval_t ret;
837 val, val >> 32) }; 961
962 if (sizeof(pteval_t) > sizeof(long))
963 ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_val,
964 pte.pte, (u64)pte.pte >> 32);
965 else
966 ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_val,
967 pte.pte);
968
969 return ret;
838} 970}
839 971
840static inline pgd_t __pgd(unsigned long long val) 972static inline pgd_t __pgd(pgdval_t val)
841{ 973{
842 return (pgd_t) { PVOP_CALL2(unsigned long long, pv_mmu_ops.make_pgd, 974 pgdval_t ret;
843 val, val >> 32) }; 975
976 if (sizeof(pgdval_t) > sizeof(long))
977 ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.make_pgd,
978 val, (u64)val >> 32);
979 else
980 ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.make_pgd,
981 val);
982
983 return (pgd_t) { ret };
844} 984}
845 985
846static inline unsigned long long pte_val(pte_t x) 986static inline pgdval_t pgd_val(pgd_t pgd)
847{ 987{
848 return PVOP_CALL2(unsigned long long, pv_mmu_ops.pte_val, 988 pgdval_t ret;
849 x.pte_low, x.pte_high); 989
990 if (sizeof(pgdval_t) > sizeof(long))
991 ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.pgd_val,
992 pgd.pgd, (u64)pgd.pgd >> 32);
993 else
994 ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.pgd_val,
995 pgd.pgd);
996
997 return ret;
850} 998}
851 999
852static inline unsigned long long pmd_val(pmd_t x) 1000static inline void set_pte(pte_t *ptep, pte_t pte)
853{ 1001{
854 return PVOP_CALL2(unsigned long long, pv_mmu_ops.pmd_val, 1002 if (sizeof(pteval_t) > sizeof(long))
855 x.pmd, x.pmd >> 32); 1003 PVOP_VCALL3(pv_mmu_ops.set_pte, ptep,
1004 pte.pte, (u64)pte.pte >> 32);
1005 else
1006 PVOP_VCALL2(pv_mmu_ops.set_pte, ptep,
1007 pte.pte);
856} 1008}
857 1009
858static inline unsigned long long pgd_val(pgd_t x) 1010static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
1011 pte_t *ptep, pte_t pte)
859{ 1012{
860 return PVOP_CALL2(unsigned long long, pv_mmu_ops.pgd_val, 1013 if (sizeof(pteval_t) > sizeof(long))
861 x.pgd, x.pgd >> 32); 1014 /* 5 arg words */
1015 pv_mmu_ops.set_pte_at(mm, addr, ptep, pte);
1016 else
1017 PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte);
862} 1018}
863 1019
864static inline void set_pte(pte_t *ptep, pte_t pteval) 1020static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
865{ 1021{
866 PVOP_VCALL3(pv_mmu_ops.set_pte, ptep, pteval.pte_low, pteval.pte_high); 1022 pmdval_t val = native_pmd_val(pmd);
1023
1024 if (sizeof(pmdval_t) > sizeof(long))
1025 PVOP_VCALL3(pv_mmu_ops.set_pmd, pmdp, val, (u64)val >> 32);
1026 else
1027 PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, val);
867} 1028}
868 1029
869static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, 1030#if PAGETABLE_LEVELS >= 3
870 pte_t *ptep, pte_t pteval) 1031static inline pmd_t __pmd(pmdval_t val)
871{ 1032{
872 /* 5 arg words */ 1033 pmdval_t ret;
873 pv_mmu_ops.set_pte_at(mm, addr, ptep, pteval); 1034
1035 if (sizeof(pmdval_t) > sizeof(long))
1036 ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.make_pmd,
1037 val, (u64)val >> 32);
1038 else
1039 ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.make_pmd,
1040 val);
1041
1042 return (pmd_t) { ret };
874} 1043}
875 1044
876static inline void set_pte_atomic(pte_t *ptep, pte_t pteval) 1045static inline pmdval_t pmd_val(pmd_t pmd)
877{ 1046{
878 PVOP_VCALL3(pv_mmu_ops.set_pte_atomic, ptep, 1047 pmdval_t ret;
879 pteval.pte_low, pteval.pte_high); 1048
1049 if (sizeof(pmdval_t) > sizeof(long))
1050 ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.pmd_val,
1051 pmd.pmd, (u64)pmd.pmd >> 32);
1052 else
1053 ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.pmd_val,
1054 pmd.pmd);
1055
1056 return ret;
880} 1057}
881 1058
882static inline void set_pte_present(struct mm_struct *mm, unsigned long addr, 1059static inline void set_pud(pud_t *pudp, pud_t pud)
883 pte_t *ptep, pte_t pte)
884{ 1060{
885 /* 5 arg words */ 1061 pudval_t val = native_pud_val(pud);
886 pv_mmu_ops.set_pte_present(mm, addr, ptep, pte); 1062
1063 if (sizeof(pudval_t) > sizeof(long))
1064 PVOP_VCALL3(pv_mmu_ops.set_pud, pudp,
1065 val, (u64)val >> 32);
1066 else
1067 PVOP_VCALL2(pv_mmu_ops.set_pud, pudp,
1068 val);
1069}
1070#if PAGETABLE_LEVELS == 4
1071static inline pud_t __pud(pudval_t val)
1072{
1073 pudval_t ret;
1074
1075 if (sizeof(pudval_t) > sizeof(long))
1076 ret = PVOP_CALL2(pudval_t, pv_mmu_ops.make_pud,
1077 val, (u64)val >> 32);
1078 else
1079 ret = PVOP_CALL1(pudval_t, pv_mmu_ops.make_pud,
1080 val);
1081
1082 return (pud_t) { ret };
887} 1083}
888 1084
889static inline void set_pmd(pmd_t *pmdp, pmd_t pmdval) 1085static inline pudval_t pud_val(pud_t pud)
890{ 1086{
891 PVOP_VCALL3(pv_mmu_ops.set_pmd, pmdp, 1087 pudval_t ret;
892 pmdval.pmd, pmdval.pmd >> 32); 1088
1089 if (sizeof(pudval_t) > sizeof(long))
1090 ret = PVOP_CALL2(pudval_t, pv_mmu_ops.pud_val,
1091 pud.pud, (u64)pud.pud >> 32);
1092 else
1093 ret = PVOP_CALL1(pudval_t, pv_mmu_ops.pud_val,
1094 pud.pud);
1095
1096 return ret;
893} 1097}
894 1098
895static inline void set_pud(pud_t *pudp, pud_t pudval) 1099static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
896{ 1100{
897 PVOP_VCALL3(pv_mmu_ops.set_pud, pudp, 1101 pgdval_t val = native_pgd_val(pgd);
898 pudval.pgd.pgd, pudval.pgd.pgd >> 32); 1102
1103 if (sizeof(pgdval_t) > sizeof(long))
1104 PVOP_VCALL3(pv_mmu_ops.set_pgd, pgdp,
1105 val, (u64)val >> 32);
1106 else
1107 PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp,
1108 val);
899} 1109}
900 1110
901static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 1111static inline void pgd_clear(pgd_t *pgdp)
902{ 1112{
903 PVOP_VCALL3(pv_mmu_ops.pte_clear, mm, addr, ptep); 1113 set_pgd(pgdp, __pgd(0));
904} 1114}
905 1115
906static inline void pmd_clear(pmd_t *pmdp) 1116static inline void pud_clear(pud_t *pudp)
907{ 1117{
908 PVOP_VCALL1(pv_mmu_ops.pmd_clear, pmdp); 1118 set_pud(pudp, __pud(0));
909} 1119}
910 1120
911#else /* !CONFIG_X86_PAE */ 1121#endif /* PAGETABLE_LEVELS == 4 */
912 1122
913static inline pte_t __pte(unsigned long val) 1123#endif /* PAGETABLE_LEVELS >= 3 */
1124
1125#ifdef CONFIG_X86_PAE
1126/* Special-case pte-setting operations for PAE, which can't update a
1127 64-bit pte atomically */
1128static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
914{ 1129{
915 return (pte_t) { PVOP_CALL1(unsigned long, pv_mmu_ops.make_pte, val) }; 1130 PVOP_VCALL3(pv_mmu_ops.set_pte_atomic, ptep,
1131 pte.pte, pte.pte >> 32);
916} 1132}
917 1133
918static inline pgd_t __pgd(unsigned long val) 1134static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
1135 pte_t *ptep, pte_t pte)
919{ 1136{
920 return (pgd_t) { PVOP_CALL1(unsigned long, pv_mmu_ops.make_pgd, val) }; 1137 /* 5 arg words */
1138 pv_mmu_ops.set_pte_present(mm, addr, ptep, pte);
921} 1139}
922 1140
923static inline unsigned long pte_val(pte_t x) 1141static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
1142 pte_t *ptep)
924{ 1143{
925 return PVOP_CALL1(unsigned long, pv_mmu_ops.pte_val, x.pte_low); 1144 PVOP_VCALL3(pv_mmu_ops.pte_clear, mm, addr, ptep);
926} 1145}
927 1146
928static inline unsigned long pgd_val(pgd_t x) 1147static inline void pmd_clear(pmd_t *pmdp)
1148{
1149 PVOP_VCALL1(pv_mmu_ops.pmd_clear, pmdp);
1150}
1151#else /* !CONFIG_X86_PAE */
1152static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
929{ 1153{
930 return PVOP_CALL1(unsigned long, pv_mmu_ops.pgd_val, x.pgd); 1154 set_pte(ptep, pte);
931} 1155}
932 1156
933static inline void set_pte(pte_t *ptep, pte_t pteval) 1157static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
1158 pte_t *ptep, pte_t pte)
934{ 1159{
935 PVOP_VCALL2(pv_mmu_ops.set_pte, ptep, pteval.pte_low); 1160 set_pte(ptep, pte);
936} 1161}
937 1162
938static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, 1163static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
939 pte_t *ptep, pte_t pteval) 1164 pte_t *ptep)
940{ 1165{
941 PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pteval.pte_low); 1166 set_pte_at(mm, addr, ptep, __pte(0));
942} 1167}
943 1168
944static inline void set_pmd(pmd_t *pmdp, pmd_t pmdval) 1169static inline void pmd_clear(pmd_t *pmdp)
945{ 1170{
946 PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, pmdval.pud.pgd.pgd); 1171 set_pmd(pmdp, __pmd(0));
947} 1172}
948#endif /* CONFIG_X86_PAE */ 1173#endif /* CONFIG_X86_PAE */
949 1174
@@ -1014,52 +1239,68 @@ struct paravirt_patch_site {
1014extern struct paravirt_patch_site __parainstructions[], 1239extern struct paravirt_patch_site __parainstructions[],
1015 __parainstructions_end[]; 1240 __parainstructions_end[];
1016 1241
1242#ifdef CONFIG_X86_32
1243#define PV_SAVE_REGS "pushl %%ecx; pushl %%edx;"
1244#define PV_RESTORE_REGS "popl %%edx; popl %%ecx"
1245#define PV_FLAGS_ARG "0"
1246#define PV_EXTRA_CLOBBERS
1247#define PV_VEXTRA_CLOBBERS
1248#else
1249/* We save only some registers; saving all of them would be too much. We
1250 * clobber all caller-saved registers except the argument register. */
1251#define PV_SAVE_REGS "pushq %%rdi;"
1252#define PV_RESTORE_REGS "popq %%rdi;"
1253#define PV_EXTRA_CLOBBERS EXTRA_CLOBBERS, "rcx" , "rdx"
1254#define PV_VEXTRA_CLOBBERS EXTRA_CLOBBERS, "rdi", "rcx" , "rdx"
1255#define PV_FLAGS_ARG "D"
1256#endif
1257
1017static inline unsigned long __raw_local_save_flags(void) 1258static inline unsigned long __raw_local_save_flags(void)
1018{ 1259{
1019 unsigned long f; 1260 unsigned long f;
1020 1261
1021 asm volatile(paravirt_alt("pushl %%ecx; pushl %%edx;" 1262 asm volatile(paravirt_alt(PV_SAVE_REGS
1022 PARAVIRT_CALL 1263 PARAVIRT_CALL
1023 "popl %%edx; popl %%ecx") 1264 PV_RESTORE_REGS)
1024 : "=a"(f) 1265 : "=a"(f)
1025 : paravirt_type(pv_irq_ops.save_fl), 1266 : paravirt_type(pv_irq_ops.save_fl),
1026 paravirt_clobber(CLBR_EAX) 1267 paravirt_clobber(CLBR_EAX)
1027 : "memory", "cc"); 1268 : "memory", "cc" PV_VEXTRA_CLOBBERS);
1028 return f; 1269 return f;
1029} 1270}
1030 1271
1031static inline void raw_local_irq_restore(unsigned long f) 1272static inline void raw_local_irq_restore(unsigned long f)
1032{ 1273{
1033 asm volatile(paravirt_alt("pushl %%ecx; pushl %%edx;" 1274 asm volatile(paravirt_alt(PV_SAVE_REGS
1034 PARAVIRT_CALL 1275 PARAVIRT_CALL
1035 "popl %%edx; popl %%ecx") 1276 PV_RESTORE_REGS)
1036 : "=a"(f) 1277 : "=a"(f)
1037 : "0"(f), 1278 : PV_FLAGS_ARG(f),
1038 paravirt_type(pv_irq_ops.restore_fl), 1279 paravirt_type(pv_irq_ops.restore_fl),
1039 paravirt_clobber(CLBR_EAX) 1280 paravirt_clobber(CLBR_EAX)
1040 : "memory", "cc"); 1281 : "memory", "cc" PV_EXTRA_CLOBBERS);
1041} 1282}
1042 1283
1043static inline void raw_local_irq_disable(void) 1284static inline void raw_local_irq_disable(void)
1044{ 1285{
1045 asm volatile(paravirt_alt("pushl %%ecx; pushl %%edx;" 1286 asm volatile(paravirt_alt(PV_SAVE_REGS
1046 PARAVIRT_CALL 1287 PARAVIRT_CALL
1047 "popl %%edx; popl %%ecx") 1288 PV_RESTORE_REGS)
1048 : 1289 :
1049 : paravirt_type(pv_irq_ops.irq_disable), 1290 : paravirt_type(pv_irq_ops.irq_disable),
1050 paravirt_clobber(CLBR_EAX) 1291 paravirt_clobber(CLBR_EAX)
1051 : "memory", "eax", "cc"); 1292 : "memory", "eax", "cc" PV_EXTRA_CLOBBERS);
1052} 1293}
1053 1294
1054static inline void raw_local_irq_enable(void) 1295static inline void raw_local_irq_enable(void)
1055{ 1296{
1056 asm volatile(paravirt_alt("pushl %%ecx; pushl %%edx;" 1297 asm volatile(paravirt_alt(PV_SAVE_REGS
1057 PARAVIRT_CALL 1298 PARAVIRT_CALL
1058 "popl %%edx; popl %%ecx") 1299 PV_RESTORE_REGS)
1059 : 1300 :
1060 : paravirt_type(pv_irq_ops.irq_enable), 1301 : paravirt_type(pv_irq_ops.irq_enable),
1061 paravirt_clobber(CLBR_EAX) 1302 paravirt_clobber(CLBR_EAX)
1062 : "memory", "eax", "cc"); 1303 : "memory", "eax", "cc" PV_EXTRA_CLOBBERS);
1063} 1304}
1064 1305
1065static inline unsigned long __raw_local_irq_save(void) 1306static inline unsigned long __raw_local_irq_save(void)
@@ -1071,27 +1312,6 @@ static inline unsigned long __raw_local_irq_save(void)
1071 return f; 1312 return f;
1072} 1313}
1073 1314
1074#define CLI_STRING \
1075 _paravirt_alt("pushl %%ecx; pushl %%edx;" \
1076 "call *%[paravirt_cli_opptr];" \
1077 "popl %%edx; popl %%ecx", \
1078 "%c[paravirt_cli_type]", "%c[paravirt_clobber]")
1079
1080#define STI_STRING \
1081 _paravirt_alt("pushl %%ecx; pushl %%edx;" \
1082 "call *%[paravirt_sti_opptr];" \
1083 "popl %%edx; popl %%ecx", \
1084 "%c[paravirt_sti_type]", "%c[paravirt_clobber]")
1085
1086#define CLI_STI_CLOBBERS , "%eax"
1087#define CLI_STI_INPUT_ARGS \
1088 , \
1089 [paravirt_cli_type] "i" (PARAVIRT_PATCH(pv_irq_ops.irq_disable)), \
1090 [paravirt_cli_opptr] "m" (pv_irq_ops.irq_disable), \
1091 [paravirt_sti_type] "i" (PARAVIRT_PATCH(pv_irq_ops.irq_enable)), \
1092 [paravirt_sti_opptr] "m" (pv_irq_ops.irq_enable), \
1093 paravirt_clobber(CLBR_EAX)
1094
1095/* Make sure as little as possible of this mess escapes. */ 1315/* Make sure as little as possible of this mess escapes. */
1096#undef PARAVIRT_CALL 1316#undef PARAVIRT_CALL
1097#undef __PVOP_CALL 1317#undef __PVOP_CALL
@@ -1109,43 +1329,72 @@ static inline unsigned long __raw_local_irq_save(void)
1109 1329
1110#else /* __ASSEMBLY__ */ 1330#else /* __ASSEMBLY__ */
1111 1331
1112#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 4) 1332#define _PVSITE(ptype, clobbers, ops, word, algn) \
1113
1114#define PARA_SITE(ptype, clobbers, ops) \
1115771:; \ 1333771:; \
1116 ops; \ 1334 ops; \
1117772:; \ 1335772:; \
1118 .pushsection .parainstructions,"a"; \ 1336 .pushsection .parainstructions,"a"; \
1119 .long 771b; \ 1337 .align algn; \
1338 word 771b; \
1120 .byte ptype; \ 1339 .byte ptype; \
1121 .byte 772b-771b; \ 1340 .byte 772b-771b; \
1122 .short clobbers; \ 1341 .short clobbers; \
1123 .popsection 1342 .popsection
1124 1343
1344
1345#ifdef CONFIG_X86_64
1346#define PV_SAVE_REGS pushq %rax; pushq %rdi; pushq %rcx; pushq %rdx
1347#define PV_RESTORE_REGS popq %rdx; popq %rcx; popq %rdi; popq %rax
1348#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 8)
1349#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8)
1350#else
1351#define PV_SAVE_REGS pushl %eax; pushl %edi; pushl %ecx; pushl %edx
1352#define PV_RESTORE_REGS popl %edx; popl %ecx; popl %edi; popl %eax
1353#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 4)
1354#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .long, 4)
1355#endif
1356
1125#define INTERRUPT_RETURN \ 1357#define INTERRUPT_RETURN \
1126 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE, \ 1358 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE, \
1127 jmp *%cs:pv_cpu_ops+PV_CPU_iret) 1359 jmp *%cs:pv_cpu_ops+PV_CPU_iret)
1128 1360
1129#define DISABLE_INTERRUPTS(clobbers) \ 1361#define DISABLE_INTERRUPTS(clobbers) \
1130 PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \ 1362 PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
1131 pushl %eax; pushl %ecx; pushl %edx; \ 1363 PV_SAVE_REGS; \
1132 call *%cs:pv_irq_ops+PV_IRQ_irq_disable; \ 1364 call *%cs:pv_irq_ops+PV_IRQ_irq_disable; \
1133 popl %edx; popl %ecx; popl %eax) \ 1365 PV_RESTORE_REGS;) \
1134 1366
1135#define ENABLE_INTERRUPTS(clobbers) \ 1367#define ENABLE_INTERRUPTS(clobbers) \
1136 PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers, \ 1368 PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers, \
1137 pushl %eax; pushl %ecx; pushl %edx; \ 1369 PV_SAVE_REGS; \
1138 call *%cs:pv_irq_ops+PV_IRQ_irq_enable; \ 1370 call *%cs:pv_irq_ops+PV_IRQ_irq_enable; \
1139 popl %edx; popl %ecx; popl %eax) 1371 PV_RESTORE_REGS;)
1372
1373#define ENABLE_INTERRUPTS_SYSCALL_RET \
1374 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_syscall_ret),\
1375 CLBR_NONE, \
1376 jmp *%cs:pv_cpu_ops+PV_CPU_irq_enable_syscall_ret)
1140 1377
1141#define ENABLE_INTERRUPTS_SYSEXIT \
1142 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), CLBR_NONE,\
1143 jmp *%cs:pv_cpu_ops+PV_CPU_irq_enable_sysexit)
1144 1378
1379#ifdef CONFIG_X86_32
1145#define GET_CR0_INTO_EAX \ 1380#define GET_CR0_INTO_EAX \
1146 push %ecx; push %edx; \ 1381 push %ecx; push %edx; \
1147 call *pv_cpu_ops+PV_CPU_read_cr0; \ 1382 call *pv_cpu_ops+PV_CPU_read_cr0; \
1148 pop %edx; pop %ecx 1383 pop %edx; pop %ecx
1384#else
1385#define SWAPGS \
1386 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \
1387 PV_SAVE_REGS; \
1388 call *pv_cpu_ops+PV_CPU_swapgs; \
1389 PV_RESTORE_REGS \
1390 )
1391
1392#define GET_CR2_INTO_RCX \
1393 call *pv_mmu_ops+PV_MMU_read_cr2; \
1394 movq %rax, %rcx; \
1395 xorq %rax, %rax;
1396
1397#endif
1149 1398
1150#endif /* __ASSEMBLY__ */ 1399#endif /* __ASSEMBLY__ */
1151#endif /* CONFIG_PARAVIRT */ 1400#endif /* CONFIG_PARAVIRT */
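
The new __pte()/pte_val() helpers above replace the separate PAE and non-PAE variants with one body that dispatches on sizeof(pteval_t). A minimal userspace sketch of that pattern follows (all names are invented for illustration; this is not the kernel's PVOP machinery): when the pte value is wider than a native long, it is split into two register-sized halves for the call, otherwise a single argument is passed. The sizeof() comparison is a compile-time constant, so the compiler discards the unused branch.

#include <stdint.h>

typedef uint64_t demo_pteval_t;         /* e.g. 64-bit ptes on a 32-bit PAE kernel */

/* hypothetical two-argument backend: rebuilds the value from halves */
static demo_pteval_t demo_make_pte_2arg(uint32_t lo, uint32_t hi)
{
        return ((demo_pteval_t)hi << 32) | lo;
}

/* hypothetical one-argument backend for the native-width case */
static demo_pteval_t demo_make_pte_1arg(unsigned long val)
{
        return val;
}

static demo_pteval_t demo_make_pte(demo_pteval_t val)
{
        if (sizeof(demo_pteval_t) > sizeof(long))
                return demo_make_pte_2arg((uint32_t)val, (uint32_t)(val >> 32));
        else
                return demo_make_pte_1arg((unsigned long)val);
}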
diff --git a/include/asm-x86/pci.h b/include/asm-x86/pci.h
index e88361966347..c61190cb9e12 100644
--- a/include/asm-x86/pci.h
+++ b/include/asm-x86/pci.h
@@ -66,6 +66,7 @@ extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
66 66
67 67
68#ifdef CONFIG_PCI 68#ifdef CONFIG_PCI
69extern void early_quirks(void);
69static inline void pci_dma_burst_advice(struct pci_dev *pdev, 70static inline void pci_dma_burst_advice(struct pci_dev *pdev,
70 enum pci_dma_burst_strategy *strat, 71 enum pci_dma_burst_strategy *strat,
71 unsigned long *strategy_parameter) 72 unsigned long *strategy_parameter)
@@ -73,9 +74,10 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
73 *strat = PCI_DMA_BURST_INFINITY; 74 *strat = PCI_DMA_BURST_INFINITY;
74 *strategy_parameter = ~0UL; 75 *strategy_parameter = ~0UL;
75} 76}
77#else
78static inline void early_quirks(void) { }
76#endif 79#endif
77 80
78
79#endif /* __KERNEL__ */ 81#endif /* __KERNEL__ */
80 82
81#ifdef CONFIG_X86_32 83#ifdef CONFIG_X86_32
@@ -90,6 +92,19 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
90/* generic pci stuff */ 92/* generic pci stuff */
91#include <asm-generic/pci.h> 93#include <asm-generic/pci.h>
92 94
95#ifdef CONFIG_NUMA
96/* Returns the node based on pci bus */
97static inline int __pcibus_to_node(struct pci_bus *bus)
98{
99 struct pci_sysdata *sd = bus->sysdata;
93 100
101 return sd->node;
102}
103
104static inline cpumask_t __pcibus_to_cpumask(struct pci_bus *bus)
105{
106 return node_to_cpumask(__pcibus_to_node(bus));
107}
108#endif
94 109
95#endif 110#endif
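
The CONFIG_NUMA block added above derives the node from the bus's pci_sysdata. A hedged usage sketch (the helper name alloc_near_bus is made up; assumes a kernel context where kmalloc_node() and GFP_KERNEL are available):

/* Allocate a buffer on the node closest to the PCI bus a device sits on,
 * falling back to a plain allocation when NUMA is not configured. */
static void *alloc_near_bus(struct pci_bus *bus, size_t size)
{
#ifdef CONFIG_NUMA
        return kmalloc_node(size, GFP_KERNEL, __pcibus_to_node(bus));
#else
        return kmalloc(size, GFP_KERNEL);
#endif
}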
diff --git a/include/asm-x86/pci_64.h b/include/asm-x86/pci_64.h
index ef54226a9325..374690314539 100644
--- a/include/asm-x86/pci_64.h
+++ b/include/asm-x86/pci_64.h
@@ -26,7 +26,6 @@ extern int (*pci_config_write)(int seg, int bus, int dev, int fn, int reg, int l
26 26
27 27
28extern void pci_iommu_alloc(void); 28extern void pci_iommu_alloc(void);
29extern int iommu_setup(char *opt);
30 29
31/* The PCI address space does equal the physical memory 30/* The PCI address space does equal the physical memory
32 * address space. The networking and block device layers use 31 * address space. The networking and block device layers use
diff --git a/include/asm-x86/pda.h b/include/asm-x86/pda.h
index 35962bbe5e72..c0305bff0f19 100644
--- a/include/asm-x86/pda.h
+++ b/include/asm-x86/pda.h
@@ -7,22 +7,22 @@
7#include <linux/cache.h> 7#include <linux/cache.h>
8#include <asm/page.h> 8#include <asm/page.h>
9 9
10/* Per processor datastructure. %gs points to it while the kernel runs */ 10/* Per processor datastructure. %gs points to it while the kernel runs */
11struct x8664_pda { 11struct x8664_pda {
12 struct task_struct *pcurrent; /* 0 Current process */ 12 struct task_struct *pcurrent; /* 0 Current process */
13 unsigned long data_offset; /* 8 Per cpu data offset from linker 13 unsigned long data_offset; /* 8 Per cpu data offset from linker
14 address */ 14 address */
15 unsigned long kernelstack; /* 16 top of kernel stack for current */ 15 unsigned long kernelstack; /* 16 top of kernel stack for current */
16 unsigned long oldrsp; /* 24 user rsp for system call */ 16 unsigned long oldrsp; /* 24 user rsp for system call */
17 int irqcount; /* 32 Irq nesting counter. Starts with -1 */ 17 int irqcount; /* 32 Irq nesting counter. Starts -1 */
18 int cpunumber; /* 36 Logical CPU number */ 18 unsigned int cpunumber; /* 36 Logical CPU number */
19#ifdef CONFIG_CC_STACKPROTECTOR 19#ifdef CONFIG_CC_STACKPROTECTOR
20 unsigned long stack_canary; /* 40 stack canary value */ 20 unsigned long stack_canary; /* 40 stack canary value */
21 /* gcc-ABI: this canary MUST be at 21 /* gcc-ABI: this canary MUST be at
22 offset 40!!! */ 22 offset 40!!! */
23#endif 23#endif
24 char *irqstackptr; 24 char *irqstackptr;
25 int nodenumber; /* number of current node */ 25 unsigned int nodenumber; /* number of current node */
26 unsigned int __softirq_pending; 26 unsigned int __softirq_pending;
27 unsigned int __nmi_count; /* number of NMI on this CPUs */ 27 unsigned int __nmi_count; /* number of NMI on this CPUs */
28 short mmu_state; 28 short mmu_state;
@@ -40,13 +40,14 @@ struct x8664_pda {
40 40
41extern struct x8664_pda *_cpu_pda[]; 41extern struct x8664_pda *_cpu_pda[];
42extern struct x8664_pda boot_cpu_pda[]; 42extern struct x8664_pda boot_cpu_pda[];
43extern void pda_init(int);
43 44
44#define cpu_pda(i) (_cpu_pda[i]) 45#define cpu_pda(i) (_cpu_pda[i])
45 46
46/* 47/*
47 * There is no fast way to get the base address of the PDA, all the accesses 48 * There is no fast way to get the base address of the PDA, all the accesses
48 * have to mention %fs/%gs. So it needs to be done this Torvaldian way. 49 * have to mention %fs/%gs. So it needs to be done this Torvaldian way.
49 */ 50 */
50extern void __bad_pda_field(void) __attribute__((noreturn)); 51extern void __bad_pda_field(void) __attribute__((noreturn));
51 52
52/* 53/*
@@ -57,70 +58,70 @@ extern struct x8664_pda _proxy_pda;
57 58
58#define pda_offset(field) offsetof(struct x8664_pda, field) 59#define pda_offset(field) offsetof(struct x8664_pda, field)
59 60
60#define pda_to_op(op,field,val) do { \ 61#define pda_to_op(op, field, val) do { \
61 typedef typeof(_proxy_pda.field) T__; \ 62 typedef typeof(_proxy_pda.field) T__; \
62 if (0) { T__ tmp__; tmp__ = (val); } /* type checking */ \ 63 if (0) { T__ tmp__; tmp__ = (val); } /* type checking */ \
63 switch (sizeof(_proxy_pda.field)) { \ 64 switch (sizeof(_proxy_pda.field)) { \
64 case 2: \ 65 case 2: \
65 asm(op "w %1,%%gs:%c2" : \ 66 asm(op "w %1,%%gs:%c2" : \
66 "+m" (_proxy_pda.field) : \ 67 "+m" (_proxy_pda.field) : \
67 "ri" ((T__)val), \ 68 "ri" ((T__)val), \
68 "i"(pda_offset(field))); \ 69 "i"(pda_offset(field))); \
69 break; \ 70 break; \
70 case 4: \ 71 case 4: \
71 asm(op "l %1,%%gs:%c2" : \ 72 asm(op "l %1,%%gs:%c2" : \
72 "+m" (_proxy_pda.field) : \ 73 "+m" (_proxy_pda.field) : \
73 "ri" ((T__)val), \ 74 "ri" ((T__)val), \
74 "i" (pda_offset(field))); \ 75 "i" (pda_offset(field))); \
75 break; \ 76 break; \
76 case 8: \ 77 case 8: \
77 asm(op "q %1,%%gs:%c2": \ 78 asm(op "q %1,%%gs:%c2": \
78 "+m" (_proxy_pda.field) : \ 79 "+m" (_proxy_pda.field) : \
79 "ri" ((T__)val), \ 80 "ri" ((T__)val), \
80 "i"(pda_offset(field))); \ 81 "i"(pda_offset(field))); \
81 break; \ 82 break; \
82 default: \ 83 default: \
83 __bad_pda_field(); \ 84 __bad_pda_field(); \
84 } \ 85 } \
85 } while (0) 86 } while (0)
86 87
87#define pda_from_op(op,field) ({ \ 88#define pda_from_op(op,field) ({ \
88 typeof(_proxy_pda.field) ret__; \ 89 typeof(_proxy_pda.field) ret__; \
89 switch (sizeof(_proxy_pda.field)) { \ 90 switch (sizeof(_proxy_pda.field)) { \
90 case 2: \ 91 case 2: \
91 asm(op "w %%gs:%c1,%0" : \ 92 asm(op "w %%gs:%c1,%0" : \
92 "=r" (ret__) : \ 93 "=r" (ret__) : \
93 "i" (pda_offset(field)), \ 94 "i" (pda_offset(field)), \
94 "m" (_proxy_pda.field)); \ 95 "m" (_proxy_pda.field)); \
95 break; \ 96 break; \
96 case 4: \ 97 case 4: \
97 asm(op "l %%gs:%c1,%0": \ 98 asm(op "l %%gs:%c1,%0": \
98 "=r" (ret__): \ 99 "=r" (ret__): \
99 "i" (pda_offset(field)), \ 100 "i" (pda_offset(field)), \
100 "m" (_proxy_pda.field)); \ 101 "m" (_proxy_pda.field)); \
101 break; \ 102 break; \
102 case 8: \ 103 case 8: \
103 asm(op "q %%gs:%c1,%0": \ 104 asm(op "q %%gs:%c1,%0": \
104 "=r" (ret__) : \ 105 "=r" (ret__) : \
105 "i" (pda_offset(field)), \ 106 "i" (pda_offset(field)), \
106 "m" (_proxy_pda.field)); \ 107 "m" (_proxy_pda.field)); \
107 break; \ 108 break; \
108 default: \ 109 default: \
109 __bad_pda_field(); \ 110 __bad_pda_field(); \
110 } \ 111 } \
111 ret__; }) 112 ret__; })
112 113
113#define read_pda(field) pda_from_op("mov",field) 114#define read_pda(field) pda_from_op("mov", field)
114#define write_pda(field,val) pda_to_op("mov",field,val) 115#define write_pda(field, val) pda_to_op("mov", field, val)
115#define add_pda(field,val) pda_to_op("add",field,val) 116#define add_pda(field, val) pda_to_op("add", field, val)
116#define sub_pda(field,val) pda_to_op("sub",field,val) 117#define sub_pda(field, val) pda_to_op("sub", field, val)
117#define or_pda(field,val) pda_to_op("or",field,val) 118#define or_pda(field, val) pda_to_op("or", field, val)
118 119
119/* This is not atomic against other CPUs -- CPU preemption needs to be off */ 120/* This is not atomic against other CPUs -- CPU preemption needs to be off */
120#define test_and_clear_bit_pda(bit,field) ({ \ 121#define test_and_clear_bit_pda(bit, field) ({ \
121 int old__; \ 122 int old__; \
122 asm volatile("btr %2,%%gs:%c3\n\tsbbl %0,%0" \ 123 asm volatile("btr %2,%%gs:%c3\n\tsbbl %0,%0" \
123 : "=r" (old__), "+m" (_proxy_pda.field) \ 124 : "=r" (old__), "+m" (_proxy_pda.field) \
124 : "dIr" (bit), "i" (pda_offset(field)) : "memory"); \ 125 : "dIr" (bit), "i" (pda_offset(field)) : "memory"); \
125 old__; \ 126 old__; \
126}) 127})
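
The pda_to_op()/pda_from_op() macros above compile each accessor down to a single %gs-relative instruction whose width is chosen by sizeof() of the field, with the "if (0)" assignment giving compile-time type checking of val. Two illustrative wrappers (not part of the patch; the names are invented) show typical use:

/* read the logical CPU number: expands to a single movl %gs:<off>,%reg */
static inline unsigned int demo_this_cpu(void)
{
        return read_pda(cpunumber);
}

/* bump the irq nesting counter: expands to addl $1,%gs:<off> */
static inline void demo_irq_enter_note(void)
{
        add_pda(irqcount, 1);
}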
diff --git a/include/asm-x86/percpu.h b/include/asm-x86/percpu.h
index a1aaad274cca..0dec00f27eb4 100644
--- a/include/asm-x86/percpu.h
+++ b/include/asm-x86/percpu.h
@@ -1,5 +1,142 @@
1#ifdef CONFIG_X86_32 1#ifndef _ASM_X86_PERCPU_H_
2# include "percpu_32.h" 2#define _ASM_X86_PERCPU_H_
3#else 3
4# include "percpu_64.h" 4#ifdef CONFIG_X86_64
5#include <linux/compiler.h>
6
7/* Same as asm-generic/percpu.h, except that we store the per cpu offset
8 in the PDA. Longer term the PDA and every per cpu variable
9 should be just put into a single section and referenced directly
10 from %gs */
11
12#ifdef CONFIG_SMP
13#include <asm/pda.h>
14
15#define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset)
16#define __my_cpu_offset read_pda(data_offset)
17
18#define per_cpu_offset(x) (__per_cpu_offset(x))
19
5#endif 20#endif
21#include <asm-generic/percpu.h>
22
23DECLARE_PER_CPU(struct x8664_pda, pda);
24
25#else /* CONFIG_X86_64 */
26
27#ifdef __ASSEMBLY__
28
29/*
30 * PER_CPU finds an address of a per-cpu variable.
31 *
32 * Args:
33 * var - variable name
34 * reg - 32bit register
35 *
36 * The resulting address is stored in the "reg" argument.
37 *
38 * Example:
39 * PER_CPU(cpu_gdt_descr, %ebx)
40 */
41#ifdef CONFIG_SMP
42#define PER_CPU(var, reg) \
43 movl %fs:per_cpu__##this_cpu_off, reg; \
44 lea per_cpu__##var(reg), reg
45#define PER_CPU_VAR(var) %fs:per_cpu__##var
46#else /* ! SMP */
47#define PER_CPU(var, reg) \
48 movl $per_cpu__##var, reg
49#define PER_CPU_VAR(var) per_cpu__##var
50#endif /* SMP */
51
52#else /* ...!ASSEMBLY */
53
54/*
55 * PER_CPU finds an address of a per-cpu variable.
56 *
57 * Args:
58 * var - variable name
59 * cpu - 32bit register containing the current CPU number
60 *
61 * The resulting address is stored in the "cpu" argument.
62 *
63 * Example:
64 * PER_CPU(cpu_gdt_descr, %ebx)
65 */
66#ifdef CONFIG_SMP
67
68#define __my_cpu_offset x86_read_percpu(this_cpu_off)
69
70/* fs segment starts at (positive) offset == __per_cpu_offset[cpu] */
71#define __percpu_seg "%%fs:"
72
73#else /* !SMP */
74
75#define __percpu_seg ""
76
77#endif /* SMP */
78
79#include <asm-generic/percpu.h>
80
81/* We can use this directly for local CPU (faster). */
82DECLARE_PER_CPU(unsigned long, this_cpu_off);
83
84/* For arch-specific code, we can use direct single-insn ops (they
85 * don't give an lvalue though). */
86extern void __bad_percpu_size(void);
87
88#define percpu_to_op(op,var,val) \
89 do { \
90 typedef typeof(var) T__; \
91 if (0) { T__ tmp__; tmp__ = (val); } \
92 switch (sizeof(var)) { \
93 case 1: \
94 asm(op "b %1,"__percpu_seg"%0" \
95 : "+m" (var) \
96 :"ri" ((T__)val)); \
97 break; \
98 case 2: \
99 asm(op "w %1,"__percpu_seg"%0" \
100 : "+m" (var) \
101 :"ri" ((T__)val)); \
102 break; \
103 case 4: \
104 asm(op "l %1,"__percpu_seg"%0" \
105 : "+m" (var) \
106 :"ri" ((T__)val)); \
107 break; \
108 default: __bad_percpu_size(); \
109 } \
110 } while (0)
111
112#define percpu_from_op(op,var) \
113 ({ \
114 typeof(var) ret__; \
115 switch (sizeof(var)) { \
116 case 1: \
117 asm(op "b "__percpu_seg"%1,%0" \
118 : "=r" (ret__) \
119 : "m" (var)); \
120 break; \
121 case 2: \
122 asm(op "w "__percpu_seg"%1,%0" \
123 : "=r" (ret__) \
124 : "m" (var)); \
125 break; \
126 case 4: \
127 asm(op "l "__percpu_seg"%1,%0" \
128 : "=r" (ret__) \
129 : "m" (var)); \
130 break; \
131 default: __bad_percpu_size(); \
132 } \
133 ret__; })
134
135#define x86_read_percpu(var) percpu_from_op("mov", per_cpu__##var)
136#define x86_write_percpu(var,val) percpu_to_op("mov", per_cpu__##var, val)
137#define x86_add_percpu(var,val) percpu_to_op("add", per_cpu__##var, val)
138#define x86_sub_percpu(var,val) percpu_to_op("sub", per_cpu__##var, val)
139#define x86_or_percpu(var,val) percpu_to_op("or", per_cpu__##var, val)
140#endif /* !__ASSEMBLY__ */
141#endif /* !CONFIG_X86_64 */
142#endif /* _ASM_X86_PERCPU_H_ */
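
On 32-bit, the unified header above keeps the x86_read_percpu()/x86_write_percpu() family, which touch the local CPU's copy through the %fs segment (on SMP) in one instruction. A hedged sketch of typical use follows; the variable and helper names are invented:

/* per-CPU counter; DEFINE_PER_CPU places it in the .data.percpu section */
DEFINE_PER_CPU(unsigned int, demo_irq_hits);

static inline void demo_count_irq(void)
{
        x86_add_percpu(demo_irq_hits, 1);       /* addl $1,%fs:per_cpu__demo_irq_hits */
}

static inline unsigned int demo_irq_hits_here(void)
{
        return x86_read_percpu(demo_irq_hits);
}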
diff --git a/include/asm-x86/percpu_32.h b/include/asm-x86/percpu_32.h
deleted file mode 100644
index a7ebd436f3cc..000000000000
--- a/include/asm-x86/percpu_32.h
+++ /dev/null
@@ -1,154 +0,0 @@
1#ifndef __ARCH_I386_PERCPU__
2#define __ARCH_I386_PERCPU__
3
4#ifdef __ASSEMBLY__
5
6/*
7 * PER_CPU finds an address of a per-cpu variable.
8 *
9 * Args:
10 * var - variable name
11 * reg - 32bit register
12 *
13 * The resulting address is stored in the "reg" argument.
14 *
15 * Example:
16 * PER_CPU(cpu_gdt_descr, %ebx)
17 */
18#ifdef CONFIG_SMP
19#define PER_CPU(var, reg) \
20 movl %fs:per_cpu__##this_cpu_off, reg; \
21 lea per_cpu__##var(reg), reg
22#define PER_CPU_VAR(var) %fs:per_cpu__##var
23#else /* ! SMP */
24#define PER_CPU(var, reg) \
25 movl $per_cpu__##var, reg
26#define PER_CPU_VAR(var) per_cpu__##var
27#endif /* SMP */
28
29#else /* ...!ASSEMBLY */
30
31/*
32 * PER_CPU finds an address of a per-cpu variable.
33 *
34 * Args:
35 * var - variable name
36 * cpu - 32bit register containing the current CPU number
37 *
38 * The resulting address is stored in the "cpu" argument.
39 *
40 * Example:
41 * PER_CPU(cpu_gdt_descr, %ebx)
42 */
43#ifdef CONFIG_SMP
44/* Same as generic implementation except for optimized local access. */
45#define __GENERIC_PER_CPU
46
47/* This is used for other cpus to find our section. */
48extern unsigned long __per_cpu_offset[];
49
50#define per_cpu_offset(x) (__per_cpu_offset[x])
51
52/* Separate out the type, so (int[3], foo) works. */
53#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name
54#define DEFINE_PER_CPU(type, name) \
55 __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
56
57#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
58 __attribute__((__section__(".data.percpu.shared_aligned"))) \
59 __typeof__(type) per_cpu__##name \
60 ____cacheline_aligned_in_smp
61
62/* We can use this directly for local CPU (faster). */
63DECLARE_PER_CPU(unsigned long, this_cpu_off);
64
65/* var is in discarded region: offset to particular copy we want */
66#define per_cpu(var, cpu) (*({ \
67 extern int simple_indentifier_##var(void); \
68 RELOC_HIDE(&per_cpu__##var, __per_cpu_offset[cpu]); }))
69
70#define __raw_get_cpu_var(var) (*({ \
71 extern int simple_indentifier_##var(void); \
72 RELOC_HIDE(&per_cpu__##var, x86_read_percpu(this_cpu_off)); \
73}))
74
75#define __get_cpu_var(var) __raw_get_cpu_var(var)
76
77/* A macro to avoid #include hell... */
78#define percpu_modcopy(pcpudst, src, size) \
79do { \
80 unsigned int __i; \
81 for_each_possible_cpu(__i) \
82 memcpy((pcpudst)+__per_cpu_offset[__i], \
83 (src), (size)); \
84} while (0)
85
86#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
87#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
88
89/* fs segment starts at (positive) offset == __per_cpu_offset[cpu] */
90#define __percpu_seg "%%fs:"
91#else /* !SMP */
92#include <asm-generic/percpu.h>
93#define __percpu_seg ""
94#endif /* SMP */
95
96/* For arch-specific code, we can use direct single-insn ops (they
97 * don't give an lvalue though). */
98extern void __bad_percpu_size(void);
99
100#define percpu_to_op(op,var,val) \
101 do { \
102 typedef typeof(var) T__; \
103 if (0) { T__ tmp__; tmp__ = (val); } \
104 switch (sizeof(var)) { \
105 case 1: \
106 asm(op "b %1,"__percpu_seg"%0" \
107 : "+m" (var) \
108 :"ri" ((T__)val)); \
109 break; \
110 case 2: \
111 asm(op "w %1,"__percpu_seg"%0" \
112 : "+m" (var) \
113 :"ri" ((T__)val)); \
114 break; \
115 case 4: \
116 asm(op "l %1,"__percpu_seg"%0" \
117 : "+m" (var) \
118 :"ri" ((T__)val)); \
119 break; \
120 default: __bad_percpu_size(); \
121 } \
122 } while (0)
123
124#define percpu_from_op(op,var) \
125 ({ \
126 typeof(var) ret__; \
127 switch (sizeof(var)) { \
128 case 1: \
129 asm(op "b "__percpu_seg"%1,%0" \
130 : "=r" (ret__) \
131 : "m" (var)); \
132 break; \
133 case 2: \
134 asm(op "w "__percpu_seg"%1,%0" \
135 : "=r" (ret__) \
136 : "m" (var)); \
137 break; \
138 case 4: \
139 asm(op "l "__percpu_seg"%1,%0" \
140 : "=r" (ret__) \
141 : "m" (var)); \
142 break; \
143 default: __bad_percpu_size(); \
144 } \
145 ret__; })
146
147#define x86_read_percpu(var) percpu_from_op("mov", per_cpu__##var)
148#define x86_write_percpu(var,val) percpu_to_op("mov", per_cpu__##var, val)
149#define x86_add_percpu(var,val) percpu_to_op("add", per_cpu__##var, val)
150#define x86_sub_percpu(var,val) percpu_to_op("sub", per_cpu__##var, val)
151#define x86_or_percpu(var,val) percpu_to_op("or", per_cpu__##var, val)
152#endif /* !__ASSEMBLY__ */
153
154#endif /* __ARCH_I386_PERCPU__ */
diff --git a/include/asm-x86/percpu_64.h b/include/asm-x86/percpu_64.h
deleted file mode 100644
index 5abd48270101..000000000000
--- a/include/asm-x86/percpu_64.h
+++ /dev/null
@@ -1,68 +0,0 @@
1#ifndef _ASM_X8664_PERCPU_H_
2#define _ASM_X8664_PERCPU_H_
3#include <linux/compiler.h>
4
5/* Same as asm-generic/percpu.h, except that we store the per cpu offset
6 in the PDA. Longer term the PDA and every per cpu variable
7 should be just put into a single section and referenced directly
8 from %gs */
9
10#ifdef CONFIG_SMP
11
12#include <asm/pda.h>
13
14#define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset)
15#define __my_cpu_offset() read_pda(data_offset)
16
17#define per_cpu_offset(x) (__per_cpu_offset(x))
18
19/* Separate out the type, so (int[3], foo) works. */
20#define DEFINE_PER_CPU(type, name) \
21 __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
22
23#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
24 __attribute__((__section__(".data.percpu.shared_aligned"))) \
25 __typeof__(type) per_cpu__##name \
26 ____cacheline_internodealigned_in_smp
27
28/* var is in discarded region: offset to particular copy we want */
29#define per_cpu(var, cpu) (*({ \
30 extern int simple_identifier_##var(void); \
31 RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu)); }))
32#define __get_cpu_var(var) (*({ \
33 extern int simple_identifier_##var(void); \
34 RELOC_HIDE(&per_cpu__##var, __my_cpu_offset()); }))
35#define __raw_get_cpu_var(var) (*({ \
36 extern int simple_identifier_##var(void); \
37 RELOC_HIDE(&per_cpu__##var, __my_cpu_offset()); }))
38
39/* A macro to avoid #include hell... */
40#define percpu_modcopy(pcpudst, src, size) \
41do { \
42 unsigned int __i; \
43 for_each_possible_cpu(__i) \
44 memcpy((pcpudst)+__per_cpu_offset(__i), \
45 (src), (size)); \
46} while (0)
47
48extern void setup_per_cpu_areas(void);
49
50#else /* ! SMP */
51
52#define DEFINE_PER_CPU(type, name) \
53 __typeof__(type) per_cpu__##name
54#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
55 DEFINE_PER_CPU(type, name)
56
57#define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var))
58#define __get_cpu_var(var) per_cpu__##var
59#define __raw_get_cpu_var(var) per_cpu__##var
60
61#endif /* SMP */
62
63#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name
64
65#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
66#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
67
68#endif /* _ASM_X8664_PERCPU_H_ */
diff --git a/include/asm-x86/pgalloc_32.h b/include/asm-x86/pgalloc_32.h
index f2fc33ceb9f2..10c2b452e64c 100644
--- a/include/asm-x86/pgalloc_32.h
+++ b/include/asm-x86/pgalloc_32.h
@@ -3,31 +3,33 @@
3 3
4#include <linux/threads.h> 4#include <linux/threads.h>
5#include <linux/mm.h> /* for struct page */ 5#include <linux/mm.h> /* for struct page */
6#include <asm/tlb.h>
7#include <asm-generic/tlb.h>
6 8
7#ifdef CONFIG_PARAVIRT 9#ifdef CONFIG_PARAVIRT
8#include <asm/paravirt.h> 10#include <asm/paravirt.h>
9#else 11#else
10#define paravirt_alloc_pt(mm, pfn) do { } while (0) 12#define paravirt_alloc_pt(mm, pfn) do { } while (0)
11#define paravirt_alloc_pd(pfn) do { } while (0) 13#define paravirt_alloc_pd(mm, pfn) do { } while (0)
12#define paravirt_alloc_pd(pfn) do { } while (0)
13#define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0) 14#define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0)
14#define paravirt_release_pt(pfn) do { } while (0) 15#define paravirt_release_pt(pfn) do { } while (0)
15#define paravirt_release_pd(pfn) do { } while (0) 16#define paravirt_release_pd(pfn) do { } while (0)
16#endif 17#endif
17 18
18#define pmd_populate_kernel(mm, pmd, pte) \ 19static inline void pmd_populate_kernel(struct mm_struct *mm,
19do { \ 20 pmd_t *pmd, pte_t *pte)
20 paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT); \ 21{
21 set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))); \ 22 paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT);
22} while (0) 23 set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
24}
25
26static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
27{
28 unsigned long pfn = page_to_pfn(pte);
23 29
24#define pmd_populate(mm, pmd, pte) \ 30 paravirt_alloc_pt(mm, pfn);
25do { \ 31 set_pmd(pmd, __pmd(((pteval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
26 paravirt_alloc_pt(mm, page_to_pfn(pte)); \ 32}
27 set_pmd(pmd, __pmd(_PAGE_TABLE + \
28 ((unsigned long long)page_to_pfn(pte) << \
29 (unsigned long long) PAGE_SHIFT))); \
30} while (0)
31 33
32/* 34/*
33 * Allocate and free page tables. 35 * Allocate and free page tables.
@@ -49,20 +51,55 @@ static inline void pte_free(struct page *pte)
49} 51}
50 52
51 53
52#define __pte_free_tlb(tlb,pte) \ 54static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
53do { \ 55{
54 paravirt_release_pt(page_to_pfn(pte)); \ 56 paravirt_release_pt(page_to_pfn(pte));
55 tlb_remove_page((tlb),(pte)); \ 57 tlb_remove_page(tlb, pte);
56} while (0) 58}
57 59
58#ifdef CONFIG_X86_PAE 60#ifdef CONFIG_X86_PAE
59/* 61/*
60 * In the PAE case we free the pmds as part of the pgd. 62 * In the PAE case we free the pmds as part of the pgd.
61 */ 63 */
62#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) 64static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
63#define pmd_free(x) do { } while (0) 65{
64#define __pmd_free_tlb(tlb,x) do { } while (0) 66 return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
65#define pud_populate(mm, pmd, pte) BUG() 67}
66#endif 68
69static inline void pmd_free(pmd_t *pmd)
70{
71 BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
72 free_page((unsigned long)pmd);
73}
74
75static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
76{
77 /* This is called just after the pmd has been detached from
78 the pgd, which requires a full tlb flush to be recognized
79 by the CPU. Rather than incurring multiple tlb flushes
80 while the address space is being pulled down, make the tlb
81 gathering machinery do a full flush when we're done. */
82 tlb->fullmm = 1;
83
84 paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
85 tlb_remove_page(tlb, virt_to_page(pmd));
86}
87
88static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
89{
90 paravirt_alloc_pd(mm, __pa(pmd) >> PAGE_SHIFT);
91
92 /* Note: almost everything apart from _PAGE_PRESENT is
93 reserved at the pmd (PDPT) level. */
94 set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
95
96 /*
97 * Pentium-II erratum A13: in PAE mode we explicitly have to flush
98 * the TLB via cr3 if the top-level pgd is changed...
99 */
100 if (mm == current->active_mm)
101 write_cr3(read_cr3());
102}
103#endif /* CONFIG_X86_PAE */
67 104
68#endif /* _I386_PGALLOC_H */ 105#endif /* _I386_PGALLOC_H */
diff --git a/include/asm-x86/pgtable-2level.h b/include/asm-x86/pgtable-2level.h
index 84b03cf56a79..701404fab308 100644
--- a/include/asm-x86/pgtable-2level.h
+++ b/include/asm-x86/pgtable-2level.h
@@ -15,30 +15,31 @@ static inline void native_set_pte(pte_t *ptep , pte_t pte)
15{ 15{
16 *ptep = pte; 16 *ptep = pte;
17} 17}
18static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr, 18
19 pte_t *ptep , pte_t pte)
20{
21 native_set_pte(ptep, pte);
22}
23static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) 19static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
24{ 20{
25 *pmdp = pmd; 21 *pmdp = pmd;
26} 22}
27#ifndef CONFIG_PARAVIRT
28#define set_pte(pteptr, pteval) native_set_pte(pteptr, pteval)
29#define set_pte_at(mm,addr,ptep,pteval) native_set_pte_at(mm, addr, ptep, pteval)
30#define set_pmd(pmdptr, pmdval) native_set_pmd(pmdptr, pmdval)
31#endif
32 23
33#define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval) 24static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
34#define set_pte_present(mm,addr,ptep,pteval) set_pte_at(mm,addr,ptep,pteval) 25{
26 native_set_pte(ptep, pte);
27}
35 28
36#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0) 29static inline void native_set_pte_present(struct mm_struct *mm, unsigned long addr,
37#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) 30 pte_t *ptep, pte_t pte)
31{
32 native_set_pte(ptep, pte);
33}
34
35static inline void native_pmd_clear(pmd_t *pmdp)
36{
37 native_set_pmd(pmdp, __pmd(0));
38}
38 39
39static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *xp) 40static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *xp)
40{ 41{
41 *xp = __pte(0); 42 *xp = native_make_pte(0);
42} 43}
43 44
44#ifdef CONFIG_SMP 45#ifdef CONFIG_SMP
@@ -53,16 +54,6 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
53#define pte_page(x) pfn_to_page(pte_pfn(x)) 54#define pte_page(x) pfn_to_page(pte_pfn(x))
54#define pte_none(x) (!(x).pte_low) 55#define pte_none(x) (!(x).pte_low)
55#define pte_pfn(x) (pte_val(x) >> PAGE_SHIFT) 56#define pte_pfn(x) (pte_val(x) >> PAGE_SHIFT)
56#define pfn_pte(pfn, prot) __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
57#define pfn_pmd(pfn, prot) __pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
58
59/*
60 * All present pages are kernel-executable:
61 */
62static inline int pte_exec_kernel(pte_t pte)
63{
64 return 1;
65}
66 57
67/* 58/*
68 * Bits 0, 6 and 7 are taken, split up the 29 bits of offset 59 * Bits 0, 6 and 7 are taken, split up the 29 bits of offset
@@ -74,13 +65,13 @@ static inline int pte_exec_kernel(pte_t pte)
74 ((((pte).pte_low >> 1) & 0x1f ) + (((pte).pte_low >> 8) << 5 )) 65 ((((pte).pte_low >> 1) & 0x1f ) + (((pte).pte_low >> 8) << 5 ))
75 66
76#define pgoff_to_pte(off) \ 67#define pgoff_to_pte(off) \
77 ((pte_t) { (((off) & 0x1f) << 1) + (((off) >> 5) << 8) + _PAGE_FILE }) 68 ((pte_t) { .pte_low = (((off) & 0x1f) << 1) + (((off) >> 5) << 8) + _PAGE_FILE })
78 69
79/* Encode and de-code a swap entry */ 70/* Encode and de-code a swap entry */
80#define __swp_type(x) (((x).val >> 1) & 0x1f) 71#define __swp_type(x) (((x).val >> 1) & 0x1f)
81#define __swp_offset(x) ((x).val >> 8) 72#define __swp_offset(x) ((x).val >> 8)
82#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) }) 73#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
83#define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low }) 74#define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low })
84#define __swp_entry_to_pte(x) ((pte_t) { (x).val }) 75#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
85 76
86#endif /* _I386_PGTABLE_2LEVEL_H */ 77#endif /* _I386_PGTABLE_2LEVEL_H */
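
The swap-entry macros above pack the type into bits 1-5 and the offset from bit 8 upward, leaving bit 0 (_PAGE_PRESENT) clear so the pte is never seen as present. A small round-trip sketch under those definitions (type 3 and offset 0x1234 are arbitrary example values; the function is illustrative, not from the patch):

static void demo_swap_roundtrip(void)
{
        swp_entry_t e = __swp_entry(3, 0x1234);
        pte_t pte = __swp_entry_to_pte(e);

        BUG_ON(__swp_type(__pte_to_swp_entry(pte)) != 3);
        BUG_ON(__swp_offset(__pte_to_swp_entry(pte)) != 0x1234);
}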
diff --git a/include/asm-x86/pgtable-3level.h b/include/asm-x86/pgtable-3level.h
index 948a33414118..a195c3e757b9 100644
--- a/include/asm-x86/pgtable-3level.h
+++ b/include/asm-x86/pgtable-3level.h
@@ -15,16 +15,18 @@
15#define pgd_ERROR(e) \ 15#define pgd_ERROR(e) \
16 printk("%s:%d: bad pgd %p(%016Lx).\n", __FILE__, __LINE__, &(e), pgd_val(e)) 16 printk("%s:%d: bad pgd %p(%016Lx).\n", __FILE__, __LINE__, &(e), pgd_val(e))
17 17
18#define pud_none(pud) 0
19#define pud_bad(pud) 0
20#define pud_present(pud) 1
21 18
22/* 19static inline int pud_none(pud_t pud)
23 * All present pages with !NX bit are kernel-executable: 20{
24 */ 21 return pud_val(pud) == 0;
25static inline int pte_exec_kernel(pte_t pte) 22}
23static inline int pud_bad(pud_t pud)
24{
25 return (pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
26}
27static inline int pud_present(pud_t pud)
26{ 28{
27 return !(pte_val(pte) & _PAGE_NX); 29 return pud_val(pud) & _PAGE_PRESENT;
28} 30}
29 31
30/* Rules for using set_pte: the pte being assigned *must* be 32/* Rules for using set_pte: the pte being assigned *must* be
@@ -39,11 +41,6 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte)
39 smp_wmb(); 41 smp_wmb();
40 ptep->pte_low = pte.pte_low; 42 ptep->pte_low = pte.pte_low;
41} 43}
42static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
43 pte_t *ptep , pte_t pte)
44{
45 native_set_pte(ptep, pte);
46}
47 44
48/* 45/*
49 * Since this is only called on user PTEs, and the page fault handler 46 * Since this is only called on user PTEs, and the page fault handler
@@ -71,7 +68,7 @@ static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
71} 68}
72static inline void native_set_pud(pud_t *pudp, pud_t pud) 69static inline void native_set_pud(pud_t *pudp, pud_t pud)
73{ 70{
74 *pudp = pud; 71 set_64bit((unsigned long long *)(pudp),native_pud_val(pud));
75} 72}
76 73
77/* 74/*
@@ -94,24 +91,29 @@ static inline void native_pmd_clear(pmd_t *pmd)
94 *(tmp + 1) = 0; 91 *(tmp + 1) = 0;
95} 92}
96 93
97#ifndef CONFIG_PARAVIRT 94static inline void pud_clear(pud_t *pudp)
98#define set_pte(ptep, pte) native_set_pte(ptep, pte) 95{
99#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte) 96 set_pud(pudp, __pud(0));
100#define set_pte_present(mm, addr, ptep, pte) native_set_pte_present(mm, addr, ptep, pte) 97
101#define set_pte_atomic(ptep, pte) native_set_pte_atomic(ptep, pte) 98 /*
102#define set_pmd(pmdp, pmd) native_set_pmd(pmdp, pmd) 99 * In principle we need to do a cr3 reload here to make sure
103#define set_pud(pudp, pud) native_set_pud(pudp, pud) 100 * the processor recognizes the changed pgd. In practice, all
104#define pte_clear(mm, addr, ptep) native_pte_clear(mm, addr, ptep) 101 * the places where pud_clear() gets called are followed by
105#define pmd_clear(pmd) native_pmd_clear(pmd) 102 * full tlb flushes anyway, so we can defer the cost here.
106#endif 103 *
107 104 * Specifically:
108/* 105 *
109 * Pentium-II erratum A13: in PAE mode we explicitly have to flush 106 * mm/memory.c:free_pmd_range() - immediately after the
110 * the TLB via cr3 if the top-level pgd is changed... 107 * pud_clear() it does a pmd_free_tlb(). We change the
111 * We do not let the generic code free and clear pgd entries due to 108 * mmu_gather structure to do a full tlb flush (which has the
112 * this erratum. 109 * effect of reloading cr3) when the pagetable free is
113 */ 110 * complete.
114static inline void pud_clear (pud_t * pud) { } 111 *
112 * arch/x86/mm/hugetlbpage.c:huge_pmd_unshare() - the call to
113 * this is followed by a flush_tlb_range, which on x86 does a
114 * full tlb flush.
115 */
116}
115 117
116#define pud_page(pud) \ 118#define pud_page(pud) \
117((struct page *) __va(pud_val(pud) & PAGE_MASK)) 119((struct page *) __va(pud_val(pud) & PAGE_MASK))
@@ -155,21 +157,7 @@ static inline int pte_none(pte_t pte)
155 157
156static inline unsigned long pte_pfn(pte_t pte) 158static inline unsigned long pte_pfn(pte_t pte)
157{ 159{
158 return pte_val(pte) >> PAGE_SHIFT; 160 return (pte_val(pte) & ~_PAGE_NX) >> PAGE_SHIFT;
159}
160
161extern unsigned long long __supported_pte_mask;
162
163static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
164{
165 return __pte((((unsigned long long)page_nr << PAGE_SHIFT) |
166 pgprot_val(pgprot)) & __supported_pte_mask);
167}
168
169static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
170{
171 return __pmd((((unsigned long long)page_nr << PAGE_SHIFT) |
172 pgprot_val(pgprot)) & __supported_pte_mask);
173} 161}
174 162
175/* 163/*
@@ -177,7 +165,7 @@ static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
177 * put the 32 bits of offset into the high part. 165 * put the 32 bits of offset into the high part.
178 */ 166 */
179#define pte_to_pgoff(pte) ((pte).pte_high) 167#define pte_to_pgoff(pte) ((pte).pte_high)
180#define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) }) 168#define pgoff_to_pte(off) ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
181#define PTE_FILE_MAX_BITS 32 169#define PTE_FILE_MAX_BITS 32
182 170
183/* Encode and de-code a swap entry */ 171/* Encode and de-code a swap entry */
@@ -185,8 +173,6 @@ static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
185#define __swp_offset(x) ((x).val >> 5) 173#define __swp_offset(x) ((x).val >> 5)
186#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5}) 174#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
187#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) 175#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
188#define __swp_entry_to_pte(x) ((pte_t){ 0, (x).val }) 176#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } })
189
190#define __pmd_free_tlb(tlb, x) do { } while (0)
191 177
192#endif /* _I386_PGTABLE_3LEVEL_H */ 178#endif /* _I386_PGTABLE_3LEVEL_H */
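
native_set_pud() now uses set_64bit() so both 32-bit halves of a PAE pud are updated atomically, and native_set_pte() (partially shown above) orders its two stores with smp_wmb(). A userspace model of that store ordering, with invented names and __sync_synchronize() standing in for the barrier:

#include <stdint.h>

struct demo_pae_pte { volatile uint32_t pte_low, pte_high; };

/* Write the high word first and the low word (which carries the present
 * bit) last, so a concurrent page-table walker never pairs a present low
 * word with stale high bits. */
static void demo_set_pte(struct demo_pae_pte *ptep, uint64_t val)
{
        ptep->pte_high = (uint32_t)(val >> 32);
        __sync_synchronize();           /* stand-in for smp_wmb() */
        ptep->pte_low = (uint32_t)val;
}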
diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h
index 1039140652af..cd2524f07452 100644
--- a/include/asm-x86/pgtable.h
+++ b/include/asm-x86/pgtable.h
@@ -1,5 +1,364 @@
1#ifndef _ASM_X86_PGTABLE_H
2#define _ASM_X86_PGTABLE_H
3
4#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1)
5#define FIRST_USER_ADDRESS 0
6
7#define _PAGE_BIT_PRESENT 0
8#define _PAGE_BIT_RW 1
9#define _PAGE_BIT_USER 2
10#define _PAGE_BIT_PWT 3
11#define _PAGE_BIT_PCD 4
12#define _PAGE_BIT_ACCESSED 5
13#define _PAGE_BIT_DIRTY 6
14#define _PAGE_BIT_FILE 6
15#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
16#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
17#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
18#define _PAGE_BIT_UNUSED2 10
19#define _PAGE_BIT_UNUSED3 11
20#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
21
22/*
23 * Note: we use _AC(1, L) instead of _AC(1, UL) so that we get a
24 * sign-extended value on 32-bit with all 1's in the upper word,
25 * which preserves the upper pte values on 64-bit ptes:
26 */
27#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT)
28#define _PAGE_RW (_AC(1, L)<<_PAGE_BIT_RW)
29#define _PAGE_USER (_AC(1, L)<<_PAGE_BIT_USER)
30#define _PAGE_PWT (_AC(1, L)<<_PAGE_BIT_PWT)
31#define _PAGE_PCD (_AC(1, L)<<_PAGE_BIT_PCD)
32#define _PAGE_ACCESSED (_AC(1, L)<<_PAGE_BIT_ACCESSED)
33#define _PAGE_DIRTY (_AC(1, L)<<_PAGE_BIT_DIRTY)
34#define _PAGE_PSE (_AC(1, L)<<_PAGE_BIT_PSE) /* 2MB page */
35#define _PAGE_GLOBAL (_AC(1, L)<<_PAGE_BIT_GLOBAL) /* Global TLB entry */
36#define _PAGE_UNUSED1 (_AC(1, L)<<_PAGE_BIT_UNUSED1)
37#define _PAGE_UNUSED2 (_AC(1, L)<<_PAGE_BIT_UNUSED2)
38#define _PAGE_UNUSED3 (_AC(1, L)<<_PAGE_BIT_UNUSED3)
39
40#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
41#define _PAGE_NX (_AC(1, ULL) << _PAGE_BIT_NX)
42#else
43#define _PAGE_NX 0
44#endif
45
46/* If _PAGE_PRESENT is clear, we use these: */
47#define _PAGE_FILE _PAGE_DIRTY /* nonlinear file mapping, saved PTE; unset:swap */
48#define _PAGE_PROTNONE _PAGE_PSE /* if the user mapped it with PROT_NONE;
49 pte_present gives true */
50
51#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
52#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
53
54#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)
55
56#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
57#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
58
59#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
60#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
61#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
62#define PAGE_COPY PAGE_COPY_NOEXEC
63#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
64#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
65
66#ifdef CONFIG_X86_32
67#define _PAGE_KERNEL_EXEC \
68 (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
69#define _PAGE_KERNEL (_PAGE_KERNEL_EXEC | _PAGE_NX)
70
71#ifndef __ASSEMBLY__
72extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
73#endif /* __ASSEMBLY__ */
74#else
75#define __PAGE_KERNEL_EXEC \
76 (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
77#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
78#endif
79
80#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
81#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
82#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT)
83#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
84#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
85#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
86#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
87#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
88
89#ifdef CONFIG_X86_32
90# define MAKE_GLOBAL(x) __pgprot((x))
91#else
92# define MAKE_GLOBAL(x) __pgprot((x) | _PAGE_GLOBAL)
93#endif
94
95#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
96#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
97#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
98#define PAGE_KERNEL_RX MAKE_GLOBAL(__PAGE_KERNEL_RX)
99#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
100#define PAGE_KERNEL_EXEC_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE)
101#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
102#define PAGE_KERNEL_LARGE_EXEC MAKE_GLOBAL(__PAGE_KERNEL_LARGE_EXEC)
103#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
104#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
105
106/* xwr */
107#define __P000 PAGE_NONE
108#define __P001 PAGE_READONLY
109#define __P010 PAGE_COPY
110#define __P011 PAGE_COPY
111#define __P100 PAGE_READONLY_EXEC
112#define __P101 PAGE_READONLY_EXEC
113#define __P110 PAGE_COPY_EXEC
114#define __P111 PAGE_COPY_EXEC
115
116#define __S000 PAGE_NONE
117#define __S001 PAGE_READONLY
118#define __S010 PAGE_SHARED
119#define __S011 PAGE_SHARED
120#define __S100 PAGE_READONLY_EXEC
121#define __S101 PAGE_READONLY_EXEC
122#define __S110 PAGE_SHARED_EXEC
123#define __S111 PAGE_SHARED_EXEC
124
125#ifndef __ASSEMBLY__
126
127/*
128 * ZERO_PAGE is a global shared page that is always zero: used
129 * for zero-mapped memory areas etc..
130 */
131extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
132#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
133
134extern spinlock_t pgd_lock;
135extern struct list_head pgd_list;
136
137/*
138 * The following only work if pte_present() is true.
139 * Undefined behaviour if not..
140 */
141static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; }
142static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; }
143static inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_RW; }
144static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; }
145static inline int pte_huge(pte_t pte) { return pte_val(pte) & _PAGE_PSE; }
146static inline int pte_global(pte_t pte) { return pte_val(pte) & _PAGE_GLOBAL; }
147static inline int pte_exec(pte_t pte) { return !(pte_val(pte) & _PAGE_NX); }
148
149static inline int pmd_large(pmd_t pte) {
150 return (pmd_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) ==
151 (_PAGE_PSE|_PAGE_PRESENT);
152}
153
154static inline pte_t pte_mkclean(pte_t pte) { return __pte(pte_val(pte) & ~(pteval_t)_PAGE_DIRTY); }
155static inline pte_t pte_mkold(pte_t pte) { return __pte(pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED); }
156static inline pte_t pte_wrprotect(pte_t pte) { return __pte(pte_val(pte) & ~(pteval_t)_PAGE_RW); }
157static inline pte_t pte_mkexec(pte_t pte) { return __pte(pte_val(pte) & ~(pteval_t)_PAGE_NX); }
158static inline pte_t pte_mkdirty(pte_t pte) { return __pte(pte_val(pte) | _PAGE_DIRTY); }
159static inline pte_t pte_mkyoung(pte_t pte) { return __pte(pte_val(pte) | _PAGE_ACCESSED); }
160static inline pte_t pte_mkwrite(pte_t pte) { return __pte(pte_val(pte) | _PAGE_RW); }
161static inline pte_t pte_mkhuge(pte_t pte) { return __pte(pte_val(pte) | _PAGE_PSE); }
162static inline pte_t pte_clrhuge(pte_t pte) { return __pte(pte_val(pte) & ~(pteval_t)_PAGE_PSE); }
163static inline pte_t pte_mkglobal(pte_t pte) { return __pte(pte_val(pte) | _PAGE_GLOBAL); }
164static inline pte_t pte_clrglobal(pte_t pte) { return __pte(pte_val(pte) & ~(pteval_t)_PAGE_GLOBAL); }
165
166extern pteval_t __supported_pte_mask;
167
168static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
169{
170 return __pte((((phys_addr_t)page_nr << PAGE_SHIFT) |
171 pgprot_val(pgprot)) & __supported_pte_mask);
172}
173
174static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
175{
176 return __pmd((((phys_addr_t)page_nr << PAGE_SHIFT) |
177 pgprot_val(pgprot)) & __supported_pte_mask);
178}
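/*
 * Illustrative sketch (hypothetical helper): pfn_pte()/pfn_pmd() build an
 * entry from a page frame number plus protection bits, and the AND with
 * __supported_pte_mask drops bits such as _PAGE_NX on CPUs without NX.
 * Assumes 'kaddr' lies in the directly mapped kernel range so __pa() is valid.
 */
static inline pte_t example_kernel_pte(void *kaddr)
{
	unsigned long pfn = __pa(kaddr) >> PAGE_SHIFT;	/* physical frame number */

	return pfn_pte(pfn, PAGE_KERNEL);
}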
179
180static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
181{
182 pteval_t val = pte_val(pte);
183
184 /*
185 * Chop off the NX bit (if present), and add the NX portion of
186 * the newprot (if present):
187 */
188 val &= _PAGE_CHG_MASK & ~_PAGE_NX;
189 val |= pgprot_val(newprot) & __supported_pte_mask;
190
191 return __pte(val);
192}
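/*
 * Illustrative sketch (hypothetical helper): pte_modify() keeps the pfn and
 * the accessed/dirty bits and replaces the protection bits (including NX)
 * with those from 'newprot', which is exactly what an mprotect()-style
 * change needs.  Assumes the generic vm_get_page_prot() helper from
 * <linux/mm.h>; locking and the TLB flush are left out.
 */
static inline pte_t example_change_prot(struct vm_area_struct *vma, pte_t old)
{
	pgprot_t newprot = vm_get_page_prot(vma->vm_flags);

	return pte_modify(old, newprot);	/* pfn and A/D bits survive */
}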
193
194#define pte_pgprot(x) __pgprot(pte_val(x) & (0xfff | _PAGE_NX))
195
196#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
197
198#ifdef CONFIG_PARAVIRT
199#include <asm/paravirt.h>
200#else /* !CONFIG_PARAVIRT */
201#define set_pte(ptep, pte) native_set_pte(ptep, pte)
202#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte)
203
204#define set_pte_present(mm, addr, ptep, pte) \
205 native_set_pte_present(mm, addr, ptep, pte)
206#define set_pte_atomic(ptep, pte) \
207 native_set_pte_atomic(ptep, pte)
208
209#define set_pmd(pmdp, pmd) native_set_pmd(pmdp, pmd)
210
211#ifndef __PAGETABLE_PUD_FOLDED
212#define set_pgd(pgdp, pgd) native_set_pgd(pgdp, pgd)
213#define pgd_clear(pgd) native_pgd_clear(pgd)
214#endif
215
216#ifndef set_pud
217# define set_pud(pudp, pud) native_set_pud(pudp, pud)
218#endif
219
220#ifndef __PAGETABLE_PMD_FOLDED
221#define pud_clear(pud) native_pud_clear(pud)
222#endif
223
224#define pte_clear(mm, addr, ptep) native_pte_clear(mm, addr, ptep)
225#define pmd_clear(pmd) native_pmd_clear(pmd)
226
227#define pte_update(mm, addr, ptep) do { } while (0)
228#define pte_update_defer(mm, addr, ptep) do { } while (0)
229#endif /* CONFIG_PARAVIRT */
230
231#endif /* __ASSEMBLY__ */
232
1#ifdef CONFIG_X86_32 233#ifdef CONFIG_X86_32
2# include "pgtable_32.h" 234# include "pgtable_32.h"
3#else 235#else
4# include "pgtable_64.h" 236# include "pgtable_64.h"
5#endif 237#endif
238
239#ifndef __ASSEMBLY__
240
241enum {
242 PG_LEVEL_NONE,
243 PG_LEVEL_4K,
244 PG_LEVEL_2M,
245 PG_LEVEL_1G,
246};
247
248/*
249 * Helper function that returns the kernel pagetable entry controlling
250 * the virtual address 'address'. NULL means no pagetable entry present.
251 * NOTE: the return type is pte_t but if the pmd is PSE then we return it
252 * as a pte too.
253 */
254extern pte_t *lookup_address(unsigned long address, int *level);
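/*
 * Illustrative sketch (hypothetical helper): the 'level' out-parameter tells
 * the caller whether the returned entry is a normal 4K PTE or a large-page
 * pmd/pud handed back as a pte_t.
 */
static inline int example_mapped_as_2m(unsigned long address)
{
	int level;
	pte_t *pte = lookup_address(address, &level);

	if (!pte || !pte_present(*pte))
		return 0;			/* nothing mapped here */

	return level == PG_LEVEL_2M;		/* PSE pmd, returned as a pte */
}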
255
256/* local pte updates need not use xchg for locking */
257static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
258{
259 pte_t res = *ptep;
260
261 /* Pure native function needs no input for mm, addr */
262 native_pte_clear(NULL, 0, ptep);
263 return res;
264}
265
266static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
267 pte_t *ptep , pte_t pte)
268{
269 native_set_pte(ptep, pte);
270}
271
272#ifndef CONFIG_PARAVIRT
273/*
274 * Rules for using pte_update - it must be called after any PTE update which
275 * has not been done using the set_pte / clear_pte interfaces. It is used by
276 * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE
277 * updates should either be sets, clears, or set_pte_atomic for P->P
278 * transitions, which means this hook should only be called for user PTEs.
279 * This hook implies a P->P protection or access change has taken place, which
280 * requires a subsequent TLB flush. The notification can optionally be delayed
281 * until the TLB flush event by using the pte_update_defer form of the
282 * interface, but care must be taken to assure that the flush happens while
283 * still holding the same page table lock so that the shadow and primary pages
284 * do not become out of sync on SMP.
285 */
286#define pte_update(mm, addr, ptep) do { } while (0)
287#define pte_update_defer(mm, addr, ptep) do { } while (0)
288#endif
289
290/*
291 * We only update the dirty/accessed state if we set
292 * the dirty bit by hand in the kernel, since the hardware
293 * will do the accessed bit for us, and we don't want to
294 * race with other CPU's that might be updating the dirty
295 * bit at the same time.
296 */
297#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
298#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
299({ \
300 int __changed = !pte_same(*(ptep), entry); \
301 if (__changed && dirty) { \
302 *ptep = entry; \
303 pte_update_defer((vma)->vm_mm, (address), (ptep)); \
304 flush_tlb_page(vma, address); \
305 } \
306 __changed; \
307})
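/*
 * Illustrative sketch (hypothetical helper): this is the pattern the generic
 * fault path uses when it only needs to soften an already-present PTE after
 * a write fault.  The page table lock is assumed held; the macro above skips
 * the TLB flush entirely when nothing actually changed.
 */
static inline void example_write_fault_fixup(struct vm_area_struct *vma,
					     unsigned long address, pte_t *ptep)
{
	pte_t entry = pte_mkdirty(pte_mkyoung(*ptep));

	ptep_set_access_flags(vma, address, ptep, entry, 1);
}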
308
309#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
310#define ptep_test_and_clear_young(vma, addr, ptep) ({ \
311 int __ret = 0; \
312 if (pte_young(*(ptep))) \
313 __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \
314 &(ptep)->pte); \
315 if (__ret) \
316 pte_update((vma)->vm_mm, addr, ptep); \
317 __ret; \
318})
319
320#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
321#define ptep_clear_flush_young(vma, address, ptep) \
322({ \
323 int __young; \
324 __young = ptep_test_and_clear_young((vma), (address), (ptep)); \
325 if (__young) \
326 flush_tlb_page(vma, address); \
327 __young; \
328})
329
330#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
331static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
332{
333 pte_t pte = native_ptep_get_and_clear(ptep);
334 pte_update(mm, addr, ptep);
335 return pte;
336}
337
338#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
339static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
340{
341 pte_t pte;
342 if (full) {
343 /*
344 * Full address destruction in progress; paravirt does not
345 * care about updates and native needs no locking
346 */
347 pte = native_local_ptep_get_and_clear(ptep);
348 } else {
349 pte = ptep_get_and_clear(mm, addr, ptep);
350 }
351 return pte;
352}
353
354#define __HAVE_ARCH_PTEP_SET_WRPROTECT
355static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
356{
357 clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte);
358 pte_update(mm, addr, ptep);
359}
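/*
 * Illustrative sketch (hypothetical helper): fork() write-protects the
 * parent's PTEs this way so that the first write from either side takes a
 * copy-on-write fault.  The clear_bit() above is an atomic RMW on the PTE
 * word, so a concurrent hardware dirty-bit update is not lost.
 */
static inline void example_cow_protect(struct mm_struct *src_mm,
				       unsigned long addr, pte_t *src_ptep)
{
	ptep_set_wrprotect(src_mm, addr, src_ptep);
}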
360
361#include <asm-generic/pgtable.h>
362#endif /* __ASSEMBLY__ */
363
364#endif /* _ASM_X86_PGTABLE_H */
diff --git a/include/asm-x86/pgtable_32.h b/include/asm-x86/pgtable_32.h
index ed3e70d8d04b..21e70fbf1dae 100644
--- a/include/asm-x86/pgtable_32.h
+++ b/include/asm-x86/pgtable_32.h
@@ -25,20 +25,11 @@
25struct mm_struct; 25struct mm_struct;
26struct vm_area_struct; 26struct vm_area_struct;
27 27
28/*
29 * ZERO_PAGE is a global shared page that is always zero: used
30 * for zero-mapped memory areas etc..
31 */
32#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
33extern unsigned long empty_zero_page[1024];
34extern pgd_t swapper_pg_dir[1024]; 28extern pgd_t swapper_pg_dir[1024];
35extern struct kmem_cache *pmd_cache; 29extern struct kmem_cache *pmd_cache;
36extern spinlock_t pgd_lock;
37extern struct page *pgd_list;
38void check_pgt_cache(void); 30void check_pgt_cache(void);
39 31
40void pmd_ctor(struct kmem_cache *, void *);
32static inline void pgtable_cache_init(void) {}
41void pgtable_cache_init(void);
42void paging_init(void); 33void paging_init(void);
43 34
44 35
@@ -58,9 +49,6 @@ void paging_init(void);
58#define PGDIR_SIZE (1UL << PGDIR_SHIFT) 49#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
59#define PGDIR_MASK (~(PGDIR_SIZE-1)) 50#define PGDIR_MASK (~(PGDIR_SIZE-1))
60 51
61#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE)
62#define FIRST_USER_ADDRESS 0
63
64#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT) 52#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
65#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS) 53#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)
66 54
@@ -85,113 +73,6 @@ void paging_init(void);
85#endif 73#endif
86 74
87/* 75/*
88 * _PAGE_PSE set in the page directory entry just means that
89 * the page directory entry points directly to a 4MB-aligned block of
90 * memory.
91 */
92#define _PAGE_BIT_PRESENT 0
93#define _PAGE_BIT_RW 1
94#define _PAGE_BIT_USER 2
95#define _PAGE_BIT_PWT 3
96#define _PAGE_BIT_PCD 4
97#define _PAGE_BIT_ACCESSED 5
98#define _PAGE_BIT_DIRTY 6
99#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page, Pentium+, if present.. */
100#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
101#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
102#define _PAGE_BIT_UNUSED2 10
103#define _PAGE_BIT_UNUSED3 11
104#define _PAGE_BIT_NX 63
105
106#define _PAGE_PRESENT 0x001
107#define _PAGE_RW 0x002
108#define _PAGE_USER 0x004
109#define _PAGE_PWT 0x008
110#define _PAGE_PCD 0x010
111#define _PAGE_ACCESSED 0x020
112#define _PAGE_DIRTY 0x040
113#define _PAGE_PSE 0x080 /* 4 MB (or 2MB) page, Pentium+, if present.. */
114#define _PAGE_GLOBAL 0x100 /* Global TLB entry PPro+ */
115#define _PAGE_UNUSED1 0x200 /* available for programmer */
116#define _PAGE_UNUSED2 0x400
117#define _PAGE_UNUSED3 0x800
118
119/* If _PAGE_PRESENT is clear, we use these: */
120#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */
121#define _PAGE_PROTNONE 0x080 /* if the user mapped it with PROT_NONE;
122 pte_present gives true */
123#ifdef CONFIG_X86_PAE
124#define _PAGE_NX (1ULL<<_PAGE_BIT_NX)
125#else
126#define _PAGE_NX 0
127#endif
128
129#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
130#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
131#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)
132
133#define PAGE_NONE \
134 __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
135#define PAGE_SHARED \
136 __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
137
138#define PAGE_SHARED_EXEC \
139 __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
140#define PAGE_COPY_NOEXEC \
141 __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
142#define PAGE_COPY_EXEC \
143 __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
144#define PAGE_COPY \
145 PAGE_COPY_NOEXEC
146#define PAGE_READONLY \
147 __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
148#define PAGE_READONLY_EXEC \
149 __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
150
151#define _PAGE_KERNEL \
152 (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX)
153#define _PAGE_KERNEL_EXEC \
154 (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
155
156extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
157#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
158#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
159#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD)
160#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
161#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
162
163#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
164#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
165#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
166#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
167#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
168#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
169#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
170
171/*
172 * The i386 can't do page protection for execute, and considers that
173 * the same are read. Also, write permissions imply read permissions.
174 * This is the closest we can get..
175 */
176#define __P000 PAGE_NONE
177#define __P001 PAGE_READONLY
178#define __P010 PAGE_COPY
179#define __P011 PAGE_COPY
180#define __P100 PAGE_READONLY_EXEC
181#define __P101 PAGE_READONLY_EXEC
182#define __P110 PAGE_COPY_EXEC
183#define __P111 PAGE_COPY_EXEC
184
185#define __S000 PAGE_NONE
186#define __S001 PAGE_READONLY
187#define __S010 PAGE_SHARED
188#define __S011 PAGE_SHARED
189#define __S100 PAGE_READONLY_EXEC
190#define __S101 PAGE_READONLY_EXEC
191#define __S110 PAGE_SHARED_EXEC
192#define __S111 PAGE_SHARED_EXEC
193
194/*
195 * Define this if things work differently on an i386 and an i486: 76 * Define this if things work differently on an i386 and an i486:
196 * it will (on an i486) warn about kernel memory accesses that are 77 * it will (on an i486) warn about kernel memory accesses that are
197 * done without a 'access_ok(VERIFY_WRITE,..)' 78 * done without a 'access_ok(VERIFY_WRITE,..)'
@@ -211,133 +92,12 @@ extern unsigned long pg0[];
211 92
212#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) 93#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
213 94
214/*
215 * The following only work if pte_present() is true.
216 * Undefined behaviour if not..
217 */
218static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; }
219static inline int pte_young(pte_t pte) { return (pte).pte_low & _PAGE_ACCESSED; }
220static inline int pte_write(pte_t pte) { return (pte).pte_low & _PAGE_RW; }
221static inline int pte_huge(pte_t pte) { return (pte).pte_low & _PAGE_PSE; }
222
223/*
224 * The following only works if pte_present() is not true.
225 */
226static inline int pte_file(pte_t pte) { return (pte).pte_low & _PAGE_FILE; }
227
228static inline pte_t pte_mkclean(pte_t pte) { (pte).pte_low &= ~_PAGE_DIRTY; return pte; }
229static inline pte_t pte_mkold(pte_t pte) { (pte).pte_low &= ~_PAGE_ACCESSED; return pte; }
230static inline pte_t pte_wrprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_RW; return pte; }
231static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; }
232static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; }
233static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; }
234static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PSE; return pte; }
235
236#ifdef CONFIG_X86_PAE 95#ifdef CONFIG_X86_PAE
237# include <asm/pgtable-3level.h> 96# include <asm/pgtable-3level.h>
238#else 97#else
239# include <asm/pgtable-2level.h> 98# include <asm/pgtable-2level.h>
240#endif 99#endif
241 100
242#ifndef CONFIG_PARAVIRT
243/*
244 * Rules for using pte_update - it must be called after any PTE update which
245 * has not been done using the set_pte / clear_pte interfaces. It is used by
246 * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE
247 * updates should either be sets, clears, or set_pte_atomic for P->P
248 * transitions, which means this hook should only be called for user PTEs.
249 * This hook implies a P->P protection or access change has taken place, which
250 * requires a subsequent TLB flush. The notification can optionally be delayed
251 * until the TLB flush event by using the pte_update_defer form of the
252 * interface, but care must be taken to assure that the flush happens while
253 * still holding the same page table lock so that the shadow and primary pages
254 * do not become out of sync on SMP.
255 */
256#define pte_update(mm, addr, ptep) do { } while (0)
257#define pte_update_defer(mm, addr, ptep) do { } while (0)
258#endif
259
260/* local pte updates need not use xchg for locking */
261static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
262{
263 pte_t res = *ptep;
264
265 /* Pure native function needs no input for mm, addr */
266 native_pte_clear(NULL, 0, ptep);
267 return res;
268}
269
270/*
271 * We only update the dirty/accessed state if we set
272 * the dirty bit by hand in the kernel, since the hardware
273 * will do the accessed bit for us, and we don't want to
274 * race with other CPU's that might be updating the dirty
275 * bit at the same time.
276 */
277#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
278#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
279({ \
280 int __changed = !pte_same(*(ptep), entry); \
281 if (__changed && dirty) { \
282 (ptep)->pte_low = (entry).pte_low; \
283 pte_update_defer((vma)->vm_mm, (address), (ptep)); \
284 flush_tlb_page(vma, address); \
285 } \
286 __changed; \
287})
288
289#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
290#define ptep_test_and_clear_young(vma, addr, ptep) ({ \
291 int __ret = 0; \
292 if (pte_young(*(ptep))) \
293 __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \
294 &(ptep)->pte_low); \
295 if (__ret) \
296 pte_update((vma)->vm_mm, addr, ptep); \
297 __ret; \
298})
299
300#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
301#define ptep_clear_flush_young(vma, address, ptep) \
302({ \
303 int __young; \
304 __young = ptep_test_and_clear_young((vma), (address), (ptep)); \
305 if (__young) \
306 flush_tlb_page(vma, address); \
307 __young; \
308})
309
310#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
311static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
312{
313 pte_t pte = native_ptep_get_and_clear(ptep);
314 pte_update(mm, addr, ptep);
315 return pte;
316}
317
318#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
319static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
320{
321 pte_t pte;
322 if (full) {
323 /*
324 * Full address destruction in progress; paravirt does not
325 * care about updates and native needs no locking
326 */
327 pte = native_local_ptep_get_and_clear(ptep);
328 } else {
329 pte = ptep_get_and_clear(mm, addr, ptep);
330 }
331 return pte;
332}
333
334#define __HAVE_ARCH_PTEP_SET_WRPROTECT
335static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
336{
337 clear_bit(_PAGE_BIT_RW, &ptep->pte_low);
338 pte_update(mm, addr, ptep);
339}
340
341/* 101/*
342 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); 102 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
343 * 103 *
@@ -367,25 +127,6 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
367 127
368#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) 128#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
369 129
370static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
371{
372 pte.pte_low &= _PAGE_CHG_MASK;
373 pte.pte_low |= pgprot_val(newprot);
374#ifdef CONFIG_X86_PAE
375 /*
376 * Chop off the NX bit (if present), and add the NX portion of
377 * the newprot (if present):
378 */
379 pte.pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
380 pte.pte_high |= (pgprot_val(newprot) >> 32) & \
381 (__supported_pte_mask >> 32);
382#endif
383 return pte;
384}
385
386#define pmd_large(pmd) \
387((pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT))
388
389/* 130/*
390 * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD] 131 * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
391 * 132 *
@@ -432,26 +173,6 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
432#define pmd_page_vaddr(pmd) \ 173#define pmd_page_vaddr(pmd) \
433 ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK)) 174 ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
434 175
435/*
436 * Helper function that returns the kernel pagetable entry controlling
437 * the virtual address 'address'. NULL means no pagetable entry present.
438 * NOTE: the return type is pte_t but if the pmd is PSE then we return it
439 * as a pte too.
440 */
441extern pte_t *lookup_address(unsigned long address);
442
443/*
444 * Make a given kernel text page executable/non-executable.
445 * Returns the previous executability setting of that page (which
446 * is used to restore the previous state). Used by the SMP bootup code.
447 * NOTE: this is an __init function for security reasons.
448 */
449#ifdef CONFIG_X86_PAE
450 extern int set_kernel_exec(unsigned long vaddr, int enable);
451#else
452 static inline int set_kernel_exec(unsigned long vaddr, int enable) { return 0;}
453#endif
454
455#if defined(CONFIG_HIGHPTE) 176#if defined(CONFIG_HIGHPTE)
456#define pte_offset_map(dir, address) \ 177#define pte_offset_map(dir, address) \
457 ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + pte_index(address)) 178 ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + pte_index(address))
@@ -497,13 +218,17 @@ static inline void paravirt_pagetable_setup_done(pgd_t *base)
497 218
498#endif /* !__ASSEMBLY__ */ 219#endif /* !__ASSEMBLY__ */
499 220
221/*
222 * kern_addr_valid() is (1) for FLATMEM and (0) for
223 * SPARSEMEM and DISCONTIGMEM
224 */
500#ifdef CONFIG_FLATMEM 225#ifdef CONFIG_FLATMEM
501#define kern_addr_valid(addr) (1) 226#define kern_addr_valid(addr) (1)
502#endif /* CONFIG_FLATMEM */
227#else
228#define kern_addr_valid(kaddr) (0)
229#endif
503 230
504#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ 231#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
505 remap_pfn_range(vma, vaddr, pfn, size, prot) 232 remap_pfn_range(vma, vaddr, pfn, size, prot)
506 233
507#include <asm-generic/pgtable.h>
508
509#endif /* _I386_PGTABLE_H */ 234#endif /* _I386_PGTABLE_H */
diff --git a/include/asm-x86/pgtable_64.h b/include/asm-x86/pgtable_64.h
index 9b0ff477b39e..6e615a103c2f 100644
--- a/include/asm-x86/pgtable_64.h
+++ b/include/asm-x86/pgtable_64.h
@@ -17,22 +17,16 @@ extern pud_t level3_kernel_pgt[512];
17extern pud_t level3_ident_pgt[512]; 17extern pud_t level3_ident_pgt[512];
18extern pmd_t level2_kernel_pgt[512]; 18extern pmd_t level2_kernel_pgt[512];
19extern pgd_t init_level4_pgt[]; 19extern pgd_t init_level4_pgt[];
20extern unsigned long __supported_pte_mask;
21 20
22#define swapper_pg_dir init_level4_pgt 21#define swapper_pg_dir init_level4_pgt
23 22
24extern void paging_init(void); 23extern void paging_init(void);
25extern void clear_kernel_mapping(unsigned long addr, unsigned long size); 24extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
26 25
27/*
28 * ZERO_PAGE is a global shared page that is always zero: used
29 * for zero-mapped memory areas etc..
30 */
31extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
32#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
33
34#endif /* !__ASSEMBLY__ */ 26#endif /* !__ASSEMBLY__ */
35 27
28#define SHARED_KERNEL_PMD 1
29
36/* 30/*
37 * PGDIR_SHIFT determines what a top-level page table entry can map 31 * PGDIR_SHIFT determines what a top-level page table entry can map
38 */ 32 */
@@ -71,57 +65,68 @@ extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
71#define pgd_none(x) (!pgd_val(x)) 65#define pgd_none(x) (!pgd_val(x))
72#define pud_none(x) (!pud_val(x)) 66#define pud_none(x) (!pud_val(x))
73 67
74static inline void set_pte(pte_t *dst, pte_t val) 68struct mm_struct;
69
70static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
71 pte_t *ptep)
72{
73 *ptep = native_make_pte(0);
74}
75
76static inline void native_set_pte(pte_t *ptep, pte_t pte)
75{ 77{
76 pte_val(*dst) = pte_val(val); 78 *ptep = pte;
77} 79}
78#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval)
79 80
80static inline void set_pmd(pmd_t *dst, pmd_t val) 81static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
81{ 82{
82 pmd_val(*dst) = pmd_val(val); 83 native_set_pte(ptep, pte);
83} 84}
84 85
85static inline void set_pud(pud_t *dst, pud_t val) 86static inline pte_t native_ptep_get_and_clear(pte_t *xp)
86{ 87{
87 pud_val(*dst) = pud_val(val); 88#ifdef CONFIG_SMP
89 return native_make_pte(xchg(&xp->pte, 0));
90#else
91 /* native_local_ptep_get_and_clear, but duplicated because of cyclic dependency */
92 pte_t ret = *xp;
93 native_pte_clear(NULL, 0, xp);
94 return ret;
95#endif
88} 96}
89 97
90static inline void pud_clear (pud_t *pud) 98static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
91{ 99{
92 set_pud(pud, __pud(0)); 100 *pmdp = pmd;
93} 101}
94 102
95static inline void set_pgd(pgd_t *dst, pgd_t val) 103static inline void native_pmd_clear(pmd_t *pmd)
96{ 104{
97 pgd_val(*dst) = pgd_val(val); 105 native_set_pmd(pmd, native_make_pmd(0));
98} 106}
99 107
100static inline void pgd_clear (pgd_t * pgd) 108static inline void native_set_pud(pud_t *pudp, pud_t pud)
101{ 109{
102 set_pgd(pgd, __pgd(0)); 110 *pudp = pud;
103} 111}
104 112
105#define ptep_get_and_clear(mm,addr,xp) __pte(xchg(&(xp)->pte, 0)) 113static inline void native_pud_clear(pud_t *pud)
114{
115 native_set_pud(pud, native_make_pud(0));
116}
106 117
107struct mm_struct; 118static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
119{
120 *pgdp = pgd;
121}
108 122
109static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full) 123static inline void native_pgd_clear(pgd_t * pgd)
110{ 124{
111 pte_t pte; 125 native_set_pgd(pgd, native_make_pgd(0));
112 if (full) {
113 pte = *ptep;
114 *ptep = __pte(0);
115 } else {
116 pte = ptep_get_and_clear(mm, addr, ptep);
117 }
118 return pte;
119} 126}
120 127
121#define pte_same(a, b) ((a).pte == (b).pte) 128#define pte_same(a, b) ((a).pte == (b).pte)
122 129
123#define pte_pgprot(a) (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK))
124
125#endif /* !__ASSEMBLY__ */ 130#endif /* !__ASSEMBLY__ */
126 131
127#define PMD_SIZE (_AC(1,UL) << PMD_SHIFT) 132#define PMD_SIZE (_AC(1,UL) << PMD_SHIFT)
@@ -131,8 +136,6 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long
131#define PGDIR_SIZE (_AC(1,UL) << PGDIR_SHIFT) 136#define PGDIR_SIZE (_AC(1,UL) << PGDIR_SHIFT)
132#define PGDIR_MASK (~(PGDIR_SIZE-1)) 137#define PGDIR_MASK (~(PGDIR_SIZE-1))
133 138
134#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1)
135#define FIRST_USER_ADDRESS 0
136 139
137#define MAXMEM _AC(0x3fffffffffff, UL) 140#define MAXMEM _AC(0x3fffffffffff, UL)
138#define VMALLOC_START _AC(0xffffc20000000000, UL) 141#define VMALLOC_START _AC(0xffffc20000000000, UL)
@@ -142,91 +145,6 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long
142#define MODULES_END _AC(0xfffffffffff00000, UL) 145#define MODULES_END _AC(0xfffffffffff00000, UL)
143#define MODULES_LEN (MODULES_END - MODULES_VADDR) 146#define MODULES_LEN (MODULES_END - MODULES_VADDR)
144 147
145#define _PAGE_BIT_PRESENT 0
146#define _PAGE_BIT_RW 1
147#define _PAGE_BIT_USER 2
148#define _PAGE_BIT_PWT 3
149#define _PAGE_BIT_PCD 4
150#define _PAGE_BIT_ACCESSED 5
151#define _PAGE_BIT_DIRTY 6
152#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
153#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
154#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
155
156#define _PAGE_PRESENT 0x001
157#define _PAGE_RW 0x002
158#define _PAGE_USER 0x004
159#define _PAGE_PWT 0x008
160#define _PAGE_PCD 0x010
161#define _PAGE_ACCESSED 0x020
162#define _PAGE_DIRTY 0x040
163#define _PAGE_PSE 0x080 /* 2MB page */
164#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */
165#define _PAGE_GLOBAL 0x100 /* Global TLB entry */
166
167#define _PAGE_PROTNONE 0x080 /* If not present */
168#define _PAGE_NX (_AC(1,UL)<<_PAGE_BIT_NX)
169
170#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
171#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
172
173#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)
174
175#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
176#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
177#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
178#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
179#define PAGE_COPY PAGE_COPY_NOEXEC
180#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
181#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
182#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
183#define __PAGE_KERNEL \
184 (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX)
185#define __PAGE_KERNEL_EXEC \
186 (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
187#define __PAGE_KERNEL_NOCACHE \
188 (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED | _PAGE_NX)
189#define __PAGE_KERNEL_RO \
190 (_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX)
191#define __PAGE_KERNEL_VSYSCALL \
192 (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
193#define __PAGE_KERNEL_VSYSCALL_NOCACHE \
194 (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_PCD)
195#define __PAGE_KERNEL_LARGE \
196 (__PAGE_KERNEL | _PAGE_PSE)
197#define __PAGE_KERNEL_LARGE_EXEC \
198 (__PAGE_KERNEL_EXEC | _PAGE_PSE)
199
200#define MAKE_GLOBAL(x) __pgprot((x) | _PAGE_GLOBAL)
201
202#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
203#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
204#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
205#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
206#define PAGE_KERNEL_VSYSCALL32 __pgprot(__PAGE_KERNEL_VSYSCALL)
207#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
208#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
209#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
210
211/* xwr */
212#define __P000 PAGE_NONE
213#define __P001 PAGE_READONLY
214#define __P010 PAGE_COPY
215#define __P011 PAGE_COPY
216#define __P100 PAGE_READONLY_EXEC
217#define __P101 PAGE_READONLY_EXEC
218#define __P110 PAGE_COPY_EXEC
219#define __P111 PAGE_COPY_EXEC
220
221#define __S000 PAGE_NONE
222#define __S001 PAGE_READONLY
223#define __S010 PAGE_SHARED
224#define __S011 PAGE_SHARED
225#define __S100 PAGE_READONLY_EXEC
226#define __S101 PAGE_READONLY_EXEC
227#define __S110 PAGE_SHARED_EXEC
228#define __S111 PAGE_SHARED_EXEC
229
230#ifndef __ASSEMBLY__ 148#ifndef __ASSEMBLY__
231 149
232static inline unsigned long pgd_bad(pgd_t pgd) 150static inline unsigned long pgd_bad(pgd_t pgd)
@@ -246,66 +164,16 @@ static inline unsigned long pmd_bad(pmd_t pmd)
246 164
247#define pte_none(x) (!pte_val(x)) 165#define pte_none(x) (!pte_val(x))
248#define pte_present(x) (pte_val(x) & (_PAGE_PRESENT | _PAGE_PROTNONE)) 166#define pte_present(x) (pte_val(x) & (_PAGE_PRESENT | _PAGE_PROTNONE))
249#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
250 167
251#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) /* FIXME: is this
252 right? */
168#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) /* FIXME: is this right? */
253#define pte_page(x) pfn_to_page(pte_pfn(x)) 169#define pte_page(x) pfn_to_page(pte_pfn(x))
254#define pte_pfn(x) ((pte_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT) 170#define pte_pfn(x) ((pte_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
255 171
256static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
257{
258 pte_t pte;
259 pte_val(pte) = (page_nr << PAGE_SHIFT);
260 pte_val(pte) |= pgprot_val(pgprot);
261 pte_val(pte) &= __supported_pte_mask;
262 return pte;
263}
264
265/*
266 * The following only work if pte_present() is true.
267 * Undefined behaviour if not..
268 */
269#define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT)
270static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; }
271static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; }
272static inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_RW; }
273static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; }
274static inline int pte_huge(pte_t pte) { return pte_val(pte) & _PAGE_PSE; }
275
276static inline pte_t pte_mkclean(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_DIRTY)); return pte; }
277static inline pte_t pte_mkold(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_ACCESSED)); return pte; }
278static inline pte_t pte_wrprotect(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_RW)); return pte; }
279static inline pte_t pte_mkexec(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_NX)); return pte; }
280static inline pte_t pte_mkdirty(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_DIRTY)); return pte; }
281static inline pte_t pte_mkyoung(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_ACCESSED)); return pte; }
282static inline pte_t pte_mkwrite(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_RW)); return pte; }
283static inline pte_t pte_mkhuge(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_PSE)); return pte; }
284static inline pte_t pte_clrhuge(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_PSE)); return pte; }
285
286struct vm_area_struct;
287
288static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
289{
290 if (!pte_young(*ptep))
291 return 0;
292 return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte);
293}
294
295static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
296{
297 clear_bit(_PAGE_BIT_RW, &ptep->pte);
298}
299
300/* 172/*
301 * Macro to mark a page protection value as "uncacheable". 173 * Macro to mark a page protection value as "uncacheable".
302 */ 174 */
303#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) 175#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT))
304 176
305static inline int pmd_large(pmd_t pte) {
306 return (pmd_val(pte) & __LARGE_PTE) == __LARGE_PTE;
307}
308
309 177
310/* 178/*
311 * Conversion functions: convert a page and protection to a page entry, 179 * Conversion functions: convert a page and protection to a page entry,
@@ -340,29 +208,18 @@ static inline int pmd_large(pmd_t pte) {
340 pmd_index(address)) 208 pmd_index(address))
341#define pmd_none(x) (!pmd_val(x)) 209#define pmd_none(x) (!pmd_val(x))
342#define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) 210#define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT)
343#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
344#define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot))) 211#define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
345#define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT) 212#define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
346 213
347#define pte_to_pgoff(pte) ((pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT) 214#define pte_to_pgoff(pte) ((pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
348#define pgoff_to_pte(off) ((pte_t) { ((off) << PAGE_SHIFT) | _PAGE_FILE }) 215#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | _PAGE_FILE })
349#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT 216#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
350 217
351/* PTE - Level 1 access. */ 218/* PTE - Level 1 access. */
352 219
353/* page, protection -> pte */ 220/* page, protection -> pte */
354#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) 221#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
355#define mk_pte_huge(entry) (pte_val(entry) |= _PAGE_PRESENT | _PAGE_PSE)
356 222
357/* Change flags of a PTE */
358static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
359{
360 pte_val(pte) &= _PAGE_CHG_MASK;
361 pte_val(pte) |= pgprot_val(newprot);
362 pte_val(pte) &= __supported_pte_mask;
363 return pte;
364}
365
366#define pte_index(address) \ 223#define pte_index(address) \
367 (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) 224 (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
368#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \ 225#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \
@@ -376,40 +233,20 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
376 233
377#define update_mmu_cache(vma,address,pte) do { } while (0) 234#define update_mmu_cache(vma,address,pte) do { } while (0)
378 235
379/* We only update the dirty/accessed state if we set
380 * the dirty bit by hand in the kernel, since the hardware
381 * will do the accessed bit for us, and we don't want to
382 * race with other CPU's that might be updating the dirty
383 * bit at the same time. */
384#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
385#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
386({ \
387 int __changed = !pte_same(*(__ptep), __entry); \
388 if (__changed && __dirty) { \
389 set_pte(__ptep, __entry); \
390 flush_tlb_page(__vma, __address); \
391 } \
392 __changed; \
393})
394
395/* Encode and de-code a swap entry */ 236/* Encode and de-code a swap entry */
396#define __swp_type(x) (((x).val >> 1) & 0x3f) 237#define __swp_type(x) (((x).val >> 1) & 0x3f)
397#define __swp_offset(x) ((x).val >> 8) 238#define __swp_offset(x) ((x).val >> 8)
398#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) }) 239#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
399#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) 240#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
400#define __swp_entry_to_pte(x) ((pte_t) { (x).val }) 241#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
401
402extern spinlock_t pgd_lock;
403extern struct list_head pgd_list;
404 242
405extern int kern_addr_valid(unsigned long addr); 243extern int kern_addr_valid(unsigned long addr);
406 244
407pte_t *lookup_address(unsigned long addr);
408
409#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ 245#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
410 remap_pfn_range(vma, vaddr, pfn, size, prot) 246 remap_pfn_range(vma, vaddr, pfn, size, prot)
411 247
412#define HAVE_ARCH_UNMAPPED_AREA 248#define HAVE_ARCH_UNMAPPED_AREA
249#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
413 250
414#define pgtable_cache_init() do { } while (0) 251#define pgtable_cache_init() do { } while (0)
415#define check_pgt_cache() do { } while (0) 252#define check_pgt_cache() do { } while (0)
@@ -422,12 +259,7 @@ pte_t *lookup_address(unsigned long addr);
422#define kc_offset_to_vaddr(o) \ 259#define kc_offset_to_vaddr(o) \
423 (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o)) 260 (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
424 261
425#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
426#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
427#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
428#define __HAVE_ARCH_PTEP_SET_WRPROTECT
429#define __HAVE_ARCH_PTE_SAME 262#define __HAVE_ARCH_PTE_SAME
430#include <asm-generic/pgtable.h>
431#endif /* !__ASSEMBLY__ */ 263#endif /* !__ASSEMBLY__ */
432 264
433#endif /* _X86_64_PGTABLE_H */ 265#endif /* _X86_64_PGTABLE_H */
diff --git a/include/asm-x86/processor.h b/include/asm-x86/processor.h
index 46e1c04e309c..ab4d0c2a3f8f 100644
--- a/include/asm-x86/processor.h
+++ b/include/asm-x86/processor.h
@@ -1,5 +1,842 @@
1#ifndef __ASM_X86_PROCESSOR_H
2#define __ASM_X86_PROCESSOR_H
3
4#include <asm/processor-flags.h>
5
6/* migration helpers, for KVM - will be removed in 2.6.25: */
7#include <asm/vm86.h>
8#define Xgt_desc_struct desc_ptr
9
10/* Forward declaration, a strange C thing */
11struct task_struct;
12struct mm_struct;
13
14#include <asm/vm86.h>
15#include <asm/math_emu.h>
16#include <asm/segment.h>
17#include <asm/types.h>
18#include <asm/sigcontext.h>
19#include <asm/current.h>
20#include <asm/cpufeature.h>
21#include <asm/system.h>
22#include <asm/page.h>
23#include <asm/percpu.h>
24#include <asm/msr.h>
25#include <asm/desc_defs.h>
26#include <asm/nops.h>
27#include <linux/personality.h>
28#include <linux/cpumask.h>
29#include <linux/cache.h>
30#include <linux/threads.h>
31#include <linux/init.h>
32
33/*
34 * Default implementation of macro that returns current
35 * instruction pointer ("program counter").
36 */
37static inline void *current_text_addr(void)
38{
39 void *pc;
40 asm volatile("mov $1f,%0\n1:":"=r" (pc));
41 return pc;
42}
43
44#ifdef CONFIG_X86_VSMP
45#define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
46#define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
47#else
48#define ARCH_MIN_TASKALIGN 16
49#define ARCH_MIN_MMSTRUCT_ALIGN 0
50#endif
51
52/*
53 * CPU type and hardware bug flags. Kept separately for each CPU.
54 * Members of this structure are referenced in head.S, so think twice
55 * before touching them. [mj]
56 */
57
58struct cpuinfo_x86 {
59 __u8 x86; /* CPU family */
60 __u8 x86_vendor; /* CPU vendor */
61 __u8 x86_model;
62 __u8 x86_mask;
63#ifdef CONFIG_X86_32
64 char wp_works_ok; /* It doesn't on 386's */
65 char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */
66 char hard_math;
67 char rfu;
68 char fdiv_bug;
69 char f00f_bug;
70 char coma_bug;
71 char pad0;
72#else
73 /* number of 4K pages in DTLB/ITLB combined(in pages)*/
74 int x86_tlbsize;
75 __u8 x86_virt_bits, x86_phys_bits;
76 /* cpuid returned core id bits */
77 __u8 x86_coreid_bits;
78 /* Max extended CPUID function supported */
79 __u32 extended_cpuid_level;
80#endif
81 int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
82 __u32 x86_capability[NCAPINTS];
83 char x86_vendor_id[16];
84 char x86_model_id[64];
85 int x86_cache_size; /* in KB - valid for CPUS which support this
86 call */
87 int x86_cache_alignment; /* In bytes */
88 int x86_power;
89 unsigned long loops_per_jiffy;
90#ifdef CONFIG_SMP
91 cpumask_t llc_shared_map; /* cpus sharing the last level cache */
92#endif
93 u16 x86_max_cores; /* cpuid returned max cores value */
94 u16 apicid;
95 u16 x86_clflush_size;
96#ifdef CONFIG_SMP
97 u16 booted_cores; /* number of cores as seen by OS */
98 u16 phys_proc_id; /* Physical processor id. */
99 u16 cpu_core_id; /* Core id */
100 u16 cpu_index; /* index into per_cpu list */
101#endif
102} __attribute__((__aligned__(SMP_CACHE_BYTES)));
103
104#define X86_VENDOR_INTEL 0
105#define X86_VENDOR_CYRIX 1
106#define X86_VENDOR_AMD 2
107#define X86_VENDOR_UMC 3
108#define X86_VENDOR_NEXGEN 4
109#define X86_VENDOR_CENTAUR 5
110#define X86_VENDOR_TRANSMETA 7
111#define X86_VENDOR_NSC 8
112#define X86_VENDOR_NUM 9
113#define X86_VENDOR_UNKNOWN 0xff
114
115/*
116 * capabilities of CPUs
117 */
118extern struct cpuinfo_x86 boot_cpu_data;
119extern struct cpuinfo_x86 new_cpu_data;
120extern struct tss_struct doublefault_tss;
121extern __u32 cleared_cpu_caps[NCAPINTS];
122
123#ifdef CONFIG_SMP
124DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
125#define cpu_data(cpu) per_cpu(cpu_info, cpu)
126#define current_cpu_data cpu_data(smp_processor_id())
127#else
128#define cpu_data(cpu) boot_cpu_data
129#define current_cpu_data boot_cpu_data
130#endif
131
132void cpu_detect(struct cpuinfo_x86 *c);
133
134extern void identify_cpu(struct cpuinfo_x86 *);
135extern void identify_boot_cpu(void);
136extern void identify_secondary_cpu(struct cpuinfo_x86 *);
137extern void print_cpu_info(struct cpuinfo_x86 *);
138extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
139extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
140extern unsigned short num_cache_leaves;
141
142#if defined(CONFIG_X86_HT) || defined(CONFIG_X86_64)
143extern void detect_ht(struct cpuinfo_x86 *c);
144#else
145static inline void detect_ht(struct cpuinfo_x86 *c) {}
146#endif
147
148static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
149 unsigned int *ecx, unsigned int *edx)
150{
151 /* ecx is often an input as well as an output. */
152 __asm__("cpuid"
153 : "=a" (*eax),
154 "=b" (*ebx),
155 "=c" (*ecx),
156 "=d" (*edx)
157 : "0" (*eax), "2" (*ecx));
158}
159
160static inline void load_cr3(pgd_t *pgdir)
161{
162 write_cr3(__pa(pgdir));
163}
164
165#ifdef CONFIG_X86_32
166/* This is the TSS defined by the hardware. */
167struct x86_hw_tss {
168 unsigned short back_link, __blh;
169 unsigned long sp0;
170 unsigned short ss0, __ss0h;
171 unsigned long sp1;
172 unsigned short ss1, __ss1h; /* ss1 caches MSR_IA32_SYSENTER_CS */
173 unsigned long sp2;
174 unsigned short ss2, __ss2h;
175 unsigned long __cr3;
176 unsigned long ip;
177 unsigned long flags;
178 unsigned long ax, cx, dx, bx;
179 unsigned long sp, bp, si, di;
180 unsigned short es, __esh;
181 unsigned short cs, __csh;
182 unsigned short ss, __ssh;
183 unsigned short ds, __dsh;
184 unsigned short fs, __fsh;
185 unsigned short gs, __gsh;
186 unsigned short ldt, __ldth;
187 unsigned short trace, io_bitmap_base;
188} __attribute__((packed));
189#else
190struct x86_hw_tss {
191 u32 reserved1;
192 u64 sp0;
193 u64 sp1;
194 u64 sp2;
195 u64 reserved2;
196 u64 ist[7];
197 u32 reserved3;
198 u32 reserved4;
199 u16 reserved5;
200 u16 io_bitmap_base;
201} __attribute__((packed)) ____cacheline_aligned;
202#endif
203
204/*
205 * Size of io_bitmap.
206 */
207#define IO_BITMAP_BITS 65536
208#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
209#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
210#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
211#define INVALID_IO_BITMAP_OFFSET 0x8000
212#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
213
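/*
 * Worked out (sketch): IO_BITMAP_BITS = 65536 ports, one bit per port, gives
 * IO_BITMAP_BYTES = 65536/8 = 8192 bytes, i.e. 2048 longs on 32-bit or 1024
 * longs on 64-bit.  The INVALID_IO_BITMAP_OFFSET values point past the TSS
 * limit so the CPU raises #GP instead of consulting a bitmap.  Hypothetical
 * build-time check, assuming BUILD_BUG_ON() from <linux/kernel.h>:
 */
static inline void example_check_io_bitmap_layout(void)
{
	BUILD_BUG_ON(IO_BITMAP_BYTES != 65536 / 8);
	BUILD_BUG_ON(IO_BITMAP_LONGS != IO_BITMAP_BYTES / sizeof(long));
}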
214struct tss_struct {
215 struct x86_hw_tss x86_tss;
216
217 /*
218 * The extra 1 is there because the CPU will access an
219 * additional byte beyond the end of the IO permission
220 * bitmap. The extra byte must be all 1 bits, and must
221 * be within the limit.
222 */
223 unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
224 /*
225 * Cache the current maximum and the last task that used the bitmap:
226 */
227 unsigned long io_bitmap_max;
228 struct thread_struct *io_bitmap_owner;
229 /*
230 * pads the TSS to be cacheline-aligned (size is 0x100)
231 */
232 unsigned long __cacheline_filler[35];
233 /*
234 * .. and then another 0x100 bytes for emergency kernel stack
235 */
236 unsigned long stack[64];
237} __attribute__((packed));
238
239DECLARE_PER_CPU(struct tss_struct, init_tss);
240
241/* Save the original ist values for checking stack pointers during debugging */
242struct orig_ist {
243 unsigned long ist[7];
244};
245
246#define MXCSR_DEFAULT 0x1f80
247
248struct i387_fsave_struct {
249 u32 cwd;
250 u32 swd;
251 u32 twd;
252 u32 fip;
253 u32 fcs;
254 u32 foo;
255 u32 fos;
256 u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
257 u32 status; /* software status information */
258};
259
260struct i387_fxsave_struct {
261 u16 cwd;
262 u16 swd;
263 u16 twd;
264 u16 fop;
265 union {
266 struct {
267 u64 rip;
268 u64 rdp;
269 };
270 struct {
271 u32 fip;
272 u32 fcs;
273 u32 foo;
274 u32 fos;
275 };
276 };
277 u32 mxcsr;
278 u32 mxcsr_mask;
279 u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
280 u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
281 u32 padding[24];
282} __attribute__((aligned(16)));
283
284struct i387_soft_struct {
285 u32 cwd;
286 u32 swd;
287 u32 twd;
288 u32 fip;
289 u32 fcs;
290 u32 foo;
291 u32 fos;
292 u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
293 u8 ftop, changed, lookahead, no_update, rm, alimit;
294 struct info *info;
295 u32 entry_eip;
296};
297
298union i387_union {
299 struct i387_fsave_struct fsave;
300 struct i387_fxsave_struct fxsave;
301 struct i387_soft_struct soft;
302};
303
304#ifdef CONFIG_X86_32
305/*
306 * the following now lives in the per cpu area:
307 * extern int cpu_llc_id[NR_CPUS];
308 */
309DECLARE_PER_CPU(u8, cpu_llc_id);
310#else
311DECLARE_PER_CPU(struct orig_ist, orig_ist);
312#endif
313
314extern void print_cpu_info(struct cpuinfo_x86 *);
315extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
316extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
317extern unsigned short num_cache_leaves;
318
319struct thread_struct {
320/* cached TLS descriptors. */
321 struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
322 unsigned long sp0;
323 unsigned long sp;
324#ifdef CONFIG_X86_32
325 unsigned long sysenter_cs;
326#else
327 unsigned long usersp; /* Copy from PDA */
328 unsigned short es, ds, fsindex, gsindex;
329#endif
330 unsigned long ip;
331 unsigned long fs;
332 unsigned long gs;
333/* Hardware debugging registers */
334 unsigned long debugreg0;
335 unsigned long debugreg1;
336 unsigned long debugreg2;
337 unsigned long debugreg3;
338 unsigned long debugreg6;
339 unsigned long debugreg7;
340/* fault info */
341 unsigned long cr2, trap_no, error_code;
342/* floating point info */
343 union i387_union i387 __attribute__((aligned(16)));
344#ifdef CONFIG_X86_32
345/* virtual 86 mode info */
346 struct vm86_struct __user *vm86_info;
347 unsigned long screen_bitmap;
348 unsigned long v86flags, v86mask, saved_sp0;
349 unsigned int saved_fs, saved_gs;
350#endif
351/* IO permissions */
352 unsigned long *io_bitmap_ptr;
353 unsigned long iopl;
354/* max allowed port in the bitmap, in bytes: */
355 unsigned io_bitmap_max;
356/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */
357 unsigned long debugctlmsr;
358/* Debug Store - if not 0 points to a DS Save Area configuration;
359 * goes into MSR_IA32_DS_AREA */
360 unsigned long ds_area_msr;
361};
362
363static inline unsigned long native_get_debugreg(int regno)
364{
365 unsigned long val = 0; /* Damn you, gcc! */
366
367 switch (regno) {
368 case 0:
369 asm("mov %%db0, %0" :"=r" (val)); break;
370 case 1:
371 asm("mov %%db1, %0" :"=r" (val)); break;
372 case 2:
373 asm("mov %%db2, %0" :"=r" (val)); break;
374 case 3:
375 asm("mov %%db3, %0" :"=r" (val)); break;
376 case 6:
377 asm("mov %%db6, %0" :"=r" (val)); break;
378 case 7:
379 asm("mov %%db7, %0" :"=r" (val)); break;
380 default:
381 BUG();
382 }
383 return val;
384}
385
386static inline void native_set_debugreg(int regno, unsigned long value)
387{
388 switch (regno) {
389 case 0:
390 asm("mov %0,%%db0" : /* no output */ :"r" (value));
391 break;
392 case 1:
393 asm("mov %0,%%db1" : /* no output */ :"r" (value));
394 break;
395 case 2:
396 asm("mov %0,%%db2" : /* no output */ :"r" (value));
397 break;
398 case 3:
399 asm("mov %0,%%db3" : /* no output */ :"r" (value));
400 break;
401 case 6:
402 asm("mov %0,%%db6" : /* no output */ :"r" (value));
403 break;
404 case 7:
405 asm("mov %0,%%db7" : /* no output */ :"r" (value));
406 break;
407 default:
408 BUG();
409 }
410}
411
412/*
413 * Set IOPL bits in EFLAGS from given mask
414 */
415static inline void native_set_iopl_mask(unsigned mask)
416{
417#ifdef CONFIG_X86_32
418 unsigned int reg;
419 __asm__ __volatile__ ("pushfl;"
420 "popl %0;"
421 "andl %1, %0;"
422 "orl %2, %0;"
423 "pushl %0;"
424 "popfl"
425 : "=&r" (reg)
426 : "i" (~X86_EFLAGS_IOPL), "r" (mask));
427#endif
428}
429
430static inline void native_load_sp0(struct tss_struct *tss,
431 struct thread_struct *thread)
432{
433 tss->x86_tss.sp0 = thread->sp0;
434#ifdef CONFIG_X86_32
435 /* Only happens when SEP is enabled, no need to test "SEP"arately */
436 if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
437 tss->x86_tss.ss1 = thread->sysenter_cs;
438 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
439 }
440#endif
441}
442
443static inline void native_swapgs(void)
444{
445#ifdef CONFIG_X86_64
446 asm volatile("swapgs" ::: "memory");
447#endif
448}
449
450#ifdef CONFIG_PARAVIRT
451#include <asm/paravirt.h>
452#else
453#define __cpuid native_cpuid
454#define paravirt_enabled() 0
455
456/*
457 * These special macros can be used to get or set a debugging register
458 */
459#define get_debugreg(var, register) \
460 (var) = native_get_debugreg(register)
461#define set_debugreg(value, register) \
462 native_set_debugreg(register, value)
463
464static inline void load_sp0(struct tss_struct *tss,
465 struct thread_struct *thread)
466{
467 native_load_sp0(tss, thread);
468}
469
470#define set_iopl_mask native_set_iopl_mask
471#define SWAPGS swapgs
472#endif /* CONFIG_PARAVIRT */
473
474/*
475 * Save the cr4 feature set we're using (ie
476 * Pentium 4MB enable and PPro Global page
477 * enable), so that any CPU's that boot up
478 * after us can get the correct flags.
479 */
480extern unsigned long mmu_cr4_features;
481
482static inline void set_in_cr4(unsigned long mask)
483{
484 unsigned cr4;
485 mmu_cr4_features |= mask;
486 cr4 = read_cr4();
487 cr4 |= mask;
488 write_cr4(cr4);
489}
490
491static inline void clear_in_cr4(unsigned long mask)
492{
493 unsigned cr4;
494 mmu_cr4_features &= ~mask;
495 cr4 = read_cr4();
496 cr4 &= ~mask;
497 write_cr4(cr4);
498}
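/*
 * Illustrative sketch (hypothetical helper): set_in_cr4()/clear_in_cr4()
 * record the bit in mmu_cr4_features as well as in the live register, so
 * CPUs brought up later inherit the same CR4 setup.  Assumes X86_CR4_PGE
 * from <asm/processor-flags.h> and the cpu_has() test from <asm/cpufeature.h>.
 */
static inline void example_enable_global_pages(struct cpuinfo_x86 *c)
{
	if (cpu_has(c, X86_FEATURE_PGE))
		set_in_cr4(X86_CR4_PGE);
}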
499
500struct microcode_header {
501 unsigned int hdrver;
502 unsigned int rev;
503 unsigned int date;
504 unsigned int sig;
505 unsigned int cksum;
506 unsigned int ldrver;
507 unsigned int pf;
508 unsigned int datasize;
509 unsigned int totalsize;
510 unsigned int reserved[3];
511};
512
513struct microcode {
514 struct microcode_header hdr;
515 unsigned int bits[0];
516};
517
518typedef struct microcode microcode_t;
519typedef struct microcode_header microcode_header_t;
520
521/* microcode format is extended from prescott processors */
522struct extended_signature {
523 unsigned int sig;
524 unsigned int pf;
525 unsigned int cksum;
526};
527
528struct extended_sigtable {
529 unsigned int count;
530 unsigned int cksum;
531 unsigned int reserved[3];
532 struct extended_signature sigs[0];
533};
534
535typedef struct {
536 unsigned long seg;
537} mm_segment_t;
538
539
540/*
541 * create a kernel thread without removing it from tasklists
542 */
543extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
544
545/* Free all resources held by a thread. */
546extern void release_thread(struct task_struct *);
547
548/* Prepare to copy thread state - unlazy all lazy status */
549extern void prepare_to_copy(struct task_struct *tsk);
550
551unsigned long get_wchan(struct task_struct *p);
552
553/*
554 * Generic CPUID function
555 * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
556 * resulting in stale register contents being returned.
557 */
558static inline void cpuid(unsigned int op,
559 unsigned int *eax, unsigned int *ebx,
560 unsigned int *ecx, unsigned int *edx)
561{
562 *eax = op;
563 *ecx = 0;
564 __cpuid(eax, ebx, ecx, edx);
565}
566
567/* Some CPUID calls want 'count' to be placed in ecx */
568static inline void cpuid_count(unsigned int op, int count,
569 unsigned int *eax, unsigned int *ebx,
570 unsigned int *ecx, unsigned int *edx)
571{
572 *eax = op;
573 *ecx = count;
574 __cpuid(eax, ebx, ecx, edx);
575}
576
577/*
578 * CPUID functions returning a single datum
579 */
580static inline unsigned int cpuid_eax(unsigned int op)
581{
582 unsigned int eax, ebx, ecx, edx;
583
584 cpuid(op, &eax, &ebx, &ecx, &edx);
585 return eax;
586}
587static inline unsigned int cpuid_ebx(unsigned int op)
588{
589 unsigned int eax, ebx, ecx, edx;
590
591 cpuid(op, &eax, &ebx, &ecx, &edx);
592 return ebx;
593}
594static inline unsigned int cpuid_ecx(unsigned int op)
595{
596 unsigned int eax, ebx, ecx, edx;
597
598 cpuid(op, &eax, &ebx, &ecx, &edx);
599 return ecx;
600}
601static inline unsigned int cpuid_edx(unsigned int op)
602{
603 unsigned int eax, ebx, ecx, edx;
604
605 cpuid(op, &eax, &ebx, &ecx, &edx);
606 return edx;
607}
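/*
 * Illustrative sketch (hypothetical helper): cpuid() pre-clears %ecx, so
 * leaf 0 can be queried safely; it returns the 12-byte vendor string in
 * EBX, EDX, ECX, in that order.  Assumes memcpy() from <linux/string.h>.
 */
static inline void example_read_vendor(char vendor[13])
{
	unsigned int eax, ebx, ecx, edx;

	cpuid(0, &eax, &ebx, &ecx, &edx);
	memcpy(vendor + 0, &ebx, 4);
	memcpy(vendor + 4, &edx, 4);
	memcpy(vendor + 8, &ecx, 4);
	vendor[12] = '\0';
}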
608
609/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
610static inline void rep_nop(void)
611{
612 __asm__ __volatile__("rep;nop": : :"memory");
613}
614
615/* Stop speculative execution */
616static inline void sync_core(void)
617{
618 int tmp;
619 asm volatile("cpuid" : "=a" (tmp) : "0" (1)
620 : "ebx", "ecx", "edx", "memory");
621}
622
623#define cpu_relax() rep_nop()
624
625static inline void __monitor(const void *eax, unsigned long ecx,
626 unsigned long edx)
627{
628 /* "monitor %eax,%ecx,%edx;" */
629 asm volatile(
630 ".byte 0x0f,0x01,0xc8;"
631 : :"a" (eax), "c" (ecx), "d"(edx));
632}
633
634static inline void __mwait(unsigned long eax, unsigned long ecx)
635{
636 /* "mwait %eax,%ecx;" */
637 asm volatile(
638 ".byte 0x0f,0x01,0xc9;"
639 : :"a" (eax), "c" (ecx));
640}
641
642static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
643{
644 /* "mwait %eax,%ecx;" */
645 asm volatile(
646 "sti; .byte 0x0f,0x01,0xc9;"
647 : :"a" (eax), "c" (ecx));
648}
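/*
 * Illustrative sketch (hypothetical helper): __monitor() arms the monitored
 * cacheline and __mwait() then idles until that line is written or an
 * interrupt arrives.  Checking need_resched() after arming catches a wakeup
 * that happened before the monitor was set up, while later writes abort the
 * mwait itself.  Assumes current_thread_info(), smp_mb() and need_resched()
 * from the usual kernel headers.
 */
static inline void example_mwait_idle(void)
{
	__monitor(&current_thread_info()->flags, 0, 0);
	smp_mb();
	if (!need_resched())
		__mwait(0, 0);
}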
649
650extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
651
652extern int force_mwait;
653
654extern void select_idle_routine(const struct cpuinfo_x86 *c);
655
656extern unsigned long boot_option_idle_override;
657
658extern void enable_sep_cpu(void);
659extern int sysenter_setup(void);
660
661/* Defined in head.S */
662extern struct desc_ptr early_gdt_descr;
663
664extern void cpu_set_gdt(int);
665extern void switch_to_new_gdt(void);
666extern void cpu_init(void);
667extern void init_gdt(int cpu);
668
669/* from system description table in BIOS. Mostly for MCA use, but
670 * others may find it useful. */
671extern unsigned int machine_id;
672extern unsigned int machine_submodel_id;
673extern unsigned int BIOS_revision;
674extern unsigned int mca_pentium_flag;
675
676/* Boot loader type from the setup header */
677extern int bootloader_type;
678
679extern char ignore_fpu_irq;
680#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
681
682#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
683#define ARCH_HAS_PREFETCHW
684#define ARCH_HAS_SPINLOCK_PREFETCH
685
686#ifdef CONFIG_X86_32
687#define BASE_PREFETCH ASM_NOP4
688#define ARCH_HAS_PREFETCH
689#else
690#define BASE_PREFETCH "prefetcht0 (%1)"
691#endif
692
693/* Prefetch instructions for Pentium III and AMD Athlon */
694/* It's not worth caring about 3dnow! prefetches for the K6,
695 because they are microcoded there and very slow.
696 However, we currently don't do prefetches for pre-XP Athlons;
697 that should be fixed. */
698static inline void prefetch(const void *x)
699{
700 alternative_input(BASE_PREFETCH,
701 "prefetchnta (%1)",
702 X86_FEATURE_XMM,
703 "r" (x));
704}
705
706/* 3dnow! prefetch to get an exclusive cache line. Useful for
707 spinlocks to avoid one state transition in the cache coherency protocol. */
708static inline void prefetchw(const void *x)
709{
710 alternative_input(BASE_PREFETCH,
711 "prefetchw (%1)",
712 X86_FEATURE_3DNOW,
713 "r" (x));
714}
715
716#define spin_lock_prefetch(x) prefetchw(x)
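prefetch() and prefetchw() are typically issued one step ahead of a pointer chase so the next node is already in cache when it is dereferenced. A hedged example of that idiom (illustrative only; struct item is made up):

	/* Illustrative: prefetch the next node while processing the current one. */
	struct item { struct list_head list; int payload; };

	static int sum_items(struct list_head *head)
	{
		struct item *it;
		int sum = 0;

		list_for_each_entry(it, head, list) {
			prefetch(it->list.next);	/* read prefetch of the next node */
			sum += it->payload;
		}
		return sum;
	}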
1#ifdef CONFIG_X86_32 717#ifdef CONFIG_X86_32
2# include "processor_32.h" 718/*
719 * User space process size: 3GB (default).
720 */
721#define TASK_SIZE (PAGE_OFFSET)
722
723#define INIT_THREAD { \
724 .sp0 = sizeof(init_stack) + (long)&init_stack, \
725 .vm86_info = NULL, \
726 .sysenter_cs = __KERNEL_CS, \
727 .io_bitmap_ptr = NULL, \
728 .fs = __KERNEL_PERCPU, \
729}
730
731/*
732 * Note that the .io_bitmap member must be extra-big. This is because
733 * the CPU will access an additional byte beyond the end of the IO
734 * permission bitmap. The extra byte must be all 1 bits, and must
735 * be within the limit.
736 */
737#define INIT_TSS { \
738 .x86_tss = { \
739 .sp0 = sizeof(init_stack) + (long)&init_stack, \
740 .ss0 = __KERNEL_DS, \
741 .ss1 = __KERNEL_CS, \
742 .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
743 }, \
744 .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
745}
746
747#define start_thread(regs, new_eip, new_esp) do { \
748 __asm__("movl %0,%%gs": :"r" (0)); \
749 regs->fs = 0; \
750 set_fs(USER_DS); \
751 regs->ds = __USER_DS; \
752 regs->es = __USER_DS; \
753 regs->ss = __USER_DS; \
754 regs->cs = __USER_CS; \
755 regs->ip = new_eip; \
756 regs->sp = new_esp; \
757} while (0)
758
759
760extern unsigned long thread_saved_pc(struct task_struct *tsk);
761
762#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
763#define KSTK_TOP(info) \
764({ \
765 unsigned long *__ptr = (unsigned long *)(info); \
766 (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
767})
768
769/*
770 * The -8 below reserves 8 bytes at the top of the ring-0 stack.
771 * This is necessary to guarantee that the entire "struct pt_regs"
772 * is accessible even if the CPU hasn't stored the SS/ESP registers
773 * on the stack (an interrupt gate does not save these registers
774 * when switching to the same privilege ring).
775 * Therefore beware: accessing the ss/esp fields of the
776 * "struct pt_regs" is possible, but they may contain
777 * completely wrong values.
778 */
779#define task_pt_regs(task) \
780({ \
781 struct pt_regs *__regs__; \
782 __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
783 __regs__ - 1; \
784})
785
786#define KSTK_ESP(task) (task_pt_regs(task)->sp)
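task_pt_regs() returns the user-mode register frame saved at the top of a task's kernel stack, which is what KSTK_EIP()/KSTK_ESP() build on. A hypothetical debugging helper (not part of this patch) might use it like this, keeping in mind the caveat above about ss/sp for same-ring traps:

	/* Hypothetical helper: dump a task's user-mode ip/sp. */
	static void show_user_context(struct task_struct *tsk)
	{
		struct pt_regs *regs = task_pt_regs(tsk);

		printk(KERN_DEBUG "%s[%d]: ip=%08lx sp=%08lx\n",
		       tsk->comm, tsk->pid, (unsigned long)regs->ip,
		       (unsigned long)regs->sp);
	}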
787
3#else 788#else
4# include "processor_64.h" 789/*
790 * User space process size: 47 bits minus one guard page.
791 */
792#define TASK_SIZE64 (0x800000000000UL - 4096)
793
794/* This decides where the kernel will search for a free chunk of vm
795 * space during mmap's.
796 */
797#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
798 0xc0000000 : 0xFFFFe000)
799
800#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \
801 IA32_PAGE_OFFSET : TASK_SIZE64)
802#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \
803 IA32_PAGE_OFFSET : TASK_SIZE64)
804
805#define INIT_THREAD { \
806 .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
807}
808
809#define INIT_TSS { \
810 .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
811}
812
813#define start_thread(regs, new_rip, new_rsp) do { \
814 asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \
815 load_gs_index(0); \
816 (regs)->ip = (new_rip); \
817 (regs)->sp = (new_rsp); \
818 write_pda(oldrsp, (new_rsp)); \
819 (regs)->cs = __USER_CS; \
820 (regs)->ss = __USER_DS; \
821 (regs)->flags = 0x200; \
822 set_fs(USER_DS); \
823} while (0)
824
825/*
826 * Return saved PC of a blocked thread.
827 * What is this good for? It will always be the scheduler or ret_from_fork.
828 */
829#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
830
831#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
832#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
833#endif /* CONFIG_X86_64 */
834
835/* This decides where the kernel will search for a free chunk of vm
836 * space during mmap's.
837 */
838#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
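For concreteness, a worked example of what this evaluates to under common defaults (illustrative values, not added by this patch):

	/*
	 * 32-bit, PAGE_OFFSET = 0xC0000000:  TASK_SIZE = 3 GB, so
	 *     TASK_UNMAPPED_BASE = PAGE_ALIGN(3 GB / 3) = 0x40000000 (1 GB).
	 * 64-bit native task, TASK_SIZE64 = 0x7FFFFFFFF000:
	 *     TASK_UNMAPPED_BASE = PAGE_ALIGN(TASK_SIZE64 / 3) = 0x2AAAAAAAB000.
	 */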
839
840#define KSTK_EIP(task) (task_pt_regs(task)->ip)
841
5#endif 842#endif
diff --git a/include/asm-x86/processor_32.h b/include/asm-x86/processor_32.h
deleted file mode 100644
index 13976b086837..000000000000
--- a/include/asm-x86/processor_32.h
+++ /dev/null
@@ -1,786 +0,0 @@
1/*
2 * include/asm-i386/processor.h
3 *
4 * Copyright (C) 1994 Linus Torvalds
5 */
6
7#ifndef __ASM_I386_PROCESSOR_H
8#define __ASM_I386_PROCESSOR_H
9
10#include <asm/vm86.h>
11#include <asm/math_emu.h>
12#include <asm/segment.h>
13#include <asm/page.h>
14#include <asm/types.h>
15#include <asm/sigcontext.h>
16#include <asm/cpufeature.h>
17#include <asm/msr.h>
18#include <asm/system.h>
19#include <linux/cache.h>
20#include <linux/threads.h>
21#include <asm/percpu.h>
22#include <linux/cpumask.h>
23#include <linux/init.h>
24#include <asm/processor-flags.h>
25
26/* flag for disabling the tsc */
27extern int tsc_disable;
28
29struct desc_struct {
30 unsigned long a,b;
31};
32
33#define desc_empty(desc) \
34 (!((desc)->a | (desc)->b))
35
36#define desc_equal(desc1, desc2) \
37 (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
38/*
39 * Default implementation of macro that returns current
40 * instruction pointer ("program counter").
41 */
42#define current_text_addr() ({ void *pc; __asm__("movl $1f,%0\n1:":"=g" (pc)); pc; })
43
44/*
45 * CPU type and hardware bug flags. Kept separately for each CPU.
46 * Members of this structure are referenced in head.S, so think twice
47 * before touching them. [mj]
48 */
49
50struct cpuinfo_x86 {
51 __u8 x86; /* CPU family */
52 __u8 x86_vendor; /* CPU vendor */
53 __u8 x86_model;
54 __u8 x86_mask;
55 char wp_works_ok; /* It doesn't on 386's */
56 char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */
57 char hard_math;
58 char rfu;
59 int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
60 unsigned long x86_capability[NCAPINTS];
61 char x86_vendor_id[16];
62 char x86_model_id[64];
63 int x86_cache_size; /* in KB - valid for CPUS which support this
64 call */
65 int x86_cache_alignment; /* In bytes */
66 char fdiv_bug;
67 char f00f_bug;
68 char coma_bug;
69 char pad0;
70 int x86_power;
71 unsigned long loops_per_jiffy;
72#ifdef CONFIG_SMP
73 cpumask_t llc_shared_map; /* cpus sharing the last level cache */
74#endif
75 unsigned char x86_max_cores; /* cpuid returned max cores value */
76 unsigned char apicid;
77 unsigned short x86_clflush_size;
78#ifdef CONFIG_SMP
79 unsigned char booted_cores; /* number of cores as seen by OS */
80 __u8 phys_proc_id; /* Physical processor id. */
81 __u8 cpu_core_id; /* Core id */
82 __u8 cpu_index; /* index into per_cpu list */
83#endif
84} __attribute__((__aligned__(SMP_CACHE_BYTES)));
85
86#define X86_VENDOR_INTEL 0
87#define X86_VENDOR_CYRIX 1
88#define X86_VENDOR_AMD 2
89#define X86_VENDOR_UMC 3
90#define X86_VENDOR_NEXGEN 4
91#define X86_VENDOR_CENTAUR 5
92#define X86_VENDOR_TRANSMETA 7
93#define X86_VENDOR_NSC 8
94#define X86_VENDOR_NUM 9
95#define X86_VENDOR_UNKNOWN 0xff
96
97/*
98 * capabilities of CPUs
99 */
100
101extern struct cpuinfo_x86 boot_cpu_data;
102extern struct cpuinfo_x86 new_cpu_data;
103extern struct tss_struct doublefault_tss;
104DECLARE_PER_CPU(struct tss_struct, init_tss);
105
106#ifdef CONFIG_SMP
107DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
108#define cpu_data(cpu) per_cpu(cpu_info, cpu)
109#define current_cpu_data cpu_data(smp_processor_id())
110#else
111#define cpu_data(cpu) boot_cpu_data
112#define current_cpu_data boot_cpu_data
113#endif
114
115/*
116 * the following now lives in the per cpu area:
117 * extern int cpu_llc_id[NR_CPUS];
118 */
119DECLARE_PER_CPU(u8, cpu_llc_id);
120extern char ignore_fpu_irq;
121
122void __init cpu_detect(struct cpuinfo_x86 *c);
123
124extern void identify_boot_cpu(void);
125extern void identify_secondary_cpu(struct cpuinfo_x86 *);
126extern void print_cpu_info(struct cpuinfo_x86 *);
127extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
128extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
129extern unsigned short num_cache_leaves;
130
131#ifdef CONFIG_X86_HT
132extern void detect_ht(struct cpuinfo_x86 *c);
133#else
134static inline void detect_ht(struct cpuinfo_x86 *c) {}
135#endif
136
137static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
138 unsigned int *ecx, unsigned int *edx)
139{
140 /* ecx is often an input as well as an output. */
141 __asm__("cpuid"
142 : "=a" (*eax),
143 "=b" (*ebx),
144 "=c" (*ecx),
145 "=d" (*edx)
146 : "0" (*eax), "2" (*ecx));
147}
148
149#define load_cr3(pgdir) write_cr3(__pa(pgdir))
150
151/*
152 * Save the cr4 feature set we're using (ie
153 * Pentium 4MB enable and PPro Global page
154 * enable), so that any CPU's that boot up
155 * after us can get the correct flags.
156 */
157extern unsigned long mmu_cr4_features;
158
159static inline void set_in_cr4 (unsigned long mask)
160{
161 unsigned cr4;
162 mmu_cr4_features |= mask;
163 cr4 = read_cr4();
164 cr4 |= mask;
165 write_cr4(cr4);
166}
167
168static inline void clear_in_cr4 (unsigned long mask)
169{
170 unsigned cr4;
171 mmu_cr4_features &= ~mask;
172 cr4 = read_cr4();
173 cr4 &= ~mask;
174 write_cr4(cr4);
175}
176
177/* Stop speculative execution */
178static inline void sync_core(void)
179{
180 int tmp;
181 asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
182}
183
184static inline void __monitor(const void *eax, unsigned long ecx,
185 unsigned long edx)
186{
187 /* "monitor %eax,%ecx,%edx;" */
188 asm volatile(
189 ".byte 0x0f,0x01,0xc8;"
190 : :"a" (eax), "c" (ecx), "d"(edx));
191}
192
193static inline void __mwait(unsigned long eax, unsigned long ecx)
194{
195 /* "mwait %eax,%ecx;" */
196 asm volatile(
197 ".byte 0x0f,0x01,0xc9;"
198 : :"a" (eax), "c" (ecx));
199}
200
201extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
202
203/* from system description table in BIOS. Mostly for MCA use, but
204others may find it useful. */
205extern unsigned int machine_id;
206extern unsigned int machine_submodel_id;
207extern unsigned int BIOS_revision;
208extern unsigned int mca_pentium_flag;
209
210/* Boot loader type from the setup header */
211extern int bootloader_type;
212
213/*
214 * User space process size: 3GB (default).
215 */
216#define TASK_SIZE (PAGE_OFFSET)
217
218/* This decides where the kernel will search for a free chunk of vm
219 * space during mmap's.
220 */
221#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
222
223#define HAVE_ARCH_PICK_MMAP_LAYOUT
224
225extern void hard_disable_TSC(void);
226extern void disable_TSC(void);
227extern void hard_enable_TSC(void);
228
229/*
230 * Size of io_bitmap.
231 */
232#define IO_BITMAP_BITS 65536
233#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
234#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
235#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
236#define INVALID_IO_BITMAP_OFFSET 0x8000
237#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
238
239struct i387_fsave_struct {
240 long cwd;
241 long swd;
242 long twd;
243 long fip;
244 long fcs;
245 long foo;
246 long fos;
247 long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
248 long status; /* software status information */
249};
250
251struct i387_fxsave_struct {
252 unsigned short cwd;
253 unsigned short swd;
254 unsigned short twd;
255 unsigned short fop;
256 long fip;
257 long fcs;
258 long foo;
259 long fos;
260 long mxcsr;
261 long mxcsr_mask;
262 long st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
263 long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
264 long padding[56];
265} __attribute__ ((aligned (16)));
266
267struct i387_soft_struct {
268 long cwd;
269 long swd;
270 long twd;
271 long fip;
272 long fcs;
273 long foo;
274 long fos;
275 long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
276 unsigned char ftop, changed, lookahead, no_update, rm, alimit;
277 struct info *info;
278 unsigned long entry_eip;
279};
280
281union i387_union {
282 struct i387_fsave_struct fsave;
283 struct i387_fxsave_struct fxsave;
284 struct i387_soft_struct soft;
285};
286
287typedef struct {
288 unsigned long seg;
289} mm_segment_t;
290
291struct thread_struct;
292
293/* This is the TSS defined by the hardware. */
294struct i386_hw_tss {
295 unsigned short back_link,__blh;
296 unsigned long esp0;
297 unsigned short ss0,__ss0h;
298 unsigned long esp1;
299 unsigned short ss1,__ss1h; /* ss1 is used to cache MSR_IA32_SYSENTER_CS */
300 unsigned long esp2;
301 unsigned short ss2,__ss2h;
302 unsigned long __cr3;
303 unsigned long eip;
304 unsigned long eflags;
305 unsigned long eax,ecx,edx,ebx;
306 unsigned long esp;
307 unsigned long ebp;
308 unsigned long esi;
309 unsigned long edi;
310 unsigned short es, __esh;
311 unsigned short cs, __csh;
312 unsigned short ss, __ssh;
313 unsigned short ds, __dsh;
314 unsigned short fs, __fsh;
315 unsigned short gs, __gsh;
316 unsigned short ldt, __ldth;
317 unsigned short trace, io_bitmap_base;
318} __attribute__((packed));
319
320struct tss_struct {
321 struct i386_hw_tss x86_tss;
322
323 /*
324 * The extra 1 is there because the CPU will access an
325 * additional byte beyond the end of the IO permission
326 * bitmap. The extra byte must be all 1 bits, and must
327 * be within the limit.
328 */
329 unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
330 /*
331 * Cache the current maximum and the last task that used the bitmap:
332 */
333 unsigned long io_bitmap_max;
334 struct thread_struct *io_bitmap_owner;
335 /*
336 * pads the TSS to be cacheline-aligned (size is 0x100)
337 */
338 unsigned long __cacheline_filler[35];
339 /*
340 * .. and then another 0x100 bytes for emergency kernel stack
341 */
342 unsigned long stack[64];
343} __attribute__((packed));
344
345#define ARCH_MIN_TASKALIGN 16
346
347struct thread_struct {
348/* cached TLS descriptors. */
349 struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
350 unsigned long esp0;
351 unsigned long sysenter_cs;
352 unsigned long eip;
353 unsigned long esp;
354 unsigned long fs;
355 unsigned long gs;
356/* Hardware debugging registers */
357 unsigned long debugreg[8]; /* %%db0-7 debug registers */
358/* fault info */
359 unsigned long cr2, trap_no, error_code;
360/* floating point info */
361 union i387_union i387;
362/* virtual 86 mode info */
363 struct vm86_struct __user * vm86_info;
364 unsigned long screen_bitmap;
365 unsigned long v86flags, v86mask, saved_esp0;
366 unsigned int saved_fs, saved_gs;
367/* IO permissions */
368 unsigned long *io_bitmap_ptr;
369 unsigned long iopl;
370/* max allowed port in the bitmap, in bytes: */
371 unsigned long io_bitmap_max;
372};
373
374#define INIT_THREAD { \
375 .esp0 = sizeof(init_stack) + (long)&init_stack, \
376 .vm86_info = NULL, \
377 .sysenter_cs = __KERNEL_CS, \
378 .io_bitmap_ptr = NULL, \
379 .fs = __KERNEL_PERCPU, \
380}
381
382/*
383 * Note that the .io_bitmap member must be extra-big. This is because
384 * the CPU will access an additional byte beyond the end of the IO
385 * permission bitmap. The extra byte must be all 1 bits, and must
386 * be within the limit.
387 */
388#define INIT_TSS { \
389 .x86_tss = { \
390 .esp0 = sizeof(init_stack) + (long)&init_stack, \
391 .ss0 = __KERNEL_DS, \
392 .ss1 = __KERNEL_CS, \
393 .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
394 }, \
395 .io_bitmap = { [ 0 ... IO_BITMAP_LONGS] = ~0 }, \
396}
397
398#define start_thread(regs, new_eip, new_esp) do { \
399 __asm__("movl %0,%%gs": :"r" (0)); \
400 regs->xfs = 0; \
401 set_fs(USER_DS); \
402 regs->xds = __USER_DS; \
403 regs->xes = __USER_DS; \
404 regs->xss = __USER_DS; \
405 regs->xcs = __USER_CS; \
406 regs->eip = new_eip; \
407 regs->esp = new_esp; \
408} while (0)
409
410/* Forward declaration, a strange C thing */
411struct task_struct;
412struct mm_struct;
413
414/* Free all resources held by a thread. */
415extern void release_thread(struct task_struct *);
416
417/* Prepare to copy thread state - unlazy all lazy status */
418extern void prepare_to_copy(struct task_struct *tsk);
419
420/*
421 * create a kernel thread without removing it from tasklists
422 */
423extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
424
425extern unsigned long thread_saved_pc(struct task_struct *tsk);
426void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack);
427
428unsigned long get_wchan(struct task_struct *p);
429
430#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
431#define KSTK_TOP(info) \
432({ \
433 unsigned long *__ptr = (unsigned long *)(info); \
434 (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
435})
436
437/*
438 * The below -8 is to reserve 8 bytes on top of the ring0 stack.
439 * This is necessary to guarantee that the entire "struct pt_regs"
440 * is accessible even if the CPU hasn't stored the SS/ESP registers
441 * on the stack (interrupt gate does not save these registers
442 * when switching to the same priv ring).
443 * Therefore beware: accessing the xss/esp fields of the
444 * "struct pt_regs" is possible, but they may contain the
445 * completely wrong values.
446 */
447#define task_pt_regs(task) \
448({ \
449 struct pt_regs *__regs__; \
450 __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
451 __regs__ - 1; \
452})
453
454#define KSTK_EIP(task) (task_pt_regs(task)->eip)
455#define KSTK_ESP(task) (task_pt_regs(task)->esp)
456
457
458struct microcode_header {
459 unsigned int hdrver;
460 unsigned int rev;
461 unsigned int date;
462 unsigned int sig;
463 unsigned int cksum;
464 unsigned int ldrver;
465 unsigned int pf;
466 unsigned int datasize;
467 unsigned int totalsize;
468 unsigned int reserved[3];
469};
470
471struct microcode {
472 struct microcode_header hdr;
473 unsigned int bits[0];
474};
475
476typedef struct microcode microcode_t;
477typedef struct microcode_header microcode_header_t;
478
479/* microcode format is extended from prescott processors */
480struct extended_signature {
481 unsigned int sig;
482 unsigned int pf;
483 unsigned int cksum;
484};
485
486struct extended_sigtable {
487 unsigned int count;
488 unsigned int cksum;
489 unsigned int reserved[3];
490 struct extended_signature sigs[0];
491};
492
493/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
494static inline void rep_nop(void)
495{
496 __asm__ __volatile__("rep;nop": : :"memory");
497}
498
499#define cpu_relax() rep_nop()
500
501static inline void native_load_esp0(struct tss_struct *tss, struct thread_struct *thread)
502{
503 tss->x86_tss.esp0 = thread->esp0;
504 /* This can only happen when SEP is enabled, no need to test "SEP"arately */
505 if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
506 tss->x86_tss.ss1 = thread->sysenter_cs;
507 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
508 }
509}
510
511
512static inline unsigned long native_get_debugreg(int regno)
513{
514 unsigned long val = 0; /* Damn you, gcc! */
515
516 switch (regno) {
517 case 0:
518 asm("movl %%db0, %0" :"=r" (val)); break;
519 case 1:
520 asm("movl %%db1, %0" :"=r" (val)); break;
521 case 2:
522 asm("movl %%db2, %0" :"=r" (val)); break;
523 case 3:
524 asm("movl %%db3, %0" :"=r" (val)); break;
525 case 6:
526 asm("movl %%db6, %0" :"=r" (val)); break;
527 case 7:
528 asm("movl %%db7, %0" :"=r" (val)); break;
529 default:
530 BUG();
531 }
532 return val;
533}
534
535static inline void native_set_debugreg(int regno, unsigned long value)
536{
537 switch (regno) {
538 case 0:
539 asm("movl %0,%%db0" : /* no output */ :"r" (value));
540 break;
541 case 1:
542 asm("movl %0,%%db1" : /* no output */ :"r" (value));
543 break;
544 case 2:
545 asm("movl %0,%%db2" : /* no output */ :"r" (value));
546 break;
547 case 3:
548 asm("movl %0,%%db3" : /* no output */ :"r" (value));
549 break;
550 case 6:
551 asm("movl %0,%%db6" : /* no output */ :"r" (value));
552 break;
553 case 7:
554 asm("movl %0,%%db7" : /* no output */ :"r" (value));
555 break;
556 default:
557 BUG();
558 }
559}
560
561/*
562 * Set IOPL bits in EFLAGS from given mask
563 */
564static inline void native_set_iopl_mask(unsigned mask)
565{
566 unsigned int reg;
567 __asm__ __volatile__ ("pushfl;"
568 "popl %0;"
569 "andl %1, %0;"
570 "orl %2, %0;"
571 "pushl %0;"
572 "popfl"
573 : "=&r" (reg)
574 : "i" (~X86_EFLAGS_IOPL), "r" (mask));
575}
576
577#ifdef CONFIG_PARAVIRT
578#include <asm/paravirt.h>
579#else
580#define paravirt_enabled() 0
581#define __cpuid native_cpuid
582
583static inline void load_esp0(struct tss_struct *tss, struct thread_struct *thread)
584{
585 native_load_esp0(tss, thread);
586}
587
588/*
589 * These special macros can be used to get or set a debugging register
590 */
591#define get_debugreg(var, register) \
592 (var) = native_get_debugreg(register)
593#define set_debugreg(value, register) \
594 native_set_debugreg(register, value)
595
596#define set_iopl_mask native_set_iopl_mask
597#endif /* CONFIG_PARAVIRT */
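The get_debugreg()/set_debugreg() macros (native or paravirt) are the interface the rest of the kernel uses to touch DR0-DR7. A hedged sketch of programming a hardware breakpoint with them (illustrative only; bit 0 of DR7 is the local-enable bit for breakpoint 0):

	/* Illustrative: arm hardware breakpoint 0 at a linear address. */
	static void set_hw_breakpoint(unsigned long addr)
	{
		unsigned long dr7;

		set_debugreg(addr, 0);		/* DR0 holds the address */
		get_debugreg(dr7, 7);
		dr7 |= 0x1;			/* L0: locally enable breakpoint 0 */
		set_debugreg(dr7, 7);
	}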
598
599/*
600 * Generic CPUID function
601 * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
602 * resulting in stale register contents being returned.
603 */
604static inline void cpuid(unsigned int op,
605 unsigned int *eax, unsigned int *ebx,
606 unsigned int *ecx, unsigned int *edx)
607{
608 *eax = op;
609 *ecx = 0;
610 __cpuid(eax, ebx, ecx, edx);
611}
612
613/* Some CPUID calls want 'count' to be placed in ecx */
614static inline void cpuid_count(unsigned int op, int count,
615 unsigned int *eax, unsigned int *ebx,
616 unsigned int *ecx, unsigned int *edx)
617{
618 *eax = op;
619 *ecx = count;
620 __cpuid(eax, ebx, ecx, edx);
621}
622
623/*
624 * CPUID functions returning a single datum
625 */
626static inline unsigned int cpuid_eax(unsigned int op)
627{
628 unsigned int eax, ebx, ecx, edx;
629
630 cpuid(op, &eax, &ebx, &ecx, &edx);
631 return eax;
632}
633static inline unsigned int cpuid_ebx(unsigned int op)
634{
635 unsigned int eax, ebx, ecx, edx;
636
637 cpuid(op, &eax, &ebx, &ecx, &edx);
638 return ebx;
639}
640static inline unsigned int cpuid_ecx(unsigned int op)
641{
642 unsigned int eax, ebx, ecx, edx;
643
644 cpuid(op, &eax, &ebx, &ecx, &edx);
645 return ecx;
646}
647static inline unsigned int cpuid_edx(unsigned int op)
648{
649 unsigned int eax, ebx, ecx, edx;
650
651 cpuid(op, &eax, &ebx, &ecx, &edx);
652 return edx;
653}
654
655/* generic versions from gas */
656#define GENERIC_NOP1 ".byte 0x90\n"
657#define GENERIC_NOP2 ".byte 0x89,0xf6\n"
658#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n"
659#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n"
660#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4
661#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n"
662#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n"
663#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7
664
665/* Opteron nops */
666#define K8_NOP1 GENERIC_NOP1
667#define K8_NOP2 ".byte 0x66,0x90\n"
668#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
669#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
670#define K8_NOP5 K8_NOP3 K8_NOP2
671#define K8_NOP6 K8_NOP3 K8_NOP3
672#define K8_NOP7 K8_NOP4 K8_NOP3
673#define K8_NOP8 K8_NOP4 K8_NOP4
674
675/* K7 nops */
676/* uses eax dependencies (arbitrary choice) */
677#define K7_NOP1 GENERIC_NOP1
678#define K7_NOP2 ".byte 0x8b,0xc0\n"
679#define K7_NOP3 ".byte 0x8d,0x04,0x20\n"
680#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n"
681#define K7_NOP5 K7_NOP4 ASM_NOP1
682#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n"
683#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n"
684#define K7_NOP8 K7_NOP7 ASM_NOP1
685
686/* P6 nops */
687/* uses eax dependencies (Intel-recommended choice) */
688#define P6_NOP1 GENERIC_NOP1
689#define P6_NOP2 ".byte 0x66,0x90\n"
690#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n"
691#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n"
692#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n"
693#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n"
694#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n"
695#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n"
696
697#ifdef CONFIG_MK8
698#define ASM_NOP1 K8_NOP1
699#define ASM_NOP2 K8_NOP2
700#define ASM_NOP3 K8_NOP3
701#define ASM_NOP4 K8_NOP4
702#define ASM_NOP5 K8_NOP5
703#define ASM_NOP6 K8_NOP6
704#define ASM_NOP7 K8_NOP7
705#define ASM_NOP8 K8_NOP8
706#elif defined(CONFIG_MK7)
707#define ASM_NOP1 K7_NOP1
708#define ASM_NOP2 K7_NOP2
709#define ASM_NOP3 K7_NOP3
710#define ASM_NOP4 K7_NOP4
711#define ASM_NOP5 K7_NOP5
712#define ASM_NOP6 K7_NOP6
713#define ASM_NOP7 K7_NOP7
714#define ASM_NOP8 K7_NOP8
715#elif defined(CONFIG_M686) || defined(CONFIG_MPENTIUMII) || \
716 defined(CONFIG_MPENTIUMIII) || defined(CONFIG_MPENTIUMM) || \
717 defined(CONFIG_MCORE2) || defined(CONFIG_PENTIUM4)
718#define ASM_NOP1 P6_NOP1
719#define ASM_NOP2 P6_NOP2
720#define ASM_NOP3 P6_NOP3
721#define ASM_NOP4 P6_NOP4
722#define ASM_NOP5 P6_NOP5
723#define ASM_NOP6 P6_NOP6
724#define ASM_NOP7 P6_NOP7
725#define ASM_NOP8 P6_NOP8
726#else
727#define ASM_NOP1 GENERIC_NOP1
728#define ASM_NOP2 GENERIC_NOP2
729#define ASM_NOP3 GENERIC_NOP3
730#define ASM_NOP4 GENERIC_NOP4
731#define ASM_NOP5 GENERIC_NOP5
732#define ASM_NOP6 GENERIC_NOP6
733#define ASM_NOP7 GENERIC_NOP7
734#define ASM_NOP8 GENERIC_NOP8
735#endif
736
737#define ASM_NOP_MAX 8
738
739/* Prefetch instructions for Pentium III and AMD Athlon */
740/* It's not worth caring about 3dnow! prefetches for the K6
741 because they are microcoded there and very slow.
742 However, we currently don't do prefetches for pre-XP Athlons;
743 that should be fixed. */
744#define ARCH_HAS_PREFETCH
745static inline void prefetch(const void *x)
746{
747 alternative_input(ASM_NOP4,
748 "prefetchnta (%1)",
749 X86_FEATURE_XMM,
750 "r" (x));
751}
752
753#define ARCH_HAS_PREFETCH
754#define ARCH_HAS_PREFETCHW
755#define ARCH_HAS_SPINLOCK_PREFETCH
756
757/* 3dnow! prefetch to get an exclusive cache line. Useful for
758 spinlocks to avoid one state transition in the cache coherency protocol. */
759static inline void prefetchw(const void *x)
760{
761 alternative_input(ASM_NOP4,
762 "prefetchw (%1)",
763 X86_FEATURE_3DNOW,
764 "r" (x));
765}
766#define spin_lock_prefetch(x) prefetchw(x)
767
768extern void select_idle_routine(const struct cpuinfo_x86 *c);
769
770#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
771
772extern unsigned long boot_option_idle_override;
773extern void enable_sep_cpu(void);
774extern int sysenter_setup(void);
775
776/* Defined in head.S */
777extern struct Xgt_desc_struct early_gdt_descr;
778
779extern void cpu_set_gdt(int);
780extern void switch_to_new_gdt(void);
781extern void cpu_init(void);
782extern void init_gdt(int cpu);
783
784extern int force_mwait;
785
786#endif /* __ASM_I386_PROCESSOR_H */
diff --git a/include/asm-x86/processor_64.h b/include/asm-x86/processor_64.h
deleted file mode 100644
index e4f19970a82b..000000000000
--- a/include/asm-x86/processor_64.h
+++ /dev/null
@@ -1,452 +0,0 @@
1/*
2 * include/asm-x86_64/processor.h
3 *
4 * Copyright (C) 1994 Linus Torvalds
5 */
6
7#ifndef __ASM_X86_64_PROCESSOR_H
8#define __ASM_X86_64_PROCESSOR_H
9
10#include <asm/segment.h>
11#include <asm/page.h>
12#include <asm/types.h>
13#include <asm/sigcontext.h>
14#include <asm/cpufeature.h>
15#include <linux/threads.h>
16#include <asm/msr.h>
17#include <asm/current.h>
18#include <asm/system.h>
19#include <asm/mmsegment.h>
20#include <asm/percpu.h>
21#include <linux/personality.h>
22#include <linux/cpumask.h>
23#include <asm/processor-flags.h>
24
25#define TF_MASK 0x00000100
26#define IF_MASK 0x00000200
27#define IOPL_MASK 0x00003000
28#define NT_MASK 0x00004000
29#define VM_MASK 0x00020000
30#define AC_MASK 0x00040000
31#define VIF_MASK 0x00080000 /* virtual interrupt flag */
32#define VIP_MASK 0x00100000 /* virtual interrupt pending */
33#define ID_MASK 0x00200000
34
35#define desc_empty(desc) \
36 (!((desc)->a | (desc)->b))
37
38#define desc_equal(desc1, desc2) \
39 (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
40
41/*
42 * Default implementation of macro that returns current
43 * instruction pointer ("program counter").
44 */
45#define current_text_addr() ({ void *pc; asm volatile("leaq 1f(%%rip),%0\n1:":"=r"(pc)); pc; })
46
47/*
48 * CPU type and hardware bug flags. Kept separately for each CPU.
49 */
50
51struct cpuinfo_x86 {
52 __u8 x86; /* CPU family */
53 __u8 x86_vendor; /* CPU vendor */
54 __u8 x86_model;
55 __u8 x86_mask;
56 int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
57 __u32 x86_capability[NCAPINTS];
58 char x86_vendor_id[16];
59 char x86_model_id[64];
60 int x86_cache_size; /* in KB */
61 int x86_clflush_size;
62 int x86_cache_alignment;
63 int x86_tlbsize; /* number of 4K pages in DTLB/ITLB combined(in pages)*/
64 __u8 x86_virt_bits, x86_phys_bits;
65 __u8 x86_max_cores; /* cpuid returned max cores value */
66 __u32 x86_power;
67 __u32 extended_cpuid_level; /* Max extended CPUID function supported */
68 unsigned long loops_per_jiffy;
69#ifdef CONFIG_SMP
70 cpumask_t llc_shared_map; /* cpus sharing the last level cache */
71#endif
72 __u8 apicid;
73#ifdef CONFIG_SMP
74 __u8 booted_cores; /* number of cores as seen by OS */
75 __u8 phys_proc_id; /* Physical Processor id. */
76 __u8 cpu_core_id; /* Core id. */
77 __u8 cpu_index; /* index into per_cpu list */
78#endif
79} ____cacheline_aligned;
80
81#define X86_VENDOR_INTEL 0
82#define X86_VENDOR_CYRIX 1
83#define X86_VENDOR_AMD 2
84#define X86_VENDOR_UMC 3
85#define X86_VENDOR_NEXGEN 4
86#define X86_VENDOR_CENTAUR 5
87#define X86_VENDOR_TRANSMETA 7
88#define X86_VENDOR_NUM 8
89#define X86_VENDOR_UNKNOWN 0xff
90
91#ifdef CONFIG_SMP
92DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
93#define cpu_data(cpu) per_cpu(cpu_info, cpu)
94#define current_cpu_data cpu_data(smp_processor_id())
95#else
96#define cpu_data(cpu) boot_cpu_data
97#define current_cpu_data boot_cpu_data
98#endif
99
100extern char ignore_irq13;
101
102extern void identify_cpu(struct cpuinfo_x86 *);
103extern void print_cpu_info(struct cpuinfo_x86 *);
104extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
105extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
106extern unsigned short num_cache_leaves;
107
108/*
109 * Save the cr4 feature set we're using (ie
110 * Pentium 4MB enable and PPro Global page
111 * enable), so that any CPU's that boot up
112 * after us can get the correct flags.
113 */
114extern unsigned long mmu_cr4_features;
115
116static inline void set_in_cr4 (unsigned long mask)
117{
118 mmu_cr4_features |= mask;
119 __asm__("movq %%cr4,%%rax\n\t"
120 "orq %0,%%rax\n\t"
121 "movq %%rax,%%cr4\n"
122 : : "irg" (mask)
123 :"ax");
124}
125
126static inline void clear_in_cr4 (unsigned long mask)
127{
128 mmu_cr4_features &= ~mask;
129 __asm__("movq %%cr4,%%rax\n\t"
130 "andq %0,%%rax\n\t"
131 "movq %%rax,%%cr4\n"
132 : : "irg" (~mask)
133 :"ax");
134}
135
136
137/*
138 * User space process size: 47 bits minus one guard page.
139 */
140#define TASK_SIZE64 (0x800000000000UL - 4096)
141
142/* This decides where the kernel will search for a free chunk of vm
143 * space during mmap's.
144 */
145#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? 0xc0000000 : 0xFFFFe000)
146
147#define TASK_SIZE (test_thread_flag(TIF_IA32) ? IA32_PAGE_OFFSET : TASK_SIZE64)
148#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? IA32_PAGE_OFFSET : TASK_SIZE64)
149
150#define TASK_UNMAPPED_BASE PAGE_ALIGN(TASK_SIZE/3)
151
152/*
153 * Size of io_bitmap.
154 */
155#define IO_BITMAP_BITS 65536
156#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
157#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
158#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
159#define INVALID_IO_BITMAP_OFFSET 0x8000
160
161struct i387_fxsave_struct {
162 u16 cwd;
163 u16 swd;
164 u16 twd;
165 u16 fop;
166 u64 rip;
167 u64 rdp;
168 u32 mxcsr;
169 u32 mxcsr_mask;
170 u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
171 u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
172 u32 padding[24];
173} __attribute__ ((aligned (16)));
174
175union i387_union {
176 struct i387_fxsave_struct fxsave;
177};
178
179struct tss_struct {
180 u32 reserved1;
181 u64 rsp0;
182 u64 rsp1;
183 u64 rsp2;
184 u64 reserved2;
185 u64 ist[7];
186 u32 reserved3;
187 u32 reserved4;
188 u16 reserved5;
189 u16 io_bitmap_base;
190 /*
191 * The extra 1 is there because the CPU will access an
192 * additional byte beyond the end of the IO permission
193 * bitmap. The extra byte must be all 1 bits, and must
194 * be within the limit. Thus we have:
195 *
196 * 128 bytes, the bitmap itself, for ports 0..0x3ff
197 * 8 bytes, for an extra "long" of ~0UL
198 */
199 unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
200} __attribute__((packed)) ____cacheline_aligned;
201
202
203extern struct cpuinfo_x86 boot_cpu_data;
204DECLARE_PER_CPU(struct tss_struct,init_tss);
205/* Save the original ist values for checking stack pointers during debugging */
206struct orig_ist {
207 unsigned long ist[7];
208};
209DECLARE_PER_CPU(struct orig_ist, orig_ist);
210
211#ifdef CONFIG_X86_VSMP
212#define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
213#define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
214#else
215#define ARCH_MIN_TASKALIGN 16
216#define ARCH_MIN_MMSTRUCT_ALIGN 0
217#endif
218
219struct thread_struct {
220 unsigned long rsp0;
221 unsigned long rsp;
222 unsigned long userrsp; /* Copy from PDA */
223 unsigned long fs;
224 unsigned long gs;
225 unsigned short es, ds, fsindex, gsindex;
226/* Hardware debugging registers */
227 unsigned long debugreg0;
228 unsigned long debugreg1;
229 unsigned long debugreg2;
230 unsigned long debugreg3;
231 unsigned long debugreg6;
232 unsigned long debugreg7;
233/* fault info */
234 unsigned long cr2, trap_no, error_code;
235/* floating point info */
236 union i387_union i387 __attribute__((aligned(16)));
237/* IO permissions. the bitmap could be moved into the GDT, that would make
238 switch faster for a limited number of ioperm using tasks. -AK */
239 int ioperm;
240 unsigned long *io_bitmap_ptr;
241 unsigned io_bitmap_max;
242/* cached TLS descriptors. */
243 u64 tls_array[GDT_ENTRY_TLS_ENTRIES];
244} __attribute__((aligned(16)));
245
246#define INIT_THREAD { \
247 .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
248}
249
250#define INIT_TSS { \
251 .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
252}
253
254#define INIT_MMAP \
255{ &init_mm, 0, 0, NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
256
257#define start_thread(regs,new_rip,new_rsp) do { \
258 asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \
259 load_gs_index(0); \
260 (regs)->rip = (new_rip); \
261 (regs)->rsp = (new_rsp); \
262 write_pda(oldrsp, (new_rsp)); \
263 (regs)->cs = __USER_CS; \
264 (regs)->ss = __USER_DS; \
265 (regs)->eflags = 0x200; \
266 set_fs(USER_DS); \
267} while(0)
268
269#define get_debugreg(var, register) \
270 __asm__("movq %%db" #register ", %0" \
271 :"=r" (var))
272#define set_debugreg(value, register) \
273 __asm__("movq %0,%%db" #register \
274 : /* no output */ \
275 :"r" (value))
276
277struct task_struct;
278struct mm_struct;
279
280/* Free all resources held by a thread. */
281extern void release_thread(struct task_struct *);
282
283/* Prepare to copy thread state - unlazy all lazy status */
284extern void prepare_to_copy(struct task_struct *tsk);
285
286/*
287 * create a kernel thread without removing it from tasklists
288 */
289extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
290
291/*
292 * Return saved PC of a blocked thread.
293 * What is this good for? It will always be the scheduler or ret_from_fork.
294 */
295#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.rsp - 8))
296
297extern unsigned long get_wchan(struct task_struct *p);
298#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.rsp0 - 1)
299#define KSTK_EIP(tsk) (task_pt_regs(tsk)->rip)
300#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
301
302
303struct microcode_header {
304 unsigned int hdrver;
305 unsigned int rev;
306 unsigned int date;
307 unsigned int sig;
308 unsigned int cksum;
309 unsigned int ldrver;
310 unsigned int pf;
311 unsigned int datasize;
312 unsigned int totalsize;
313 unsigned int reserved[3];
314};
315
316struct microcode {
317 struct microcode_header hdr;
318 unsigned int bits[0];
319};
320
321typedef struct microcode microcode_t;
322typedef struct microcode_header microcode_header_t;
323
324/* microcode format is extended from prescott processors */
325struct extended_signature {
326 unsigned int sig;
327 unsigned int pf;
328 unsigned int cksum;
329};
330
331struct extended_sigtable {
332 unsigned int count;
333 unsigned int cksum;
334 unsigned int reserved[3];
335 struct extended_signature sigs[0];
336};
337
338
339#if defined(CONFIG_MPSC) || defined(CONFIG_MCORE2)
340#define ASM_NOP1 P6_NOP1
341#define ASM_NOP2 P6_NOP2
342#define ASM_NOP3 P6_NOP3
343#define ASM_NOP4 P6_NOP4
344#define ASM_NOP5 P6_NOP5
345#define ASM_NOP6 P6_NOP6
346#define ASM_NOP7 P6_NOP7
347#define ASM_NOP8 P6_NOP8
348#else
349#define ASM_NOP1 K8_NOP1
350#define ASM_NOP2 K8_NOP2
351#define ASM_NOP3 K8_NOP3
352#define ASM_NOP4 K8_NOP4
353#define ASM_NOP5 K8_NOP5
354#define ASM_NOP6 K8_NOP6
355#define ASM_NOP7 K8_NOP7
356#define ASM_NOP8 K8_NOP8
357#endif
358
359/* Opteron nops */
360#define K8_NOP1 ".byte 0x90\n"
361#define K8_NOP2 ".byte 0x66,0x90\n"
362#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
363#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
364#define K8_NOP5 K8_NOP3 K8_NOP2
365#define K8_NOP6 K8_NOP3 K8_NOP3
366#define K8_NOP7 K8_NOP4 K8_NOP3
367#define K8_NOP8 K8_NOP4 K8_NOP4
368
369/* P6 nops */
370/* uses eax dependencies (Intel-recommended choice) */
371#define P6_NOP1 ".byte 0x90\n"
372#define P6_NOP2 ".byte 0x66,0x90\n"
373#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n"
374#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n"
375#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n"
376#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n"
377#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n"
378#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n"
379
380#define ASM_NOP_MAX 8
381
382/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
383static inline void rep_nop(void)
384{
385 __asm__ __volatile__("rep;nop": : :"memory");
386}
387
388/* Stop speculative execution */
389static inline void sync_core(void)
390{
391 int tmp;
392 asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
393}
394
395#define ARCH_HAS_PREFETCHW 1
396static inline void prefetchw(void *x)
397{
398 alternative_input("prefetcht0 (%1)",
399 "prefetchw (%1)",
400 X86_FEATURE_3DNOW,
401 "r" (x));
402}
403
404#define ARCH_HAS_SPINLOCK_PREFETCH 1
405
406#define spin_lock_prefetch(x) prefetchw(x)
407
408#define cpu_relax() rep_nop()
409
410static inline void __monitor(const void *eax, unsigned long ecx,
411 unsigned long edx)
412{
413 /* "monitor %eax,%ecx,%edx;" */
414 asm volatile(
415 ".byte 0x0f,0x01,0xc8;"
416 : :"a" (eax), "c" (ecx), "d"(edx));
417}
418
419static inline void __mwait(unsigned long eax, unsigned long ecx)
420{
421 /* "mwait %eax,%ecx;" */
422 asm volatile(
423 ".byte 0x0f,0x01,0xc9;"
424 : :"a" (eax), "c" (ecx));
425}
426
427static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
428{
429 /* "mwait %eax,%ecx;" */
430 asm volatile(
431 "sti; .byte 0x0f,0x01,0xc9;"
432 : :"a" (eax), "c" (ecx));
433}
434
435extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
436
437#define stack_current() \
438({ \
439 struct thread_info *ti; \
440 asm("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
441 ti->task; \
442})
443
444#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
445
446extern unsigned long boot_option_idle_override;
447/* Boot loader type from the setup header */
448extern int bootloader_type;
449
450#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
451
452#endif /* __ASM_X86_64_PROCESSOR_H */
diff --git a/include/asm-x86/proto.h b/include/asm-x86/proto.h
index dabba55f7ed8..68563c0709ac 100644
--- a/include/asm-x86/proto.h
+++ b/include/asm-x86/proto.h
@@ -5,87 +5,24 @@
5 5
6/* misc architecture specific prototypes */ 6/* misc architecture specific prototypes */
7 7
8struct cpuinfo_x86;
9struct pt_regs;
10
11extern void start_kernel(void);
12extern void pda_init(int);
13
14extern void early_idt_handler(void); 8extern void early_idt_handler(void);
15 9
16extern void mcheck_init(struct cpuinfo_x86 *c);
17extern void init_memory_mapping(unsigned long start, unsigned long end); 10extern void init_memory_mapping(unsigned long start, unsigned long end);
18 11
19extern void system_call(void); 12extern void system_call(void);
20extern int kernel_syscall(void);
21extern void syscall_init(void); 13extern void syscall_init(void);
22 14
23extern void ia32_syscall(void); 15extern void ia32_syscall(void);
24extern void ia32_cstar_target(void); 16extern void ia32_cstar_target(void);
25extern void ia32_sysenter_target(void); 17extern void ia32_sysenter_target(void);
26
27extern void config_acpi_tables(void);
28extern void ia32_syscall(void);
29
30extern int pmtimer_mark_offset(void);
31extern void pmtimer_resume(void);
32extern void pmtimer_wait(unsigned);
33extern unsigned int do_gettimeoffset_pm(void);
34#ifdef CONFIG_X86_PM_TIMER
35extern u32 pmtmr_ioport;
36#else
37#define pmtmr_ioport 0
38#endif
39extern int nohpet;
40
41extern void early_printk(const char *fmt, ...) __attribute__((format(printf,1,2)));
42
43extern void early_identify_cpu(struct cpuinfo_x86 *c);
44
45extern int k8_scan_nodes(unsigned long start, unsigned long end);
46
47extern void numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn);
48extern unsigned long numa_free_all_bootmem(void);
49 18
50extern void reserve_bootmem_generic(unsigned long phys, unsigned len); 19extern void reserve_bootmem_generic(unsigned long phys, unsigned len);
51 20
52extern void load_gs_index(unsigned gs);
53
54extern unsigned long end_pfn_map;
55
56extern void show_trace(struct task_struct *, struct pt_regs *, unsigned long * rsp);
57extern void show_registers(struct pt_regs *regs);
58
59extern void exception_table_check(void);
60
61extern void acpi_reserve_bootmem(void);
62
63extern void swap_low_mappings(void);
64
65extern void __show_regs(struct pt_regs * regs);
66extern void show_regs(struct pt_regs * regs);
67
68extern void syscall32_cpu_init(void); 21extern void syscall32_cpu_init(void);
69 22
70extern void setup_node_bootmem(int nodeid, unsigned long start, unsigned long end);
71
72extern void early_quirks(void);
73extern void check_efer(void); 23extern void check_efer(void);
74 24
75extern void select_idle_routine(const struct cpuinfo_x86 *c);
76
77extern unsigned long table_start, table_end;
78
79extern int exception_trace;
80extern unsigned cpu_khz;
81extern unsigned tsc_khz;
82
83extern int reboot_force; 25extern int reboot_force;
84extern int notsc_setup(char *);
85
86extern int gsi_irq_sharing(int gsi);
87
88extern int force_mwait;
89 26
90long do_arch_prctl(struct task_struct *task, int code, unsigned long addr); 27long do_arch_prctl(struct task_struct *task, int code, unsigned long addr);
91 28
diff --git a/include/asm-x86/ptrace-abi.h b/include/asm-x86/ptrace-abi.h
index 7524e1233833..81a8ee4c55fc 100644
--- a/include/asm-x86/ptrace-abi.h
+++ b/include/asm-x86/ptrace-abi.h
@@ -78,4 +78,66 @@
78# define PTRACE_SYSEMU_SINGLESTEP 32 78# define PTRACE_SYSEMU_SINGLESTEP 32
79#endif 79#endif
80 80
81#define PTRACE_SINGLEBLOCK 33 /* resume execution until next branch */
82
83#ifndef __ASSEMBLY__
84
85#include <asm/types.h>
86
87/* configuration/status structure used in PTRACE_BTS_CONFIG and
88 PTRACE_BTS_STATUS commands.
89*/
90struct ptrace_bts_config {
91 /* requested or actual size of BTS buffer in bytes */
92 u32 size;
93 /* bitmask of below flags */
94 u32 flags;
95 /* buffer overflow signal */
96 u32 signal;
97 /* actual size of bts_struct in bytes */
98 u32 bts_size;
99};
100#endif
101
102#define PTRACE_BTS_O_TRACE 0x1 /* branch trace */
103#define PTRACE_BTS_O_SCHED 0x2 /* scheduling events w/ jiffies */
104#define PTRACE_BTS_O_SIGNAL 0x4 /* send SIG<signal> on buffer overflow
105 instead of wrapping around */
106#define PTRACE_BTS_O_CUT_SIZE 0x8 /* cut requested size to max available
107 instead of failing */
108
109#define PTRACE_BTS_CONFIG 40
110/* Configure branch trace recording.
111 ADDR points to a struct ptrace_bts_config.
112 DATA gives the size of that buffer.
113 A new buffer is allocated, iff the size changes.
114 Returns the number of bytes read.
115*/
116#define PTRACE_BTS_STATUS 41
117/* Return the current configuration in a struct ptrace_bts_config
118 pointed to by ADDR; DATA gives the size of that buffer.
119 Returns the number of bytes written.
120*/
121#define PTRACE_BTS_SIZE 42
122/* Return the number of available BTS records.
123 DATA and ADDR are ignored.
124*/
125#define PTRACE_BTS_GET 43
126/* Get a single BTS record.
127 DATA defines the index into the BTS array, where 0 is the newest
128 entry, and higher indices refer to older entries.
129 ADDR points to a struct bts_struct (see asm/ds.h).
130*/
131#define PTRACE_BTS_CLEAR 44
132/* Clear the BTS buffer.
133 DATA and ADDR are ignored.
134*/
135#define PTRACE_BTS_DRAIN 45
136/* Read all available BTS records and clear the buffer.
137 ADDR points to an array of struct bts_struct.
138 DATA gives the size of that buffer.
139 BTS records are read from oldest to newest.
140 Returns number of BTS records drained.
141*/
142
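Taken together, the PTRACE_BTS_* requests above form a small user-space API for branch-trace recording. A hedged sketch of how a tracer might turn it on (hypothetical user-space code, not part of this patch; assumes the constants and struct ptrace_bts_config above are visible and the child is already attached and stopped):

	#include <sys/ptrace.h>
	#include <sys/types.h>

	static long bts_start(pid_t pid)
	{
		struct ptrace_bts_config cfg = {
			.size  = 64 * 1024,	/* ask for a 64 KiB branch-trace buffer */
			.flags = PTRACE_BTS_O_TRACE | PTRACE_BTS_O_CUT_SIZE,
		};

		/* Per the comment above, DATA carries the size of the structure. */
		return ptrace(PTRACE_BTS_CONFIG, pid, &cfg, (void *)sizeof(cfg));
	}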
81#endif 143#endif
diff --git a/include/asm-x86/ptrace.h b/include/asm-x86/ptrace.h
index 51ddb2590870..d9e04b46a440 100644
--- a/include/asm-x86/ptrace.h
+++ b/include/asm-x86/ptrace.h
@@ -4,12 +4,15 @@
4#include <linux/compiler.h> /* For __user */ 4#include <linux/compiler.h> /* For __user */
5#include <asm/ptrace-abi.h> 5#include <asm/ptrace-abi.h>
6 6
7
7#ifndef __ASSEMBLY__ 8#ifndef __ASSEMBLY__
8 9
9#ifdef __i386__ 10#ifdef __i386__
10/* this struct defines the way the registers are stored on the 11/* this struct defines the way the registers are stored on the
11 stack during a system call. */ 12 stack during a system call. */
12 13
14#ifndef __KERNEL__
15
13struct pt_regs { 16struct pt_regs {
14 long ebx; 17 long ebx;
15 long ecx; 18 long ecx;
@@ -21,7 +24,7 @@ struct pt_regs {
21 int xds; 24 int xds;
22 int xes; 25 int xes;
23 int xfs; 26 int xfs;
24 /* int xgs; */ 27 /* int gs; */
25 long orig_eax; 28 long orig_eax;
26 long eip; 29 long eip;
27 int xcs; 30 int xcs;
@@ -30,44 +33,37 @@ struct pt_regs {
30 int xss; 33 int xss;
31}; 34};
32 35
33#ifdef __KERNEL__ 36#else /* __KERNEL__ */
37
38struct pt_regs {
39 long bx;
40 long cx;
41 long dx;
42 long si;
43 long di;
44 long bp;
45 long ax;
46 int ds;
47 int es;
48 int fs;
49 /* int gs; */
50 long orig_ax;
51 long ip;
52 int cs;
53 long flags;
54 long sp;
55 int ss;
56};
34 57
35#include <asm/vm86.h> 58#include <asm/vm86.h>
36#include <asm/segment.h> 59#include <asm/segment.h>
37 60
38struct task_struct;
39extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code);
40
41/*
42 * user_mode_vm(regs) determines whether a register set came from user mode.
43 * This is true if V8086 mode was enabled OR if the register set was from
44 * protected mode with RPL-3 CS value. This tricky test checks that with
45 * one comparison. Many places in the kernel can bypass this full check
46 * if they have already ruled out V8086 mode, so user_mode(regs) can be used.
47 */
48static inline int user_mode(struct pt_regs *regs)
49{
50 return (regs->xcs & SEGMENT_RPL_MASK) == USER_RPL;
51}
52static inline int user_mode_vm(struct pt_regs *regs)
53{
54 return ((regs->xcs & SEGMENT_RPL_MASK) | (regs->eflags & VM_MASK)) >= USER_RPL;
55}
56static inline int v8086_mode(struct pt_regs *regs)
57{
58 return (regs->eflags & VM_MASK);
59}
60
61#define instruction_pointer(regs) ((regs)->eip)
62#define frame_pointer(regs) ((regs)->ebp)
63#define stack_pointer(regs) ((unsigned long)(regs))
64#define regs_return_value(regs) ((regs)->eax)
65
66extern unsigned long profile_pc(struct pt_regs *regs);
67#endif /* __KERNEL__ */ 61#endif /* __KERNEL__ */
68 62
69#else /* __i386__ */ 63#else /* __i386__ */
70 64
65#ifndef __KERNEL__
66
71struct pt_regs { 67struct pt_regs {
72 unsigned long r15; 68 unsigned long r15;
73 unsigned long r14; 69 unsigned long r14;
@@ -96,47 +92,143 @@ struct pt_regs {
96/* top of stack page */ 92/* top of stack page */
97}; 93};
98 94
95#else /* __KERNEL__ */
96
97struct pt_regs {
98 unsigned long r15;
99 unsigned long r14;
100 unsigned long r13;
101 unsigned long r12;
102 unsigned long bp;
103 unsigned long bx;
104/* arguments: non-interrupt/non-tracing syscalls only save up to here */
105 unsigned long r11;
106 unsigned long r10;
107 unsigned long r9;
108 unsigned long r8;
109 unsigned long ax;
110 unsigned long cx;
111 unsigned long dx;
112 unsigned long si;
113 unsigned long di;
114 unsigned long orig_ax;
115/* end of arguments */
116/* cpu exception frame or undefined */
117 unsigned long ip;
118 unsigned long cs;
119 unsigned long flags;
120 unsigned long sp;
121 unsigned long ss;
122/* top of stack page */
123};
124
125#endif /* __KERNEL__ */
126#endif /* !__i386__ */
127
99#ifdef __KERNEL__ 128#ifdef __KERNEL__
100 129
101#define user_mode(regs) (!!((regs)->cs & 3)) 130/* the DS BTS struct is used for ptrace as well */
102#define user_mode_vm(regs) user_mode(regs) 131#include <asm/ds.h>
103#define instruction_pointer(regs) ((regs)->rip) 132
104#define frame_pointer(regs) ((regs)->rbp) 133struct task_struct;
105#define stack_pointer(regs) ((regs)->rsp) 134
106#define regs_return_value(regs) ((regs)->rax) 135extern void ptrace_bts_take_timestamp(struct task_struct *, enum bts_qualifier);
107 136
108extern unsigned long profile_pc(struct pt_regs *regs); 137extern unsigned long profile_pc(struct pt_regs *regs);
138
139extern unsigned long
140convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs);
141
142#ifdef CONFIG_X86_32
143extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code);
144#else
109void signal_fault(struct pt_regs *regs, void __user *frame, char *where); 145void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
146#endif
110 147
111struct task_struct; 148#define regs_return_value(regs) ((regs)->ax)
149
150/*
151 * user_mode_vm(regs) determines whether a register set came from user mode.
152 * This is true if V8086 mode was enabled OR if the register set was from
153 * protected mode with RPL-3 CS value. This tricky test checks that with
154 * one comparison. Many places in the kernel can bypass this full check
155 * if they have already ruled out V8086 mode, so user_mode(regs) can be used.
156 */
157static inline int user_mode(struct pt_regs *regs)
158{
159#ifdef CONFIG_X86_32
160 return (regs->cs & SEGMENT_RPL_MASK) == USER_RPL;
161#else
162 return !!(regs->cs & 3);
163#endif
164}
165
166static inline int user_mode_vm(struct pt_regs *regs)
167{
168#ifdef CONFIG_X86_32
169 return ((regs->cs & SEGMENT_RPL_MASK) |
170 (regs->flags & VM_MASK)) >= USER_RPL;
171#else
172 return user_mode(regs);
173#endif
174}
175
176static inline int v8086_mode(struct pt_regs *regs)
177{
178#ifdef CONFIG_X86_32
179 return (regs->flags & VM_MASK);
180#else
181 return 0; /* No V86 mode support in long mode */
182#endif
183}
184
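user_mode()/user_mode_vm() are what trap and fault handlers use to decide whether the saved register frame came from user space. A hedged sketch of the typical shape of such a check (illustrative only; the kernel-side error path is abbreviated):

	/* Illustrative: typical mode check at the top of an exception handler. */
	static void example_trap_handler(struct pt_regs *regs, long error_code)
	{
		if (user_mode_vm(regs)) {
			/* Fault from user space (incl. V86 on 32-bit): signal the task. */
			force_sig(SIGSEGV, current);
			return;
		}
		/* Fault from kernel space: fixups/oops path. */
		die("example trap", regs, error_code);
	}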
185/*
186 * On X86_32 the CPU doesn't save ss and esp if it is already in kernel
187 * mode when it traps, so the regs pointer itself is the current sp.
188 *
189 * This is valid only for kernel mode traps.
190 */
191static inline unsigned long kernel_trap_sp(struct pt_regs *regs)
192{
193#ifdef CONFIG_X86_32
194 return (unsigned long)regs;
195#else
196 return regs->sp;
197#endif
198}
199
200static inline unsigned long instruction_pointer(struct pt_regs *regs)
201{
202 return regs->ip;
203}
204
205static inline unsigned long frame_pointer(struct pt_regs *regs)
206{
207 return regs->bp;
208}
209
210/*
211 * These are defined as per linux/ptrace.h, which see.
212 */
213#define arch_has_single_step() (1)
214extern void user_enable_single_step(struct task_struct *);
215extern void user_disable_single_step(struct task_struct *);
216
217extern void user_enable_block_step(struct task_struct *);
218#ifdef CONFIG_X86_DEBUGCTLMSR
219#define arch_has_block_step() (1)
220#else
221#define arch_has_block_step() (boot_cpu_data.x86 >= 6)
222#endif
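arch_has_single_step()/arch_has_block_step() advertise the TF-flag and DEBUGCTL-BTF stepping support behind PTRACE_SINGLESTEP and the new PTRACE_SINGLEBLOCK request defined in ptrace-abi.h above. A hedged user-space sketch of block-stepping a stopped child (hypothetical debugger code, not part of this patch):

	#include <sys/ptrace.h>
	#include <sys/types.h>
	#include <sys/wait.h>

	/* Resume the child and stop again at the next taken branch. */
	static int step_one_block(pid_t pid)
	{
		int status;

		if (ptrace(PTRACE_SINGLEBLOCK, pid, NULL, NULL) < 0)
			return -1;
		waitpid(pid, &status, 0);	/* child reports SIGTRAP at the branch */
		return status;
	}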
223
224struct user_desc;
225extern int do_get_thread_area(struct task_struct *p, int idx,
226 struct user_desc __user *info);
227extern int do_set_thread_area(struct task_struct *p, int idx,
228 struct user_desc __user *info, int can_allocate);
112 229
113extern unsigned long
114convert_rip_to_linear(struct task_struct *child, struct pt_regs *regs);
115
116enum {
117 EF_CF = 0x00000001,
118 EF_PF = 0x00000004,
119 EF_AF = 0x00000010,
120 EF_ZF = 0x00000040,
121 EF_SF = 0x00000080,
122 EF_TF = 0x00000100,
123 EF_IE = 0x00000200,
124 EF_DF = 0x00000400,
125 EF_OF = 0x00000800,
126 EF_IOPL = 0x00003000,
127 EF_IOPL_RING0 = 0x00000000,
128 EF_IOPL_RING1 = 0x00001000,
129 EF_IOPL_RING2 = 0x00002000,
130 EF_NT = 0x00004000, /* nested task */
131 EF_RF = 0x00010000, /* resume */
132 EF_VM = 0x00020000, /* virtual mode */
133 EF_AC = 0x00040000, /* alignment */
134 EF_VIF = 0x00080000, /* virtual interrupt */
135 EF_VIP = 0x00100000, /* virtual interrupt pending */
136 EF_ID = 0x00200000, /* id */
137};
138#endif /* __KERNEL__ */ 230#endif /* __KERNEL__ */
139#endif /* !__i386__ */ 231
140#endif /* !__ASSEMBLY__ */ 232#endif /* !__ASSEMBLY__ */
141 233
142#endif 234#endif
diff --git a/include/asm-x86/resume-trace.h b/include/asm-x86/resume-trace.h
index 9b6dd093a9f7..46f725b0bc82 100644
--- a/include/asm-x86/resume-trace.h
+++ b/include/asm-x86/resume-trace.h
@@ -1,5 +1,20 @@
1#ifdef CONFIG_X86_32 1#ifndef _ASM_X86_RESUME_TRACE_H
2# include "resume-trace_32.h" 2#define _ASM_X86_RESUME_TRACE_H
3#else 3
4# include "resume-trace_64.h" 4#include <asm/asm.h>
5
6#define TRACE_RESUME(user) do { \
7 if (pm_trace_enabled) { \
8 void *tracedata; \
9 asm volatile(_ASM_MOV_UL " $1f,%0\n" \
10 ".section .tracedata,\"a\"\n" \
11 "1:\t.word %c1\n\t" \
12 _ASM_PTR " %c2\n" \
13 ".previous" \
14 :"=r" (tracedata) \
15 : "i" (__LINE__), "i" (__FILE__)); \
16 generate_resume_trace(tracedata, user); \
17 } \
18} while (0)
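TRACE_RESUME() stashes a (line, file) pair in the .tracedata section and passes its address to generate_resume_trace(), so a machine that hangs during resume can be matched to the last call site that executed. A hedged sketch of how a resume path is typically bracketed (illustrative; the real callers live in the PM core):

	/* Illustrative: bracket a device resume callback with trace points. */
	static int traced_resume(struct device *dev)
	{
		int error;

		TRACE_RESUME(0);		/* entering: record this call site */
		error = dev->bus->resume ? dev->bus->resume(dev) : 0;
		TRACE_RESUME(error);		/* leaving: record the outcome */
		return error;
	}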
19
5#endif 20#endif
diff --git a/include/asm-x86/resume-trace_32.h b/include/asm-x86/resume-trace_32.h
deleted file mode 100644
index ec9cfd656230..000000000000
--- a/include/asm-x86/resume-trace_32.h
+++ /dev/null
@@ -1,13 +0,0 @@
1#define TRACE_RESUME(user) do { \
2 if (pm_trace_enabled) { \
3 void *tracedata; \
4 asm volatile("movl $1f,%0\n" \
5 ".section .tracedata,\"a\"\n" \
6 "1:\t.word %c1\n" \
7 "\t.long %c2\n" \
8 ".previous" \
9 :"=r" (tracedata) \
10 : "i" (__LINE__), "i" (__FILE__)); \
11 generate_resume_trace(tracedata, user); \
12 } \
13} while (0)
diff --git a/include/asm-x86/resume-trace_64.h b/include/asm-x86/resume-trace_64.h
deleted file mode 100644
index 34bf998fdf62..000000000000
--- a/include/asm-x86/resume-trace_64.h
+++ /dev/null
@@ -1,13 +0,0 @@
1#define TRACE_RESUME(user) do { \
2 if (pm_trace_enabled) { \
3 void *tracedata; \
4 asm volatile("movq $1f,%0\n" \
5 ".section .tracedata,\"a\"\n" \
6 "1:\t.word %c1\n" \
7 "\t.quad %c2\n" \
8 ".previous" \
9 :"=r" (tracedata) \
10 : "i" (__LINE__), "i" (__FILE__)); \
11 generate_resume_trace(tracedata, user); \
12 } \
13} while (0)
diff --git a/include/asm-x86/rio.h b/include/asm-x86/rio.h
index c7350f6d2015..97cdcc9887ba 100644
--- a/include/asm-x86/rio.h
+++ b/include/asm-x86/rio.h
@@ -1,6 +1,6 @@
1/* 1/*
2 * Derived from include/asm-i386/mach-summit/mach_mpparse.h 2 * Derived from include/asm-x86/mach-summit/mach_mpparse.h
3 * and include/asm-i386/mach-default/bios_ebda.h 3 * and include/asm-x86/mach-default/bios_ebda.h
4 * 4 *
5 * Author: Laurent Vivier <Laurent.Vivier@bull.net> 5 * Author: Laurent Vivier <Laurent.Vivier@bull.net>
6 */ 6 */
diff --git a/include/asm-x86/rwlock.h b/include/asm-x86/rwlock.h
index f2b64a429e6b..6a8c0d645108 100644
--- a/include/asm-x86/rwlock.h
+++ b/include/asm-x86/rwlock.h
@@ -2,7 +2,6 @@
2#define _ASM_X86_RWLOCK_H 2#define _ASM_X86_RWLOCK_H
3 3
4#define RW_LOCK_BIAS 0x01000000 4#define RW_LOCK_BIAS 0x01000000
5#define RW_LOCK_BIAS_STR "0x01000000"
6 5
7/* Actual code is in asm/spinlock.h or in arch/x86/lib/rwlock.S */ 6/* Actual code is in asm/spinlock.h or in arch/x86/lib/rwlock.S */
8 7
diff --git a/include/asm-x86/rwsem.h b/include/asm-x86/rwsem.h
index 041906f3c6df..520a379f4b80 100644
--- a/include/asm-x86/rwsem.h
+++ b/include/asm-x86/rwsem.h
@@ -2,7 +2,7 @@
2 * 2 *
3 * Written by David Howells (dhowells@redhat.com). 3 * Written by David Howells (dhowells@redhat.com).
4 * 4 *
5 * Derived from asm-i386/semaphore.h 5 * Derived from asm-x86/semaphore.h
6 * 6 *
7 * 7 *
8 * The MSW of the count is the negated number of active writers and waiting 8 * The MSW of the count is the negated number of active writers and waiting
@@ -44,10 +44,14 @@
44 44
45struct rwsem_waiter; 45struct rwsem_waiter;
46 46
47extern struct rw_semaphore *FASTCALL(rwsem_down_read_failed(struct rw_semaphore *sem)); 47extern asmregparm struct rw_semaphore *
48extern struct rw_semaphore *FASTCALL(rwsem_down_write_failed(struct rw_semaphore *sem)); 48 rwsem_down_read_failed(struct rw_semaphore *sem);
49extern struct rw_semaphore *FASTCALL(rwsem_wake(struct rw_semaphore *)); 49extern asmregparm struct rw_semaphore *
50extern struct rw_semaphore *FASTCALL(rwsem_downgrade_wake(struct rw_semaphore *sem)); 50 rwsem_down_write_failed(struct rw_semaphore *sem);
51extern asmregparm struct rw_semaphore *
52 rwsem_wake(struct rw_semaphore *);
53extern asmregparm struct rw_semaphore *
54 rwsem_downgrade_wake(struct rw_semaphore *sem);
51 55
52/* 56/*
53 * the semaphore definition 57 * the semaphore definition
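
The FASTCALL() wrappers give way to the asmregparm annotation, which on 32-bit x86 amounts to GCC's regparm(3) attribute: the out-of-line slow-path helpers, written in assembly, expect their arguments in registers rather than on the stack. A hedged sketch of what that attribute does, with illustrative names (the regparm part only applies when building for i386):

#include <stdio.h>

/* sketch: the register-argument convention behind asmregparm (illustrative names) */
#ifdef __i386__
# define my_asmregparm __attribute__((regparm(3)))	/* args in %eax, %edx, %ecx */
#else
# define my_asmregparm					/* 64-bit passes args in registers anyway */
#endif

static my_asmregparm int add_three(int a, int b, int c)
{
	return a + b + c;	/* on i386: a, b, c arrive in eax, edx, ecx, not on the stack */
}

int main(void)
{
	printf("%d\n", add_three(1, 2, 3));
	return 0;
}
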
diff --git a/include/asm-x86/scatterlist.h b/include/asm-x86/scatterlist.h
index 3a1e76257a27..d13c197866d6 100644
--- a/include/asm-x86/scatterlist.h
+++ b/include/asm-x86/scatterlist.h
@@ -1,5 +1,35 @@
1#ifndef _ASM_X86_SCATTERLIST_H
2#define _ASM_X86_SCATTERLIST_H
3
4#include <asm/types.h>
5
6struct scatterlist {
7#ifdef CONFIG_DEBUG_SG
8 unsigned long sg_magic;
9#endif
10 unsigned long page_link;
11 unsigned int offset;
12 unsigned int length;
13 dma_addr_t dma_address;
14#ifdef CONFIG_X86_64
15 unsigned int dma_length;
16#endif
17};
18
19#define ARCH_HAS_SG_CHAIN
20#define ISA_DMA_THRESHOLD (0x00ffffff)
21
22/*
23 * These macros should be used after a pci_map_sg call has been done
24 * to get bus addresses of each of the SG entries and their lengths.
25 * You should only work with the number of sg entries pci_map_sg
26 * returns.
27 */
28#define sg_dma_address(sg) ((sg)->dma_address)
1#ifdef CONFIG_X86_32 29#ifdef CONFIG_X86_32
2# include "scatterlist_32.h" 30# define sg_dma_len(sg) ((sg)->length)
3#else 31#else
4# include "scatterlist_64.h" 32# define sg_dma_len(sg) ((sg)->dma_length)
33#endif
34
5#endif 35#endif
diff --git a/include/asm-x86/scatterlist_32.h b/include/asm-x86/scatterlist_32.h
deleted file mode 100644
index 0e7d997a34be..000000000000
--- a/include/asm-x86/scatterlist_32.h
+++ /dev/null
@@ -1,28 +0,0 @@
1#ifndef _I386_SCATTERLIST_H
2#define _I386_SCATTERLIST_H
3
4#include <asm/types.h>
5
6struct scatterlist {
7#ifdef CONFIG_DEBUG_SG
8 unsigned long sg_magic;
9#endif
10 unsigned long page_link;
11 unsigned int offset;
12 dma_addr_t dma_address;
13 unsigned int length;
14};
15
16#define ARCH_HAS_SG_CHAIN
17
18/* These macros should be used after a pci_map_sg call has been done
19 * to get bus addresses of each of the SG entries and their lengths.
20 * You should only work with the number of sg entries pci_map_sg
21 * returns.
22 */
23#define sg_dma_address(sg) ((sg)->dma_address)
24#define sg_dma_len(sg) ((sg)->length)
25
26#define ISA_DMA_THRESHOLD (0x00ffffff)
27
28#endif /* !(_I386_SCATTERLIST_H) */
diff --git a/include/asm-x86/scatterlist_64.h b/include/asm-x86/scatterlist_64.h
deleted file mode 100644
index 1847c72befeb..000000000000
--- a/include/asm-x86/scatterlist_64.h
+++ /dev/null
@@ -1,29 +0,0 @@
1#ifndef _X8664_SCATTERLIST_H
2#define _X8664_SCATTERLIST_H
3
4#include <asm/types.h>
5
6struct scatterlist {
7#ifdef CONFIG_DEBUG_SG
8 unsigned long sg_magic;
9#endif
10 unsigned long page_link;
11 unsigned int offset;
12 unsigned int length;
13 dma_addr_t dma_address;
14 unsigned int dma_length;
15};
16
17#define ARCH_HAS_SG_CHAIN
18
19#define ISA_DMA_THRESHOLD (0x00ffffff)
20
21/* These macros should be used after a pci_map_sg call has been done
22 * to get bus addresses of each of the SG entries and their lengths.
23 * You should only work with the number of sg entries pci_map_sg
24 * returns.
25 */
26#define sg_dma_address(sg) ((sg)->dma_address)
27#define sg_dma_len(sg) ((sg)->dma_length)
28
29#endif
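
The two deleted scatterlist headers define the same structure except that the 64-bit one carries a separate dma_length, because the mapped (bus) length of an entry can differ from the original buffer length once an IOMMU has merged entries; the unified header keeps that field under CONFIG_X86_64 and lets sg_dma_len() pick the right one. A small user-space sketch of that selection -- MOCK_X86_64 stands in for CONFIG_X86_64 and every name here is illustrative:

#include <stdio.h>

/* sketch: how sg_dma_len() picks the right field per configuration */
#define MOCK_X86_64 1

struct mock_sg {
	unsigned int offset;
	unsigned int length;		/* length of the original buffer */
	unsigned long dma_address;	/* bus address after mapping */
#if MOCK_X86_64
	unsigned int dma_length;	/* can differ once an IOMMU merges entries */
#endif
};

#define mock_sg_dma_address(sg)	((sg)->dma_address)
#if MOCK_X86_64
# define mock_sg_dma_len(sg)	((sg)->dma_length)
#else
# define mock_sg_dma_len(sg)	((sg)->length)
#endif

int main(void)
{
	struct mock_sg sg = {
		.length = 4096, .dma_address = 0x1000, .dma_length = 8192,
	};

	printf("bus %#lx, dma len %u\n", mock_sg_dma_address(&sg), mock_sg_dma_len(&sg));
	return 0;
}
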
diff --git a/include/asm-x86/segment.h b/include/asm-x86/segment.h
index 605068280e28..23f0535fec61 100644
--- a/include/asm-x86/segment.h
+++ b/include/asm-x86/segment.h
@@ -1,5 +1,204 @@
1#ifndef _ASM_X86_SEGMENT_H_
2#define _ASM_X86_SEGMENT_H_
3
4/* Simple and small GDT entries for booting only */
5
6#define GDT_ENTRY_BOOT_CS 2
7#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8)
8
9#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1)
10#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8)
11
12#define GDT_ENTRY_BOOT_TSS (GDT_ENTRY_BOOT_CS + 2)
13#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS * 8)
14
1#ifdef CONFIG_X86_32 15#ifdef CONFIG_X86_32
2# include "segment_32.h" 16/*
17 * The layout of the per-CPU GDT under Linux:
18 *
19 * 0 - null
20 * 1 - reserved
21 * 2 - reserved
22 * 3 - reserved
23 *
24 * 4 - unused <==== new cacheline
25 * 5 - unused
26 *
27 * ------- start of TLS (Thread-Local Storage) segments:
28 *
29 * 6 - TLS segment #1 [ glibc's TLS segment ]
30 * 7 - TLS segment #2 [ Wine's %fs Win32 segment ]
31 * 8 - TLS segment #3
32 * 9 - reserved
33 * 10 - reserved
34 * 11 - reserved
35 *
36 * ------- start of kernel segments:
37 *
38 * 12 - kernel code segment <==== new cacheline
39 * 13 - kernel data segment
40 * 14 - default user CS
41 * 15 - default user DS
42 * 16 - TSS
43 * 17 - LDT
44 * 18 - PNPBIOS support (16->32 gate)
45 * 19 - PNPBIOS support
46 * 20 - PNPBIOS support
47 * 21 - PNPBIOS support
48 * 22 - PNPBIOS support
49 * 23 - APM BIOS support
50 * 24 - APM BIOS support
51 * 25 - APM BIOS support
52 *
53 * 26 - ESPFIX small SS
54 * 27 - per-cpu [ offset to per-cpu data area ]
55 * 28 - unused
56 * 29 - unused
57 * 30 - unused
58 * 31 - TSS for double fault handler
59 */
60#define GDT_ENTRY_TLS_MIN 6
61#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
62
63#define GDT_ENTRY_DEFAULT_USER_CS 14
64#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
65
66#define GDT_ENTRY_DEFAULT_USER_DS 15
67#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
68
69#define GDT_ENTRY_KERNEL_BASE 12
70
71#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
72#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
73
74#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
75#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
76
77#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
78#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
79
80#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6)
81#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11)
82
83#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
84#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
85
86#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15)
87#ifdef CONFIG_SMP
88#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
3#else 89#else
4# include "segment_64.h" 90#define __KERNEL_PERCPU 0
91#endif
92
93#define GDT_ENTRY_DOUBLEFAULT_TSS 31
94
95/*
96 * The GDT has 32 entries
97 */
98#define GDT_ENTRIES 32
99
100/* The PnP BIOS entries in the GDT */
101#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
102#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1)
103#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2)
104#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3)
105#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4)
106
107/* The PnP BIOS selectors */
108#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */
109#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */
110#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */
111#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
112#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
113
114/* Bottom two bits of selector give the ring privilege level */
115#define SEGMENT_RPL_MASK 0x3
116/* Bit 2 is table indicator (LDT/GDT) */
117#define SEGMENT_TI_MASK 0x4
118
119/* User mode is privilege level 3 */
120#define USER_RPL 0x3
121/* LDT segment has TI set, GDT has it cleared */
122#define SEGMENT_LDT 0x4
123#define SEGMENT_GDT 0x0
124
125/*
126 * Matching rules for certain types of segments.
127 */
128
129/* Matches only __KERNEL_CS, ignoring PnP / USER / APM segments */
130#define SEGMENT_IS_KERNEL_CODE(x) (((x) & 0xfc) == GDT_ENTRY_KERNEL_CS * 8)
131
132/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */
133#define SEGMENT_IS_FLAT_CODE(x) (((x) & 0xec) == GDT_ENTRY_KERNEL_CS * 8)
134
135/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
136#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8)
137
138
139#else
140#include <asm/cache.h>
141
142#define __KERNEL_CS 0x10
143#define __KERNEL_DS 0x18
144
145#define __KERNEL32_CS 0x08
146
147/*
148 * we cannot use the same code segment descriptor for user and kernel
149 * -- not even in the long flat mode, because of different DPL /kkeil
150 * The segment offset needs to contain a RPL. Grr. -AK
151 * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)
152 */
153
154#define __USER32_CS 0x23 /* 4*8+3 */
155#define __USER_DS 0x2b /* 5*8+3 */
156#define __USER_CS 0x33 /* 6*8+3 */
157#define __USER32_DS __USER_DS
158
159#define GDT_ENTRY_TSS 8 /* needs two entries */
160#define GDT_ENTRY_LDT 10 /* needs two entries */
161#define GDT_ENTRY_TLS_MIN 12
162#define GDT_ENTRY_TLS_MAX 14
163
164#define GDT_ENTRY_PER_CPU 15 /* Abused to load per CPU data from limit */
165#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU * 8 + 3)
166
167/* TLS indexes for 64bit - hardcoded in arch_prctl */
168#define FS_TLS 0
169#define GS_TLS 1
170
171#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3)
172#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3)
173
174#define GDT_ENTRIES 16
175
176#endif
177
178#ifndef CONFIG_PARAVIRT
179#define get_kernel_rpl() 0
180#endif
181
182/* User mode is privilege level 3 */
183#define USER_RPL 0x3
184/* LDT segment has TI set, GDT has it cleared */
185#define SEGMENT_LDT 0x4
186#define SEGMENT_GDT 0x0
187
188/* Bottom two bits of selector give the ring privilege level */
189#define SEGMENT_RPL_MASK 0x3
190/* Bit 2 is table indicator (LDT/GDT) */
191#define SEGMENT_TI_MASK 0x4
192
193#define IDT_ENTRIES 256
194#define GDT_SIZE (GDT_ENTRIES * 8)
195#define GDT_ENTRY_TLS_ENTRIES 3
196#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
197
198#ifdef __KERNEL__
199#ifndef __ASSEMBLY__
200extern const char early_idt_handlers[IDT_ENTRIES][10];
201#endif
202#endif
203
5#endif 204#endif
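
All the selector arithmetic in this header follows the same encoding: the descriptor index sits in bits 15:3, bit 2 is the table indicator (GDT vs LDT) and bits 1:0 carry the requested privilege level, which is why __USER_CS is GDT_ENTRY_DEFAULT_USER_CS * 8 + 3 and why SEGMENT_RPL_MASK/SEGMENT_TI_MASK are 0x3 and 0x4. A short user-space sketch of that arithmetic, with illustrative helper names:

#include <stdio.h>

/* sketch: composing and decoding an x86 segment selector (illustrative helpers) */
#define RPL_MASK 0x3	/* bits 1:0 - requested privilege level */
#define TI_MASK  0x4	/* bit 2   - 0 = GDT, 1 = LDT */

static unsigned int make_selector(unsigned int index, int ldt, unsigned int rpl)
{
	return (index << 3) | (ldt ? TI_MASK : 0) | (rpl & RPL_MASK);
}

int main(void)
{
	/* GDT entry 14, RPL 3: the same arithmetic as __USER_CS = 14 * 8 + 3 */
	unsigned int user_cs = make_selector(14, 0, 3);

	printf("selector %#x -> index %u, %s, RPL %u\n",
	       user_cs, user_cs >> 3,
	       (user_cs & TI_MASK) ? "LDT" : "GDT",
	       user_cs & RPL_MASK);
	return 0;
}
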
diff --git a/include/asm-x86/segment_32.h b/include/asm-x86/segment_32.h
deleted file mode 100644
index 597a47c2515f..000000000000
--- a/include/asm-x86/segment_32.h
+++ /dev/null
@@ -1,148 +0,0 @@
1#ifndef _ASM_SEGMENT_H
2#define _ASM_SEGMENT_H
3
4/*
5 * The layout of the per-CPU GDT under Linux:
6 *
7 * 0 - null
8 * 1 - reserved
9 * 2 - reserved
10 * 3 - reserved
11 *
12 * 4 - unused <==== new cacheline
13 * 5 - unused
14 *
15 * ------- start of TLS (Thread-Local Storage) segments:
16 *
17 * 6 - TLS segment #1 [ glibc's TLS segment ]
18 * 7 - TLS segment #2 [ Wine's %fs Win32 segment ]
19 * 8 - TLS segment #3
20 * 9 - reserved
21 * 10 - reserved
22 * 11 - reserved
23 *
24 * ------- start of kernel segments:
25 *
26 * 12 - kernel code segment <==== new cacheline
27 * 13 - kernel data segment
28 * 14 - default user CS
29 * 15 - default user DS
30 * 16 - TSS
31 * 17 - LDT
32 * 18 - PNPBIOS support (16->32 gate)
33 * 19 - PNPBIOS support
34 * 20 - PNPBIOS support
35 * 21 - PNPBIOS support
36 * 22 - PNPBIOS support
37 * 23 - APM BIOS support
38 * 24 - APM BIOS support
39 * 25 - APM BIOS support
40 *
41 * 26 - ESPFIX small SS
42 * 27 - per-cpu [ offset to per-cpu data area ]
43 * 28 - unused
44 * 29 - unused
45 * 30 - unused
46 * 31 - TSS for double fault handler
47 */
48#define GDT_ENTRY_TLS_ENTRIES 3
49#define GDT_ENTRY_TLS_MIN 6
50#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
51
52#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
53
54#define GDT_ENTRY_DEFAULT_USER_CS 14
55#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
56
57#define GDT_ENTRY_DEFAULT_USER_DS 15
58#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
59
60#define GDT_ENTRY_KERNEL_BASE 12
61
62#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
63#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
64
65#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
66#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
67
68#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
69#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
70
71#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6)
72#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11)
73
74#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
75#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
76
77#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15)
78#ifdef CONFIG_SMP
79#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
80#else
81#define __KERNEL_PERCPU 0
82#endif
83
84#define GDT_ENTRY_DOUBLEFAULT_TSS 31
85
86/*
87 * The GDT has 32 entries
88 */
89#define GDT_ENTRIES 32
90#define GDT_SIZE (GDT_ENTRIES * 8)
91
92/* Simple and small GDT entries for booting only */
93
94#define GDT_ENTRY_BOOT_CS 2
95#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8)
96
97#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1)
98#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8)
99
100/* The PnP BIOS entries in the GDT */
101#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
102#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1)
103#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2)
104#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3)
105#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4)
106
107/* The PnP BIOS selectors */
108#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */
109#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */
110#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */
111#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
112#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
113
114/*
115 * The interrupt descriptor table has room for 256 idt's,
116 * the global descriptor table is dependent on the number
117 * of tasks we can have..
118 */
119#define IDT_ENTRIES 256
120
121/* Bottom two bits of selector give the ring privilege level */
122#define SEGMENT_RPL_MASK 0x3
123/* Bit 2 is table indicator (LDT/GDT) */
124#define SEGMENT_TI_MASK 0x4
125
126/* User mode is privilege level 3 */
127#define USER_RPL 0x3
128/* LDT segment has TI set, GDT has it cleared */
129#define SEGMENT_LDT 0x4
130#define SEGMENT_GDT 0x0
131
132#ifndef CONFIG_PARAVIRT
133#define get_kernel_rpl() 0
134#endif
135/*
136 * Matching rules for certain types of segments.
137 */
138
139/* Matches only __KERNEL_CS, ignoring PnP / USER / APM segments */
140#define SEGMENT_IS_KERNEL_CODE(x) (((x) & 0xfc) == GDT_ENTRY_KERNEL_CS * 8)
141
142/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */
143#define SEGMENT_IS_FLAT_CODE(x) (((x) & 0xec) == GDT_ENTRY_KERNEL_CS * 8)
144
145/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
146#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8)
147
148#endif
diff --git a/include/asm-x86/segment_64.h b/include/asm-x86/segment_64.h
deleted file mode 100644
index 04b8ab21328f..000000000000
--- a/include/asm-x86/segment_64.h
+++ /dev/null
@@ -1,53 +0,0 @@
1#ifndef _ASM_SEGMENT_H
2#define _ASM_SEGMENT_H
3
4#include <asm/cache.h>
5
6/* Simple and small GDT entries for booting only */
7
8#define GDT_ENTRY_BOOT_CS 2
9#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8)
10
11#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1)
12#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8)
13
14#define __KERNEL_CS 0x10
15#define __KERNEL_DS 0x18
16
17#define __KERNEL32_CS 0x08
18
19/*
20 * we cannot use the same code segment descriptor for user and kernel
21 * -- not even in the long flat mode, because of different DPL /kkeil
22 * The segment offset needs to contain a RPL. Grr. -AK
23 * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)
24 */
25
26#define __USER32_CS 0x23 /* 4*8+3 */
27#define __USER_DS 0x2b /* 5*8+3 */
28#define __USER_CS 0x33 /* 6*8+3 */
29#define __USER32_DS __USER_DS
30
31#define GDT_ENTRY_TSS 8 /* needs two entries */
32#define GDT_ENTRY_LDT 10 /* needs two entries */
33#define GDT_ENTRY_TLS_MIN 12
34#define GDT_ENTRY_TLS_MAX 14
35
36#define GDT_ENTRY_TLS_ENTRIES 3
37
38#define GDT_ENTRY_PER_CPU 15 /* Abused to load per CPU data from limit */
39#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU * 8 + 3)
40
41/* TLS indexes for 64bit - hardcoded in arch_prctl */
42#define FS_TLS 0
43#define GS_TLS 1
44
45#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3)
46#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3)
47
48#define IDT_ENTRIES 256
49#define GDT_ENTRIES 16
50#define GDT_SIZE (GDT_ENTRIES * 8)
51#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
52
53#endif
diff --git a/include/asm-x86/semaphore_32.h b/include/asm-x86/semaphore_32.h
index 835c1d751a9f..ac96d3804d0c 100644
--- a/include/asm-x86/semaphore_32.h
+++ b/include/asm-x86/semaphore_32.h
@@ -83,10 +83,10 @@ static inline void init_MUTEX_LOCKED (struct semaphore *sem)
83 sema_init(sem, 0); 83 sema_init(sem, 0);
84} 84}
85 85
86fastcall void __down_failed(void /* special register calling convention */); 86extern asmregparm void __down_failed(atomic_t *count_ptr);
87fastcall int __down_failed_interruptible(void /* params in registers */); 87extern asmregparm int __down_failed_interruptible(atomic_t *count_ptr);
88fastcall int __down_failed_trylock(void /* params in registers */); 88extern asmregparm int __down_failed_trylock(atomic_t *count_ptr);
89fastcall void __up_wakeup(void /* special register calling convention */); 89extern asmregparm void __up_wakeup(atomic_t *count_ptr);
90 90
91/* 91/*
92 * This is ugly, but we want the default case to fall through. 92 * This is ugly, but we want the default case to fall through.
diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h
index 24d786e07b49..071e054abd82 100644
--- a/include/asm-x86/setup.h
+++ b/include/asm-x86/setup.h
@@ -3,6 +3,13 @@
3 3
4#define COMMAND_LINE_SIZE 2048 4#define COMMAND_LINE_SIZE 2048
5 5
6#ifndef __ASSEMBLY__
7char *machine_specific_memory_setup(void);
8#ifndef CONFIG_PARAVIRT
9#define paravirt_post_allocator_init() do {} while (0)
10#endif
11#endif /* __ASSEMBLY__ */
12
6#ifdef __KERNEL__ 13#ifdef __KERNEL__
7 14
8#ifdef __i386__ 15#ifdef __i386__
@@ -51,9 +58,7 @@ void __init add_memory_region(unsigned long long start,
51 58
52extern unsigned long init_pg_tables_end; 59extern unsigned long init_pg_tables_end;
53 60
54#ifndef CONFIG_PARAVIRT 61
55#define paravirt_post_allocator_init() do {} while (0)
56#endif
57 62
58#endif /* __i386__ */ 63#endif /* __i386__ */
59#endif /* _SETUP */ 64#endif /* _SETUP */
diff --git a/include/asm-x86/sigcontext.h b/include/asm-x86/sigcontext.h
index c047f9dc3423..681deade5f00 100644
--- a/include/asm-x86/sigcontext.h
+++ b/include/asm-x86/sigcontext.h
@@ -63,20 +63,20 @@ struct sigcontext {
63 unsigned short fs, __fsh; 63 unsigned short fs, __fsh;
64 unsigned short es, __esh; 64 unsigned short es, __esh;
65 unsigned short ds, __dsh; 65 unsigned short ds, __dsh;
66 unsigned long edi; 66 unsigned long di;
67 unsigned long esi; 67 unsigned long si;
68 unsigned long ebp; 68 unsigned long bp;
69 unsigned long esp; 69 unsigned long sp;
70 unsigned long ebx; 70 unsigned long bx;
71 unsigned long edx; 71 unsigned long dx;
72 unsigned long ecx; 72 unsigned long cx;
73 unsigned long eax; 73 unsigned long ax;
74 unsigned long trapno; 74 unsigned long trapno;
75 unsigned long err; 75 unsigned long err;
76 unsigned long eip; 76 unsigned long ip;
77 unsigned short cs, __csh; 77 unsigned short cs, __csh;
78 unsigned long eflags; 78 unsigned long flags;
79 unsigned long esp_at_signal; 79 unsigned long sp_at_signal;
80 unsigned short ss, __ssh; 80 unsigned short ss, __ssh;
81 struct _fpstate __user * fpstate; 81 struct _fpstate __user * fpstate;
82 unsigned long oldmask; 82 unsigned long oldmask;
@@ -111,16 +111,16 @@ struct sigcontext {
111 unsigned long r13; 111 unsigned long r13;
112 unsigned long r14; 112 unsigned long r14;
113 unsigned long r15; 113 unsigned long r15;
114 unsigned long rdi; 114 unsigned long di;
115 unsigned long rsi; 115 unsigned long si;
116 unsigned long rbp; 116 unsigned long bp;
117 unsigned long rbx; 117 unsigned long bx;
118 unsigned long rdx; 118 unsigned long dx;
119 unsigned long rax; 119 unsigned long ax;
120 unsigned long rcx; 120 unsigned long cx;
121 unsigned long rsp; 121 unsigned long sp;
122 unsigned long rip; 122 unsigned long ip;
123 unsigned long eflags; /* RFLAGS */ 123 unsigned long flags;
124 unsigned short cs; 124 unsigned short cs;
125 unsigned short gs; 125 unsigned short gs;
126 unsigned short fs; 126 unsigned short fs;
diff --git a/include/asm-x86/sigcontext32.h b/include/asm-x86/sigcontext32.h
index 3d657038ab7c..6ffab4fd593a 100644
--- a/include/asm-x86/sigcontext32.h
+++ b/include/asm-x86/sigcontext32.h
@@ -48,20 +48,20 @@ struct sigcontext_ia32 {
48 unsigned short fs, __fsh; 48 unsigned short fs, __fsh;
49 unsigned short es, __esh; 49 unsigned short es, __esh;
50 unsigned short ds, __dsh; 50 unsigned short ds, __dsh;
51 unsigned int edi; 51 unsigned int di;
52 unsigned int esi; 52 unsigned int si;
53 unsigned int ebp; 53 unsigned int bp;
54 unsigned int esp; 54 unsigned int sp;
55 unsigned int ebx; 55 unsigned int bx;
56 unsigned int edx; 56 unsigned int dx;
57 unsigned int ecx; 57 unsigned int cx;
58 unsigned int eax; 58 unsigned int ax;
59 unsigned int trapno; 59 unsigned int trapno;
60 unsigned int err; 60 unsigned int err;
61 unsigned int eip; 61 unsigned int ip;
62 unsigned short cs, __csh; 62 unsigned short cs, __csh;
63 unsigned int eflags; 63 unsigned int flags;
64 unsigned int esp_at_signal; 64 unsigned int sp_at_signal;
65 unsigned short ss, __ssh; 65 unsigned short ss, __ssh;
66 unsigned int fpstate; /* really (struct _fpstate_ia32 *) */ 66 unsigned int fpstate; /* really (struct _fpstate_ia32 *) */
67 unsigned int oldmask; 67 unsigned int oldmask;
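
Both sigcontext layouts drop the width prefixes from their register fields (edi/rdi become di, eip/rip become ip, eflags becomes flags, and so on); the offsets and types are untouched, only the names change, so code shared between the 32-bit and 64-bit trees can say regs->ip or regs->sp without per-width #ifdefs. A minimal illustration of that benefit, using a simplified stand-in structure rather than the real sigcontext:

#include <stdio.h>

/* sketch: width-agnostic field names let one helper serve both worlds
 * (simplified stand-in for the renamed register fields) */
struct fault_regs {
	unsigned long ip;	/* was eip on 32-bit, rip on 64-bit */
	unsigned long sp;	/* was esp / rsp */
	unsigned long flags;	/* was eflags */
};

static void report_fault(const struct fault_regs *regs)
{
	printf("fault at ip=%#lx sp=%#lx flags=%#lx\n",
	       regs->ip, regs->sp, regs->flags);
}

int main(void)
{
	struct fault_regs regs = { .ip = 0x1234, .sp = 0x7fff0000, .flags = 0x202 };

	report_fault(&regs);
	return 0;
}
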
diff --git a/include/asm-x86/signal.h b/include/asm-x86/signal.h
index 987a422a2c78..aee7eca585ab 100644
--- a/include/asm-x86/signal.h
+++ b/include/asm-x86/signal.h
@@ -245,21 +245,14 @@ static __inline__ int sigfindinword(unsigned long word)
245 245
246struct pt_regs; 246struct pt_regs;
247 247
248#define ptrace_signal_deliver(regs, cookie) \
249 do { \
250 if (current->ptrace & PT_DTRACE) { \
251 current->ptrace &= ~PT_DTRACE; \
252 (regs)->eflags &= ~TF_MASK; \
253 } \
254 } while (0)
255
256#else /* __i386__ */ 248#else /* __i386__ */
257 249
258#undef __HAVE_ARCH_SIG_BITOPS 250#undef __HAVE_ARCH_SIG_BITOPS
259 251
252#endif /* !__i386__ */
253
260#define ptrace_signal_deliver(regs, cookie) do { } while (0) 254#define ptrace_signal_deliver(regs, cookie) do { } while (0)
261 255
262#endif /* !__i386__ */
263#endif /* __KERNEL__ */ 256#endif /* __KERNEL__ */
264#endif /* __ASSEMBLY__ */ 257#endif /* __ASSEMBLY__ */
265 258
diff --git a/include/asm-x86/smp_32.h b/include/asm-x86/smp_32.h
index e10b7affdfe5..56152e312287 100644
--- a/include/asm-x86/smp_32.h
+++ b/include/asm-x86/smp_32.h
@@ -1,51 +1,41 @@
1#ifndef __ASM_SMP_H 1#ifndef __ASM_SMP_H
2#define __ASM_SMP_H 2#define __ASM_SMP_H
3 3
4#ifndef __ASSEMBLY__
5#include <linux/cpumask.h>
6#include <linux/init.h>
7
4/* 8/*
5 * We need the APIC definitions automatically as part of 'smp.h' 9 * We need the APIC definitions automatically as part of 'smp.h'
6 */ 10 */
7#ifndef __ASSEMBLY__ 11#ifdef CONFIG_X86_LOCAL_APIC
8#include <linux/kernel.h> 12# include <asm/mpspec.h>
9#include <linux/threads.h> 13# include <asm/apic.h>
10#include <linux/cpumask.h> 14# ifdef CONFIG_X86_IO_APIC
15# include <asm/io_apic.h>
16# endif
11#endif 17#endif
12 18
13#if defined(CONFIG_X86_LOCAL_APIC) && !defined(__ASSEMBLY__) 19extern cpumask_t cpu_callout_map;
14#include <linux/bitops.h> 20extern cpumask_t cpu_callin_map;
15#include <asm/mpspec.h>
16#include <asm/apic.h>
17#ifdef CONFIG_X86_IO_APIC
18#include <asm/io_apic.h>
19#endif
20#endif
21 21
22#define BAD_APICID 0xFFu 22extern int smp_num_siblings;
23#ifdef CONFIG_SMP 23extern unsigned int num_processors;
24#ifndef __ASSEMBLY__
25 24
26/*
27 * Private routines/data
28 */
29
30extern void smp_alloc_memory(void); 25extern void smp_alloc_memory(void);
31extern int pic_mode; 26extern void lock_ipi_call_lock(void);
32extern int smp_num_siblings; 27extern void unlock_ipi_call_lock(void);
33DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
34DECLARE_PER_CPU(cpumask_t, cpu_core_map);
35 28
36extern void (*mtrr_hook) (void); 29extern void (*mtrr_hook) (void);
37extern void zap_low_mappings (void); 30extern void zap_low_mappings (void);
38extern void lock_ipi_call_lock(void);
39extern void unlock_ipi_call_lock(void);
40 31
41#define MAX_APICID 256
42extern u8 __initdata x86_cpu_to_apicid_init[]; 32extern u8 __initdata x86_cpu_to_apicid_init[];
43extern void *x86_cpu_to_apicid_ptr; 33extern void *x86_cpu_to_apicid_early_ptr;
44DECLARE_PER_CPU(u8, x86_cpu_to_apicid);
45
46#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
47 34
48extern void set_cpu_sibling_map(int cpu); 35DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
36DECLARE_PER_CPU(cpumask_t, cpu_core_map);
37DECLARE_PER_CPU(u8, cpu_llc_id);
38DECLARE_PER_CPU(u8, x86_cpu_to_apicid);
49 39
50#ifdef CONFIG_HOTPLUG_CPU 40#ifdef CONFIG_HOTPLUG_CPU
51extern void cpu_exit_clear(void); 41extern void cpu_exit_clear(void);
@@ -53,6 +43,9 @@ extern void cpu_uninit(void);
53extern void remove_siblinginfo(int cpu); 43extern void remove_siblinginfo(int cpu);
54#endif 44#endif
55 45
46/* Globals due to paravirt */
47extern void set_cpu_sibling_map(int cpu);
48
56struct smp_ops 49struct smp_ops
57{ 50{
58 void (*smp_prepare_boot_cpu)(void); 51 void (*smp_prepare_boot_cpu)(void);
@@ -67,6 +60,7 @@ struct smp_ops
67 int wait); 60 int wait);
68}; 61};
69 62
63#ifdef CONFIG_SMP
70extern struct smp_ops smp_ops; 64extern struct smp_ops smp_ops;
71 65
72static inline void smp_prepare_boot_cpu(void) 66static inline void smp_prepare_boot_cpu(void)
@@ -107,10 +101,12 @@ int native_cpu_up(unsigned int cpunum);
107void native_smp_cpus_done(unsigned int max_cpus); 101void native_smp_cpus_done(unsigned int max_cpus);
108 102
109#ifndef CONFIG_PARAVIRT 103#ifndef CONFIG_PARAVIRT
110#define startup_ipi_hook(phys_apicid, start_eip, start_esp) \ 104#define startup_ipi_hook(phys_apicid, start_eip, start_esp) do { } while (0)
111do { } while (0)
112#endif 105#endif
113 106
107extern int __cpu_disable(void);
108extern void __cpu_die(unsigned int cpu);
109
114/* 110/*
115 * This function is needed by all SMP systems. It must _always_ be valid 111 * This function is needed by all SMP systems. It must _always_ be valid
116 * from the initial startup. We map APIC_BASE very early in page_setup(), 112 * from the initial startup. We map APIC_BASE very early in page_setup(),
@@ -119,9 +115,11 @@ do { } while (0)
119DECLARE_PER_CPU(int, cpu_number); 115DECLARE_PER_CPU(int, cpu_number);
120#define raw_smp_processor_id() (x86_read_percpu(cpu_number)) 116#define raw_smp_processor_id() (x86_read_percpu(cpu_number))
121 117
122extern cpumask_t cpu_callout_map; 118#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
123extern cpumask_t cpu_callin_map; 119
124extern cpumask_t cpu_possible_map; 120extern int safe_smp_processor_id(void);
121
122void __cpuinit smp_store_cpu_info(int id);
125 123
126/* We don't mark CPUs online until __cpu_up(), so we need another measure */ 124/* We don't mark CPUs online until __cpu_up(), so we need another measure */
127static inline int num_booting_cpus(void) 125static inline int num_booting_cpus(void)
@@ -129,56 +127,39 @@ static inline int num_booting_cpus(void)
129 return cpus_weight(cpu_callout_map); 127 return cpus_weight(cpu_callout_map);
130} 128}
131 129
132extern int safe_smp_processor_id(void);
133extern int __cpu_disable(void);
134extern void __cpu_die(unsigned int cpu);
135extern unsigned int num_processors;
136
137void __cpuinit smp_store_cpu_info(int id);
138
139#endif /* !__ASSEMBLY__ */
140
141#else /* CONFIG_SMP */ 130#else /* CONFIG_SMP */
142 131
143#define safe_smp_processor_id() 0 132#define safe_smp_processor_id() 0
144#define cpu_physical_id(cpu) boot_cpu_physical_apicid 133#define cpu_physical_id(cpu) boot_cpu_physical_apicid
145 134
146#define NO_PROC_ID 0xFF /* No processor magic marker */ 135#endif /* !CONFIG_SMP */
147
148#endif /* CONFIG_SMP */
149
150#ifndef __ASSEMBLY__
151 136
152#ifdef CONFIG_X86_LOCAL_APIC 137#ifdef CONFIG_X86_LOCAL_APIC
153 138
154#ifdef APIC_DEFINITION 139static __inline int logical_smp_processor_id(void)
140{
141 /* we don't want to mark this access volatile - bad code generation */
142 return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
143}
144
145# ifdef APIC_DEFINITION
155extern int hard_smp_processor_id(void); 146extern int hard_smp_processor_id(void);
156#else 147# else
157#include <mach_apicdef.h> 148# include <mach_apicdef.h>
158static inline int hard_smp_processor_id(void) 149static inline int hard_smp_processor_id(void)
159{ 150{
160 /* we don't want to mark this access volatile - bad code generation */ 151 /* we don't want to mark this access volatile - bad code generation */
161 return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID)); 152 return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
162} 153}
163#endif /* APIC_DEFINITION */ 154# endif /* APIC_DEFINITION */
164 155
165#else /* CONFIG_X86_LOCAL_APIC */ 156#else /* CONFIG_X86_LOCAL_APIC */
166 157
167#ifndef CONFIG_SMP 158# ifndef CONFIG_SMP
168#define hard_smp_processor_id() 0 159# define hard_smp_processor_id() 0
169#endif 160# endif
170 161
171#endif /* CONFIG_X86_LOCAL_APIC */ 162#endif /* CONFIG_X86_LOCAL_APIC */
172 163
173extern u8 apicid_2_node[]; 164#endif /* !ASSEMBLY */
174
175#ifdef CONFIG_X86_LOCAL_APIC
176static __inline int logical_smp_processor_id(void)
177{
178 /* we don't want to mark this access volatile - bad code generation */
179 return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
180}
181#endif
182#endif
183
184#endif 165#endif
diff --git a/include/asm-x86/smp_64.h b/include/asm-x86/smp_64.h
index ab612b0ff270..e0a75519ad21 100644
--- a/include/asm-x86/smp_64.h
+++ b/include/asm-x86/smp_64.h
@@ -1,130 +1,101 @@
1#ifndef __ASM_SMP_H 1#ifndef __ASM_SMP_H
2#define __ASM_SMP_H 2#define __ASM_SMP_H
3 3
4/*
5 * We need the APIC definitions automatically as part of 'smp.h'
6 */
7#include <linux/threads.h>
8#include <linux/cpumask.h> 4#include <linux/cpumask.h>
9#include <linux/bitops.h>
10#include <linux/init.h> 5#include <linux/init.h>
11extern int disable_apic;
12 6
13#include <asm/mpspec.h> 7/*
8 * We need the APIC definitions automatically as part of 'smp.h'
9 */
14#include <asm/apic.h> 10#include <asm/apic.h>
15#include <asm/io_apic.h> 11#include <asm/io_apic.h>
16#include <asm/thread_info.h> 12#include <asm/mpspec.h>
17
18#ifdef CONFIG_SMP
19
20#include <asm/pda.h> 13#include <asm/pda.h>
14#include <asm/thread_info.h>
21 15
22struct pt_regs;
23
24extern cpumask_t cpu_present_mask;
25extern cpumask_t cpu_possible_map;
26extern cpumask_t cpu_online_map;
27extern cpumask_t cpu_callout_map; 16extern cpumask_t cpu_callout_map;
28extern cpumask_t cpu_initialized; 17extern cpumask_t cpu_initialized;
29 18
30/* 19extern int smp_num_siblings;
31 * Private routines/data 20extern unsigned int num_processors;
32 */ 21
33
34extern void smp_alloc_memory(void); 22extern void smp_alloc_memory(void);
35extern volatile unsigned long smp_invalidate_needed;
36extern void lock_ipi_call_lock(void); 23extern void lock_ipi_call_lock(void);
37extern void unlock_ipi_call_lock(void); 24extern void unlock_ipi_call_lock(void);
38extern int smp_num_siblings; 25
39extern void smp_send_reschedule(int cpu);
40extern int smp_call_function_mask(cpumask_t mask, void (*func)(void *), 26extern int smp_call_function_mask(cpumask_t mask, void (*func)(void *),
41 void *info, int wait); 27 void *info, int wait);
42 28
43/* 29extern u16 __initdata x86_cpu_to_apicid_init[];
44 * cpu_sibling_map and cpu_core_map now live 30extern u16 __initdata x86_bios_cpu_apicid_init[];
45 * in the per cpu area 31extern void *x86_cpu_to_apicid_early_ptr;
46 * 32extern void *x86_bios_cpu_apicid_early_ptr;
47 * extern cpumask_t cpu_sibling_map[NR_CPUS]; 33
48 * extern cpumask_t cpu_core_map[NR_CPUS];
49 */
50DECLARE_PER_CPU(cpumask_t, cpu_sibling_map); 34DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
51DECLARE_PER_CPU(cpumask_t, cpu_core_map); 35DECLARE_PER_CPU(cpumask_t, cpu_core_map);
52DECLARE_PER_CPU(u8, cpu_llc_id); 36DECLARE_PER_CPU(u16, cpu_llc_id);
53 37DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
54#define SMP_TRAMPOLINE_BASE 0x6000 38DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);
55
56/*
57 * On x86 all CPUs are mapped 1:1 to the APIC space.
58 * This simplifies scheduling and IPI sending and
59 * compresses data structures.
60 */
61 39
62static inline int num_booting_cpus(void) 40static inline int cpu_present_to_apicid(int mps_cpu)
63{ 41{
64 return cpus_weight(cpu_callout_map); 42 if (cpu_present(mps_cpu))
43 return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
44 else
45 return BAD_APICID;
65} 46}
66 47
67#define raw_smp_processor_id() read_pda(cpunumber) 48#ifdef CONFIG_SMP
49
50#define SMP_TRAMPOLINE_BASE 0x6000
68 51
69extern int __cpu_disable(void); 52extern int __cpu_disable(void);
70extern void __cpu_die(unsigned int cpu); 53extern void __cpu_die(unsigned int cpu);
71extern void prefill_possible_map(void); 54extern void prefill_possible_map(void);
72extern unsigned num_processors;
73extern unsigned __cpuinitdata disabled_cpus; 55extern unsigned __cpuinitdata disabled_cpus;
74 56
75#define NO_PROC_ID 0xFF /* No processor magic marker */ 57#define raw_smp_processor_id() read_pda(cpunumber)
76 58#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
77#endif /* CONFIG_SMP */
78 59
79#define safe_smp_processor_id() smp_processor_id() 60#define stack_smp_processor_id() \
80 61 ({ \
81static inline int hard_smp_processor_id(void) 62 struct thread_info *ti; \
82{ 63 __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
83 /* we don't want to mark this access volatile - bad code generation */ 64 ti->cpu; \
84 return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID)); 65})
85}
86 66
87/* 67/*
88 * Some lowlevel functions might want to know about 68 * On x86 all CPUs are mapped 1:1 to the APIC space. This simplifies
89 * the real APIC ID <-> CPU # mapping. 69 * scheduling and IPI sending and compresses data structures.
90 */ 70 */
91extern u8 __initdata x86_cpu_to_apicid_init[]; 71static inline int num_booting_cpus(void)
92extern void *x86_cpu_to_apicid_ptr;
93DECLARE_PER_CPU(u8, x86_cpu_to_apicid); /* physical ID */
94extern u8 bios_cpu_apicid[];
95
96static inline int cpu_present_to_apicid(int mps_cpu)
97{ 72{
98 if (mps_cpu < NR_CPUS) 73 return cpus_weight(cpu_callout_map);
99 return (int)bios_cpu_apicid[mps_cpu];
100 else
101 return BAD_APICID;
102} 74}
103 75
104#ifndef CONFIG_SMP 76extern void smp_send_reschedule(int cpu);
77
78#else /* CONFIG_SMP */
79
80extern unsigned int boot_cpu_id;
81#define cpu_physical_id(cpu) boot_cpu_id
105#define stack_smp_processor_id() 0 82#define stack_smp_processor_id() 0
106#define cpu_logical_map(x) (x) 83
107#else 84#endif /* !CONFIG_SMP */
108#include <asm/thread_info.h> 85
109#define stack_smp_processor_id() \ 86#define safe_smp_processor_id() smp_processor_id()
110({ \
111 struct thread_info *ti; \
112 __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
113 ti->cpu; \
114})
115#endif
116 87
117static __inline int logical_smp_processor_id(void) 88static __inline int logical_smp_processor_id(void)
118{ 89{
119 /* we don't want to mark this access volatile - bad code generation */ 90 /* we don't want to mark this access volatile - bad code generation */
120 return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR)); 91 return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
92}
93
94static inline int hard_smp_processor_id(void)
95{
96 /* we don't want to mark this access volatile - bad code generation */
97 return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
121} 98}
122 99
123#ifdef CONFIG_SMP
124#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
125#else
126extern unsigned int boot_cpu_id;
127#define cpu_physical_id(cpu) boot_cpu_id
128#endif /* !CONFIG_SMP */
129#endif 100#endif
130 101
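
Both smp headers now read the local APIC ID as a 32-bit load from the memory-mapped APIC (GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID))), which only works in kernel context. The closest user-space analogue -- the initial APIC ID of the CPU the thread happens to run on -- comes from CPUID leaf 1, bits 31:24 of EBX; a small sketch using GCC's <cpuid.h> helper (the value is per-CPU, so it can change if the scheduler migrates the task):

#include <stdio.h>
#include <cpuid.h>

/* sketch: user-space analogue of hard_smp_processor_id() - initial APIC ID
 * from CPUID leaf 1, EBX[31:24] */
int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
		fprintf(stderr, "CPUID leaf 1 not supported\n");
		return 1;
	}
	printf("initial APIC ID: %u\n", (ebx >> 24) & 0xff);
	return 0;
}
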
diff --git a/include/asm-x86/sparsemem.h b/include/asm-x86/sparsemem.h
index 3f203b1d9ee8..fa58cd55411a 100644
--- a/include/asm-x86/sparsemem.h
+++ b/include/asm-x86/sparsemem.h
@@ -1,5 +1,34 @@
1#ifndef _ASM_X86_SPARSEMEM_H
2#define _ASM_X86_SPARSEMEM_H
3
4#ifdef CONFIG_SPARSEMEM
5/*
6 * generic non-linear memory support:
7 *
8 * 1) we will not split memory into more chunks than will fit into the flags
9 * field of the struct page
10 *
11 * SECTION_SIZE_BITS 2^n: size of each section
12 * MAX_PHYSADDR_BITS 2^n: max size of physical address space
13 * MAX_PHYSMEM_BITS 2^n: how much memory we can have in that space
14 *
15 */
16
1#ifdef CONFIG_X86_32 17#ifdef CONFIG_X86_32
2# include "sparsemem_32.h" 18# ifdef CONFIG_X86_PAE
3#else 19# define SECTION_SIZE_BITS 30
4# include "sparsemem_64.h" 20# define MAX_PHYSADDR_BITS 36
21# define MAX_PHYSMEM_BITS 36
22# else
23# define SECTION_SIZE_BITS 26
24# define MAX_PHYSADDR_BITS 32
25# define MAX_PHYSMEM_BITS 32
26# endif
27#else /* CONFIG_X86_32 */
28# define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */
29# define MAX_PHYSADDR_BITS 40
30# define MAX_PHYSMEM_BITS 40
31#endif
32
33#endif /* CONFIG_SPARSEMEM */
5#endif 34#endif
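
As the comment above says, SECTION_SIZE_BITS is the log2 of a section's size and MAX_PHYSMEM_BITS the log2 of the addressable memory, so sparsemem needs 2^(MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) section entries. A quick arithmetic check of the values chosen above (user-space sketch, illustrative names):

#include <stdio.h>

/* sketch: how many sections the values in sparsemem.h imply */
static void report(const char *cfg, unsigned int section_bits, unsigned int physmem_bits)
{
	unsigned long long section_bytes = 1ULL << section_bits;
	unsigned long long nr_sections   = 1ULL << (physmem_bits - section_bits);

	printf("%-12s section = %4llu MiB, sections = %llu\n",
	       cfg, section_bytes >> 20, nr_sections);
}

int main(void)
{
	report("32-bit",     26, 32);	/*  64 MiB sections,   64 sections */
	report("32-bit PAE", 30, 36);	/*   1 GiB sections,   64 sections */
	report("64-bit",     27, 40);	/* 128 MiB sections, 8192 sections */
	return 0;
}
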
diff --git a/include/asm-x86/sparsemem_32.h b/include/asm-x86/sparsemem_32.h
deleted file mode 100644
index cfeed990585f..000000000000
--- a/include/asm-x86/sparsemem_32.h
+++ /dev/null
@@ -1,31 +0,0 @@
1#ifndef _I386_SPARSEMEM_H
2#define _I386_SPARSEMEM_H
3#ifdef CONFIG_SPARSEMEM
4
5/*
6 * generic non-linear memory support:
7 *
8 * 1) we will not split memory into more chunks than will fit into the
9 * flags field of the struct page
10 */
11
12/*
13 * SECTION_SIZE_BITS 2^N: how big each section will be
14 * MAX_PHYSADDR_BITS 2^N: how much physical address space we have
15 * MAX_PHYSMEM_BITS 2^N: how much memory we can have in that space
16 */
17#ifdef CONFIG_X86_PAE
18#define SECTION_SIZE_BITS 30
19#define MAX_PHYSADDR_BITS 36
20#define MAX_PHYSMEM_BITS 36
21#else
22#define SECTION_SIZE_BITS 26
23#define MAX_PHYSADDR_BITS 32
24#define MAX_PHYSMEM_BITS 32
25#endif
26
27/* XXX: FIXME -- wli */
28#define kern_addr_valid(kaddr) (0)
29
30#endif /* CONFIG_SPARSEMEM */
31#endif /* _I386_SPARSEMEM_H */
diff --git a/include/asm-x86/sparsemem_64.h b/include/asm-x86/sparsemem_64.h
deleted file mode 100644
index dabb16714a71..000000000000
--- a/include/asm-x86/sparsemem_64.h
+++ /dev/null
@@ -1,26 +0,0 @@
1#ifndef _ASM_X86_64_SPARSEMEM_H
2#define _ASM_X86_64_SPARSEMEM_H 1
3
4#ifdef CONFIG_SPARSEMEM
5
6/*
7 * generic non-linear memory support:
8 *
9 * 1) we will not split memory into more chunks than will fit into the flags
10 * field of the struct page
11 *
12 * SECTION_SIZE_BITS 2^n: size of each section
13 * MAX_PHYSADDR_BITS 2^n: max size of physical address space
14 * MAX_PHYSMEM_BITS 2^n: how much memory we can have in that space
15 *
16 */
17
18#define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */
19#define MAX_PHYSADDR_BITS 40
20#define MAX_PHYSMEM_BITS 40
21
22extern int early_pfn_to_nid(unsigned long pfn);
23
24#endif /* CONFIG_SPARSEMEM */
25
26#endif /* _ASM_X86_64_SPARSEMEM_H */
diff --git a/include/asm-x86/spinlock.h b/include/asm-x86/spinlock.h
index d74d85e71dcb..23804c1890ff 100644
--- a/include/asm-x86/spinlock.h
+++ b/include/asm-x86/spinlock.h
@@ -1,5 +1,296 @@
1#ifndef _X86_SPINLOCK_H_
2#define _X86_SPINLOCK_H_
3
4#include <asm/atomic.h>
5#include <asm/rwlock.h>
6#include <asm/page.h>
7#include <asm/processor.h>
8#include <linux/compiler.h>
9
10/*
11 * Your basic SMP spinlocks, allowing only a single CPU anywhere
12 *
13 * Simple spin lock operations. There are two variants, one clears IRQ's
14 * on the local processor, one does not.
15 *
16 * These are fair FIFO ticket locks, which are currently limited to 256
17 * CPUs.
18 *
19 * (the type definitions are in asm/spinlock_types.h)
20 */
21
1#ifdef CONFIG_X86_32 22#ifdef CONFIG_X86_32
2# include "spinlock_32.h" 23typedef char _slock_t;
24# define LOCK_INS_DEC "decb"
25# define LOCK_INS_XCH "xchgb"
26# define LOCK_INS_MOV "movb"
27# define LOCK_INS_CMP "cmpb"
28# define LOCK_PTR_REG "a"
3#else 29#else
4# include "spinlock_64.h" 30typedef int _slock_t;
31# define LOCK_INS_DEC "decl"
32# define LOCK_INS_XCH "xchgl"
33# define LOCK_INS_MOV "movl"
34# define LOCK_INS_CMP "cmpl"
35# define LOCK_PTR_REG "D"
36#endif
37
38#if defined(CONFIG_X86_32) && \
39 (defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE))
40/*
41 * On PPro SMP or if we are using OOSTORE, we use a locked operation to unlock
42 * (PPro errata 66, 92)
43 */
44# define UNLOCK_LOCK_PREFIX LOCK_PREFIX
45#else
46# define UNLOCK_LOCK_PREFIX
47#endif
48
49/*
50 * Ticket locks are conceptually two parts, one indicating the current head of
51 * the queue, and the other indicating the current tail. The lock is acquired
52 * by atomically noting the tail and incrementing it by one (thus adding
53 * ourself to the queue and noting our position), then waiting until the head
 54 * becomes equal to the initial value of the tail.
55 *
56 * We use an xadd covering *both* parts of the lock, to increment the tail and
57 * also load the position of the head, which takes care of memory ordering
58 * issues and should be optimal for the uncontended case. Note the tail must be
59 * in the high part, because a wide xadd increment of the low part would carry
60 * up and contaminate the high part.
61 *
62 * With fewer than 2^8 possible CPUs, we can use x86's partial registers to
63 * save some instructions and make the code more elegant. There really isn't
64 * much between them in performance though, especially as locks are out of line.
65 */
66#if (NR_CPUS < 256)
67static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
68{
69 int tmp = *(volatile signed int *)(&(lock)->slock);
70
71 return (((tmp >> 8) & 0xff) != (tmp & 0xff));
72}
73
74static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
75{
76 int tmp = *(volatile signed int *)(&(lock)->slock);
77
78 return (((tmp >> 8) & 0xff) - (tmp & 0xff)) > 1;
79}
80
81static inline void __raw_spin_lock(raw_spinlock_t *lock)
82{
83 short inc = 0x0100;
84
85 __asm__ __volatile__ (
86 LOCK_PREFIX "xaddw %w0, %1\n"
87 "1:\t"
88 "cmpb %h0, %b0\n\t"
89 "je 2f\n\t"
90 "rep ; nop\n\t"
91 "movb %1, %b0\n\t"
92 /* don't need lfence here, because loads are in-order */
93 "jmp 1b\n"
94 "2:"
95 :"+Q" (inc), "+m" (lock->slock)
96 :
97 :"memory", "cc");
98}
99
100#define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock)
101
102static inline int __raw_spin_trylock(raw_spinlock_t *lock)
103{
104 int tmp;
105 short new;
106
107 asm volatile(
108 "movw %2,%w0\n\t"
109 "cmpb %h0,%b0\n\t"
110 "jne 1f\n\t"
111 "movw %w0,%w1\n\t"
112 "incb %h1\n\t"
113 "lock ; cmpxchgw %w1,%2\n\t"
114 "1:"
115 "sete %b1\n\t"
116 "movzbl %b1,%0\n\t"
117 :"=&a" (tmp), "=Q" (new), "+m" (lock->slock)
118 :
119 : "memory", "cc");
120
121 return tmp;
122}
123
124static inline void __raw_spin_unlock(raw_spinlock_t *lock)
125{
126 __asm__ __volatile__(
127 UNLOCK_LOCK_PREFIX "incb %0"
128 :"+m" (lock->slock)
129 :
130 :"memory", "cc");
131}
132#else
133static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
134{
135 int tmp = *(volatile signed int *)(&(lock)->slock);
136
137 return (((tmp >> 16) & 0xffff) != (tmp & 0xffff));
138}
139
140static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
141{
142 int tmp = *(volatile signed int *)(&(lock)->slock);
143
144 return (((tmp >> 16) & 0xffff) - (tmp & 0xffff)) > 1;
145}
146
147static inline void __raw_spin_lock(raw_spinlock_t *lock)
148{
149 int inc = 0x00010000;
150 int tmp;
151
152 __asm__ __volatile__ (
153 "lock ; xaddl %0, %1\n"
154 "movzwl %w0, %2\n\t"
155 "shrl $16, %0\n\t"
156 "1:\t"
157 "cmpl %0, %2\n\t"
158 "je 2f\n\t"
159 "rep ; nop\n\t"
160 "movzwl %1, %2\n\t"
161 /* don't need lfence here, because loads are in-order */
162 "jmp 1b\n"
163 "2:"
164 :"+Q" (inc), "+m" (lock->slock), "=r" (tmp)
165 :
166 :"memory", "cc");
167}
168
169#define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock)
170
171static inline int __raw_spin_trylock(raw_spinlock_t *lock)
172{
173 int tmp;
174 int new;
175
176 asm volatile(
177 "movl %2,%0\n\t"
178 "movl %0,%1\n\t"
179 "roll $16, %0\n\t"
180 "cmpl %0,%1\n\t"
181 "jne 1f\n\t"
182 "addl $0x00010000, %1\n\t"
183 "lock ; cmpxchgl %1,%2\n\t"
184 "1:"
185 "sete %b1\n\t"
186 "movzbl %b1,%0\n\t"
187 :"=&a" (tmp), "=r" (new), "+m" (lock->slock)
188 :
189 : "memory", "cc");
190
191 return tmp;
192}
193
194static inline void __raw_spin_unlock(raw_spinlock_t *lock)
195{
196 __asm__ __volatile__(
197 UNLOCK_LOCK_PREFIX "incw %0"
198 :"+m" (lock->slock)
199 :
200 :"memory", "cc");
201}
202#endif
203
204static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
205{
206 while (__raw_spin_is_locked(lock))
207 cpu_relax();
208}
209
210/*
211 * Read-write spinlocks, allowing multiple readers
212 * but only one writer.
213 *
214 * NOTE! it is quite common to have readers in interrupts
215 * but no interrupt writers. For those circumstances we
216 * can "mix" irq-safe locks - any writer needs to get a
217 * irq-safe write-lock, but readers can get non-irqsafe
218 * read-locks.
219 *
220 * On x86, we implement read-write locks as a 32-bit counter
221 * with the high bit (sign) being the "contended" bit.
222 */
223
224/**
225 * read_can_lock - would read_trylock() succeed?
226 * @lock: the rwlock in question.
227 */
228static inline int __raw_read_can_lock(raw_rwlock_t *lock)
229{
230 return (int)(lock)->lock > 0;
231}
232
233/**
234 * write_can_lock - would write_trylock() succeed?
235 * @lock: the rwlock in question.
236 */
237static inline int __raw_write_can_lock(raw_rwlock_t *lock)
238{
239 return (lock)->lock == RW_LOCK_BIAS;
240}
241
242static inline void __raw_read_lock(raw_rwlock_t *rw)
243{
244 asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t"
245 "jns 1f\n"
246 "call __read_lock_failed\n\t"
247 "1:\n"
248 ::LOCK_PTR_REG (rw) : "memory");
249}
250
251static inline void __raw_write_lock(raw_rwlock_t *rw)
252{
253 asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t"
254 "jz 1f\n"
255 "call __write_lock_failed\n\t"
256 "1:\n"
257 ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory");
258}
259
260static inline int __raw_read_trylock(raw_rwlock_t *lock)
261{
262 atomic_t *count = (atomic_t *)lock;
263
264 atomic_dec(count);
265 if (atomic_read(count) >= 0)
266 return 1;
267 atomic_inc(count);
268 return 0;
269}
270
271static inline int __raw_write_trylock(raw_rwlock_t *lock)
272{
273 atomic_t *count = (atomic_t *)lock;
274
275 if (atomic_sub_and_test(RW_LOCK_BIAS, count))
276 return 1;
277 atomic_add(RW_LOCK_BIAS, count);
278 return 0;
279}
280
281static inline void __raw_read_unlock(raw_rwlock_t *rw)
282{
283 asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory");
284}
285
286static inline void __raw_write_unlock(raw_rwlock_t *rw)
287{
288 asm volatile(LOCK_PREFIX "addl %1, %0"
289 : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory");
290}
291
292#define _raw_spin_relax(lock) cpu_relax()
293#define _raw_read_relax(lock) cpu_relax()
294#define _raw_write_relax(lock) cpu_relax()
295
5#endif 296#endif
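
The comment block above describes the new FIFO ticket lock: the lock word packs a tail (next ticket to hand out) and a head (ticket currently being served); acquiring means atomically taking a ticket with xadd and spinning until the head reaches it, releasing means bumping the head. A portable C11-atomics sketch of the same scheme -- not the kernel's assembly, and every name here is illustrative:

#include <stdatomic.h>
#include <stdio.h>

/* sketch: a FIFO ticket lock in C11 atomics (illustrative, not the kernel asm) */
struct ticket_lock {
	atomic_uint next;	/* tail: next ticket to hand out */
	atomic_uint owner;	/* head: ticket currently allowed in */
};

#define TICKET_LOCK_UNLOCKED { 0, 0 }	/* unlocked means head == tail == 0 */

static void ticket_lock(struct ticket_lock *l)
{
	unsigned int me = atomic_fetch_add(&l->next, 1);	/* take a ticket (the xadd) */

	while (atomic_load(&l->owner) != me)
		;						/* spin until it is our turn */
}

static void ticket_unlock(struct ticket_lock *l)
{
	atomic_fetch_add(&l->owner, 1);				/* serve the next ticket */
}

static int ticket_is_locked(struct ticket_lock *l)
{
	return atomic_load(&l->owner) != atomic_load(&l->next);
}

int main(void)
{
	struct ticket_lock lock = TICKET_LOCK_UNLOCKED;

	ticket_lock(&lock);
	printf("locked: %d\n", ticket_is_locked(&lock));	/* 1 */
	ticket_unlock(&lock);
	printf("locked: %d\n", ticket_is_locked(&lock));	/* 0 */
	return 0;
}

The all-zero unlocked state is also why __RAW_SPIN_LOCK_UNLOCKED changes from { 1 } to { 0 } in spinlock_types.h below.
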
diff --git a/include/asm-x86/spinlock_32.h b/include/asm-x86/spinlock_32.h
deleted file mode 100644
index d3bcebed60ca..000000000000
--- a/include/asm-x86/spinlock_32.h
+++ /dev/null
@@ -1,221 +0,0 @@
1#ifndef __ASM_SPINLOCK_H
2#define __ASM_SPINLOCK_H
3
4#include <asm/atomic.h>
5#include <asm/rwlock.h>
6#include <asm/page.h>
7#include <asm/processor.h>
8#include <linux/compiler.h>
9
10#ifdef CONFIG_PARAVIRT
11#include <asm/paravirt.h>
12#else
13#define CLI_STRING "cli"
14#define STI_STRING "sti"
15#define CLI_STI_CLOBBERS
16#define CLI_STI_INPUT_ARGS
17#endif /* CONFIG_PARAVIRT */
18
19/*
20 * Your basic SMP spinlocks, allowing only a single CPU anywhere
21 *
22 * Simple spin lock operations. There are two variants, one clears IRQ's
23 * on the local processor, one does not.
24 *
25 * We make no fairness assumptions. They have a cost.
26 *
27 * (the type definitions are in asm/spinlock_types.h)
28 */
29
30static inline int __raw_spin_is_locked(raw_spinlock_t *x)
31{
32 return *(volatile signed char *)(&(x)->slock) <= 0;
33}
34
35static inline void __raw_spin_lock(raw_spinlock_t *lock)
36{
37 asm volatile("\n1:\t"
38 LOCK_PREFIX " ; decb %0\n\t"
39 "jns 3f\n"
40 "2:\t"
41 "rep;nop\n\t"
42 "cmpb $0,%0\n\t"
43 "jle 2b\n\t"
44 "jmp 1b\n"
45 "3:\n\t"
46 : "+m" (lock->slock) : : "memory");
47}
48
49/*
50 * It is easier for the lock validator if interrupts are not re-enabled
51 * in the middle of a lock-acquire. This is a performance feature anyway
52 * so we turn it off:
53 *
54 * NOTE: there's an irqs-on section here, which normally would have to be
55 * irq-traced, but on CONFIG_TRACE_IRQFLAGS we never use this variant.
56 */
57#ifndef CONFIG_PROVE_LOCKING
58static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags)
59{
60 asm volatile(
61 "\n1:\t"
62 LOCK_PREFIX " ; decb %[slock]\n\t"
63 "jns 5f\n"
64 "2:\t"
65 "testl $0x200, %[flags]\n\t"
66 "jz 4f\n\t"
67 STI_STRING "\n"
68 "3:\t"
69 "rep;nop\n\t"
70 "cmpb $0, %[slock]\n\t"
71 "jle 3b\n\t"
72 CLI_STRING "\n\t"
73 "jmp 1b\n"
74 "4:\t"
75 "rep;nop\n\t"
76 "cmpb $0, %[slock]\n\t"
77 "jg 1b\n\t"
78 "jmp 4b\n"
79 "5:\n\t"
80 : [slock] "+m" (lock->slock)
81 : [flags] "r" (flags)
82 CLI_STI_INPUT_ARGS
83 : "memory" CLI_STI_CLOBBERS);
84}
85#endif
86
87static inline int __raw_spin_trylock(raw_spinlock_t *lock)
88{
89 char oldval;
90 asm volatile(
91 "xchgb %b0,%1"
92 :"=q" (oldval), "+m" (lock->slock)
93 :"0" (0) : "memory");
94 return oldval > 0;
95}
96
97/*
98 * __raw_spin_unlock based on writing $1 to the low byte.
99 * This method works. Despite all the confusion.
100 * (except on PPro SMP or if we are using OOSTORE, so we use xchgb there)
101 * (PPro errata 66, 92)
102 */
103
104#if !defined(CONFIG_X86_OOSTORE) && !defined(CONFIG_X86_PPRO_FENCE)
105
106static inline void __raw_spin_unlock(raw_spinlock_t *lock)
107{
108 asm volatile("movb $1,%0" : "+m" (lock->slock) :: "memory");
109}
110
111#else
112
113static inline void __raw_spin_unlock(raw_spinlock_t *lock)
114{
115 char oldval = 1;
116
117 asm volatile("xchgb %b0, %1"
118 : "=q" (oldval), "+m" (lock->slock)
119 : "0" (oldval) : "memory");
120}
121
122#endif
123
124static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
125{
126 while (__raw_spin_is_locked(lock))
127 cpu_relax();
128}
129
130/*
131 * Read-write spinlocks, allowing multiple readers
132 * but only one writer.
133 *
134 * NOTE! it is quite common to have readers in interrupts
135 * but no interrupt writers. For those circumstances we
136 * can "mix" irq-safe locks - any writer needs to get a
137 * irq-safe write-lock, but readers can get non-irqsafe
138 * read-locks.
139 *
140 * On x86, we implement read-write locks as a 32-bit counter
141 * with the high bit (sign) being the "contended" bit.
142 *
143 * The inline assembly is non-obvious. Think about it.
144 *
145 * Changed to use the same technique as rw semaphores. See
146 * semaphore.h for details. -ben
147 *
148 * the helpers are in arch/i386/kernel/semaphore.c
149 */
150
151/**
152 * read_can_lock - would read_trylock() succeed?
153 * @lock: the rwlock in question.
154 */
155static inline int __raw_read_can_lock(raw_rwlock_t *x)
156{
157 return (int)(x)->lock > 0;
158}
159
160/**
161 * write_can_lock - would write_trylock() succeed?
162 * @lock: the rwlock in question.
163 */
164static inline int __raw_write_can_lock(raw_rwlock_t *x)
165{
166 return (x)->lock == RW_LOCK_BIAS;
167}
168
169static inline void __raw_read_lock(raw_rwlock_t *rw)
170{
171 asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t"
172 "jns 1f\n"
173 "call __read_lock_failed\n\t"
174 "1:\n"
175 ::"a" (rw) : "memory");
176}
177
178static inline void __raw_write_lock(raw_rwlock_t *rw)
179{
180 asm volatile(LOCK_PREFIX " subl $" RW_LOCK_BIAS_STR ",(%0)\n\t"
181 "jz 1f\n"
182 "call __write_lock_failed\n\t"
183 "1:\n"
184 ::"a" (rw) : "memory");
185}
186
187static inline int __raw_read_trylock(raw_rwlock_t *lock)
188{
189 atomic_t *count = (atomic_t *)lock;
190 atomic_dec(count);
191 if (atomic_read(count) >= 0)
192 return 1;
193 atomic_inc(count);
194 return 0;
195}
196
197static inline int __raw_write_trylock(raw_rwlock_t *lock)
198{
199 atomic_t *count = (atomic_t *)lock;
200 if (atomic_sub_and_test(RW_LOCK_BIAS, count))
201 return 1;
202 atomic_add(RW_LOCK_BIAS, count);
203 return 0;
204}
205
206static inline void __raw_read_unlock(raw_rwlock_t *rw)
207{
208 asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory");
209}
210
211static inline void __raw_write_unlock(raw_rwlock_t *rw)
212{
213 asm volatile(LOCK_PREFIX "addl $" RW_LOCK_BIAS_STR ", %0"
214 : "+m" (rw->lock) : : "memory");
215}
216
217#define _raw_spin_relax(lock) cpu_relax()
218#define _raw_read_relax(lock) cpu_relax()
219#define _raw_write_relax(lock) cpu_relax()
220
221#endif /* __ASM_SPINLOCK_H */
diff --git a/include/asm-x86/spinlock_64.h b/include/asm-x86/spinlock_64.h
deleted file mode 100644
index 88bf981e73cf..000000000000
--- a/include/asm-x86/spinlock_64.h
+++ /dev/null
@@ -1,167 +0,0 @@
1#ifndef __ASM_SPINLOCK_H
2#define __ASM_SPINLOCK_H
3
4#include <asm/atomic.h>
5#include <asm/rwlock.h>
6#include <asm/page.h>
7#include <asm/processor.h>
8
9/*
10 * Your basic SMP spinlocks, allowing only a single CPU anywhere
11 *
12 * Simple spin lock operations. There are two variants, one clears IRQ's
13 * on the local processor, one does not.
14 *
15 * We make no fairness assumptions. They have a cost.
16 *
17 * (the type definitions are in asm/spinlock_types.h)
18 */
19
20static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
21{
22 return *(volatile signed int *)(&(lock)->slock) <= 0;
23}
24
25static inline void __raw_spin_lock(raw_spinlock_t *lock)
26{
27 asm volatile(
28 "\n1:\t"
29 LOCK_PREFIX " ; decl %0\n\t"
30 "jns 2f\n"
31 "3:\n"
32 "rep;nop\n\t"
33 "cmpl $0,%0\n\t"
34 "jle 3b\n\t"
35 "jmp 1b\n"
36 "2:\t" : "=m" (lock->slock) : : "memory");
37}
38
39/*
40 * Same as __raw_spin_lock, but reenable interrupts during spinning.
41 */
42#ifndef CONFIG_PROVE_LOCKING
43static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags)
44{
45 asm volatile(
46 "\n1:\t"
47 LOCK_PREFIX " ; decl %0\n\t"
48 "jns 5f\n"
49 "testl $0x200, %1\n\t" /* interrupts were disabled? */
50 "jz 4f\n\t"
51 "sti\n"
52 "3:\t"
53 "rep;nop\n\t"
54 "cmpl $0, %0\n\t"
55 "jle 3b\n\t"
56 "cli\n\t"
57 "jmp 1b\n"
58 "4:\t"
59 "rep;nop\n\t"
60 "cmpl $0, %0\n\t"
61 "jg 1b\n\t"
62 "jmp 4b\n"
63 "5:\n\t"
64 : "+m" (lock->slock) : "r" ((unsigned)flags) : "memory");
65}
66#endif
67
68static inline int __raw_spin_trylock(raw_spinlock_t *lock)
69{
70 int oldval;
71
72 asm volatile(
73 "xchgl %0,%1"
74 :"=q" (oldval), "=m" (lock->slock)
75 :"0" (0) : "memory");
76
77 return oldval > 0;
78}
79
80static inline void __raw_spin_unlock(raw_spinlock_t *lock)
81{
82 asm volatile("movl $1,%0" :"=m" (lock->slock) :: "memory");
83}
84
85static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
86{
87 while (__raw_spin_is_locked(lock))
88 cpu_relax();
89}
90
91/*
92 * Read-write spinlocks, allowing multiple readers
93 * but only one writer.
94 *
95 * NOTE! it is quite common to have readers in interrupts
96 * but no interrupt writers. For those circumstances we
97 * can "mix" irq-safe locks - any writer needs to get a
98 * irq-safe write-lock, but readers can get non-irqsafe
99 * read-locks.
100 *
101 * On x86, we implement read-write locks as a 32-bit counter
102 * with the high bit (sign) being the "contended" bit.
103 */
104
105static inline int __raw_read_can_lock(raw_rwlock_t *lock)
106{
107 return (int)(lock)->lock > 0;
108}
109
110static inline int __raw_write_can_lock(raw_rwlock_t *lock)
111{
112 return (lock)->lock == RW_LOCK_BIAS;
113}
114
115static inline void __raw_read_lock(raw_rwlock_t *rw)
116{
117 asm volatile(LOCK_PREFIX "subl $1,(%0)\n\t"
118 "jns 1f\n"
119 "call __read_lock_failed\n"
120 "1:\n"
121 ::"D" (rw), "i" (RW_LOCK_BIAS) : "memory");
122}
123
124static inline void __raw_write_lock(raw_rwlock_t *rw)
125{
126 asm volatile(LOCK_PREFIX "subl %1,(%0)\n\t"
127 "jz 1f\n"
128 "\tcall __write_lock_failed\n\t"
129 "1:\n"
130 ::"D" (rw), "i" (RW_LOCK_BIAS) : "memory");
131}
132
133static inline int __raw_read_trylock(raw_rwlock_t *lock)
134{
135 atomic_t *count = (atomic_t *)lock;
136 atomic_dec(count);
137 if (atomic_read(count) >= 0)
138 return 1;
139 atomic_inc(count);
140 return 0;
141}
142
143static inline int __raw_write_trylock(raw_rwlock_t *lock)
144{
145 atomic_t *count = (atomic_t *)lock;
146 if (atomic_sub_and_test(RW_LOCK_BIAS, count))
147 return 1;
148 atomic_add(RW_LOCK_BIAS, count);
149 return 0;
150}
151
152static inline void __raw_read_unlock(raw_rwlock_t *rw)
153{
154 asm volatile(LOCK_PREFIX " ; incl %0" :"=m" (rw->lock) : : "memory");
155}
156
157static inline void __raw_write_unlock(raw_rwlock_t *rw)
158{
159 asm volatile(LOCK_PREFIX " ; addl $" RW_LOCK_BIAS_STR ",%0"
160 : "=m" (rw->lock) : : "memory");
161}
162
163#define _raw_spin_relax(lock) cpu_relax()
164#define _raw_read_relax(lock) cpu_relax()
165#define _raw_write_relax(lock) cpu_relax()
166
167#endif /* __ASM_SPINLOCK_H */
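
The rwlock comment in the deleted spinlock_64.h above describes the encoding kept by the unified header: a 32-bit counter initialised to RW_LOCK_BIAS, decremented by one per reader and by the whole bias per writer, with the sign bit acting as the contended flag. Below is a minimal user-space sketch of that counting scheme only, not the kernel's LOCK-prefixed inline-asm implementation; the RW_LOCK_BIAS_VALUE constant and the GCC __sync builtins are assumptions used purely for illustration.

#include <stdio.h>

/* Illustrative value only; the kernel defines RW_LOCK_BIAS in asm/rwlock.h. */
#define RW_LOCK_BIAS_VALUE 0x01000000

static int demo_lock = RW_LOCK_BIAS_VALUE;

/* A reader takes one unit from the counter; it succeeds while the result
 * stays non-negative, i.e. no writer holds the full bias. */
static int demo_read_trylock(int *lock)
{
        if (__sync_sub_and_fetch(lock, 1) >= 0)
                return 1;
        __sync_add_and_fetch(lock, 1);          /* undo: a writer is active */
        return 0;
}

/* A writer needs the whole bias, i.e. no readers and no other writer. */
static int demo_write_trylock(int *lock)
{
        if (__sync_sub_and_fetch(lock, RW_LOCK_BIAS_VALUE) == 0)
                return 1;
        __sync_add_and_fetch(lock, RW_LOCK_BIAS_VALUE);
        return 0;
}

int main(void)
{
        printf("reader acquired: %d\n", demo_read_trylock(&demo_lock)); /* 1 */
        printf("writer acquired: %d\n", demo_write_trylock(&demo_lock)); /* 0: a reader holds it */
        return 0;
}
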
diff --git a/include/asm-x86/spinlock_types.h b/include/asm-x86/spinlock_types.h
index 4da9345c1500..9029cf78cf5d 100644
--- a/include/asm-x86/spinlock_types.h
+++ b/include/asm-x86/spinlock_types.h
@@ -9,7 +9,7 @@ typedef struct {
9 unsigned int slock; 9 unsigned int slock;
10} raw_spinlock_t; 10} raw_spinlock_t;
11 11
12#define __RAW_SPIN_LOCK_UNLOCKED { 1 } 12#define __RAW_SPIN_LOCK_UNLOCKED { 0 }
13 13
14typedef struct { 14typedef struct {
15 unsigned int lock; 15 unsigned int lock;
diff --git a/include/asm-x86/stacktrace.h b/include/asm-x86/stacktrace.h
index 70dd5bae3235..30f82526a8e2 100644
--- a/include/asm-x86/stacktrace.h
+++ b/include/asm-x86/stacktrace.h
@@ -9,12 +9,13 @@ struct stacktrace_ops {
9 void (*warning)(void *data, char *msg); 9 void (*warning)(void *data, char *msg);
10 /* msg must contain %s for the symbol */ 10 /* msg must contain %s for the symbol */
11 void (*warning_symbol)(void *data, char *msg, unsigned long symbol); 11 void (*warning_symbol)(void *data, char *msg, unsigned long symbol);
12 void (*address)(void *data, unsigned long address); 12 void (*address)(void *data, unsigned long address, int reliable);
13 /* On negative return stop dumping */ 13 /* On negative return stop dumping */
14 int (*stack)(void *data, char *name); 14 int (*stack)(void *data, char *name);
15}; 15};
16 16
17void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack, 17void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
18 unsigned long *stack, unsigned long bp,
18 const struct stacktrace_ops *ops, void *data); 19 const struct stacktrace_ops *ops, void *data);
19 20
20#endif 21#endif
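
The stacktrace.h hunk above adds a "reliable" argument to the address() callback and an explicit frame-pointer argument (bp) to dump_trace(). Below is a hedged sketch of a consumer written against those new signatures; the demo_* names and the printk format are illustrative, assume ordinary kernel context, and are not part of this patch.

#include <linux/kernel.h>
#include <asm/stacktrace.h>

static void demo_warning(void *data, char *msg) { }
static void demo_warning_symbol(void *data, char *msg, unsigned long symbol) { }
static int demo_stack(void *data, char *name) { return 0; }    /* keep dumping */

/* New third argument: non-zero when the unwinder trusts this address. */
static void demo_address(void *data, unsigned long address, int reliable)
{
        printk("%s[<%016lx>]\n", reliable ? " " : " ? ", address);
}

static const struct stacktrace_ops demo_ops = {
        .warning        = demo_warning,
        .warning_symbol = demo_warning_symbol,
        .address        = demo_address,
        .stack          = demo_stack,
};

/* dump_trace() now also takes the initial frame pointer (bp), e.g.:
 *      dump_trace(current, NULL, NULL, 0, &demo_ops, NULL);
 */
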
diff --git a/include/asm-x86/suspend_32.h b/include/asm-x86/suspend_32.h
index a2520732ffd6..1bbda3ad7796 100644
--- a/include/asm-x86/suspend_32.h
+++ b/include/asm-x86/suspend_32.h
@@ -12,8 +12,8 @@ static inline int arch_prepare_suspend(void) { return 0; }
12struct saved_context { 12struct saved_context {
13 u16 es, fs, gs, ss; 13 u16 es, fs, gs, ss;
14 unsigned long cr0, cr2, cr3, cr4; 14 unsigned long cr0, cr2, cr3, cr4;
15 struct Xgt_desc_struct gdt; 15 struct desc_ptr gdt;
16 struct Xgt_desc_struct idt; 16 struct desc_ptr idt;
17 u16 ldt; 17 u16 ldt;
18 u16 tss; 18 u16 tss;
19 unsigned long tr; 19 unsigned long tr;
diff --git a/include/asm-x86/suspend_64.h b/include/asm-x86/suspend_64.h
index c505a76bcf6e..2eb92cb81a0d 100644
--- a/include/asm-x86/suspend_64.h
+++ b/include/asm-x86/suspend_64.h
@@ -15,7 +15,14 @@ arch_prepare_suspend(void)
15 return 0; 15 return 0;
16} 16}
17 17
18/* Image of the saved processor state. If you touch this, fix acpi/wakeup.S. */ 18/*
19 * Image of the saved processor state, used by the low level ACPI suspend to
20 * RAM code and by the low level hibernation code.
21 *
22 * If you modify it, fix arch/x86/kernel/acpi/wakeup_64.S and make sure that
23 * __save/__restore_processor_state(), defined in arch/x86/kernel/suspend_64.c,
24 * still work as required.
25 */
19struct saved_context { 26struct saved_context {
20 struct pt_regs regs; 27 struct pt_regs regs;
21 u16 ds, es, fs, gs, ss; 28 u16 ds, es, fs, gs, ss;
@@ -38,8 +45,6 @@ struct saved_context {
38#define loaddebug(thread,register) \ 45#define loaddebug(thread,register) \
39 set_debugreg((thread)->debugreg##register, register) 46 set_debugreg((thread)->debugreg##register, register)
40 47
41extern void fix_processor_context(void);
42
43/* routines for saving/restoring kernel state */ 48/* routines for saving/restoring kernel state */
44extern int acpi_save_state_mem(void); 49extern int acpi_save_state_mem(void);
45extern char core_restore_code; 50extern char core_restore_code;
diff --git a/include/asm-x86/system.h b/include/asm-x86/system.h
index 692562b48f2a..ee32ef9367f4 100644
--- a/include/asm-x86/system.h
+++ b/include/asm-x86/system.h
@@ -1,5 +1,414 @@
1#ifndef _ASM_X86_SYSTEM_H_
2#define _ASM_X86_SYSTEM_H_
3
4#include <asm/asm.h>
5#include <asm/segment.h>
6#include <asm/cpufeature.h>
7#include <asm/cmpxchg.h>
8#include <asm/nops.h>
9
10#include <linux/kernel.h>
11#include <linux/irqflags.h>
12
13/* entries in ARCH_DLINFO: */
14#ifdef CONFIG_IA32_EMULATION
15# define AT_VECTOR_SIZE_ARCH 2
16#else
17# define AT_VECTOR_SIZE_ARCH 1
18#endif
19
20#ifdef CONFIG_X86_32
21
22struct task_struct; /* one of the stranger aspects of C forward declarations */
23extern struct task_struct *FASTCALL(__switch_to(struct task_struct *prev,
24 struct task_struct *next));
25
26/*
27 * Saving eflags is important. It switches not only IOPL between tasks,
28 * it also protects other tasks from NT leaking through sysenter etc.
29 */
30#define switch_to(prev, next, last) do { \
31 unsigned long esi, edi; \
32 asm volatile("pushfl\n\t" /* Save flags */ \
33 "pushl %%ebp\n\t" \
34 "movl %%esp,%0\n\t" /* save ESP */ \
35 "movl %5,%%esp\n\t" /* restore ESP */ \
36 "movl $1f,%1\n\t" /* save EIP */ \
37 "pushl %6\n\t" /* restore EIP */ \
38 "jmp __switch_to\n" \
39 "1:\t" \
40 "popl %%ebp\n\t" \
41 "popfl" \
42 :"=m" (prev->thread.sp), "=m" (prev->thread.ip), \
43 "=a" (last), "=S" (esi), "=D" (edi) \
44 :"m" (next->thread.sp), "m" (next->thread.ip), \
45 "2" (prev), "d" (next)); \
46} while (0)
47
48/*
49 * disable hlt during certain critical i/o operations
50 */
51#define HAVE_DISABLE_HLT
52#else
53#define __SAVE(reg, offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
54#define __RESTORE(reg, offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
55
56/* frame pointer must be last for get_wchan */
57#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
58#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t"
59
60#define __EXTRA_CLOBBER \
61 , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
62 "r12", "r13", "r14", "r15"
63
 64/* Save and restore flags to keep NT from leaking across the context switch */
65#define switch_to(prev, next, last) \
66 asm volatile(SAVE_CONTEXT \
67 "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
68 "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
69 "call __switch_to\n\t" \
70 ".globl thread_return\n" \
71 "thread_return:\n\t" \
72 "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \
73 "movq %P[thread_info](%%rsi),%%r8\n\t" \
74 LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
75 "movq %%rax,%%rdi\n\t" \
76 "jc ret_from_fork\n\t" \
77 RESTORE_CONTEXT \
78 : "=a" (last) \
79 : [next] "S" (next), [prev] "D" (prev), \
80 [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
81 [ti_flags] "i" (offsetof(struct thread_info, flags)), \
82 [tif_fork] "i" (TIF_FORK), \
83 [thread_info] "i" (offsetof(struct task_struct, stack)), \
84 [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
85 : "memory", "cc" __EXTRA_CLOBBER)
86#endif
87
88#ifdef __KERNEL__
89#define _set_base(addr, base) do { unsigned long __pr; \
90__asm__ __volatile__ ("movw %%dx,%1\n\t" \
91 "rorl $16,%%edx\n\t" \
92 "movb %%dl,%2\n\t" \
93 "movb %%dh,%3" \
94 :"=&d" (__pr) \
95 :"m" (*((addr)+2)), \
96 "m" (*((addr)+4)), \
97 "m" (*((addr)+7)), \
98 "0" (base) \
99 ); } while (0)
100
101#define _set_limit(addr, limit) do { unsigned long __lr; \
102__asm__ __volatile__ ("movw %%dx,%1\n\t" \
103 "rorl $16,%%edx\n\t" \
104 "movb %2,%%dh\n\t" \
105 "andb $0xf0,%%dh\n\t" \
106 "orb %%dh,%%dl\n\t" \
107 "movb %%dl,%2" \
108 :"=&d" (__lr) \
109 :"m" (*(addr)), \
110 "m" (*((addr)+6)), \
111 "0" (limit) \
112 ); } while (0)
113
114#define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base))
115#define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1))
116
117extern void load_gs_index(unsigned);
118
119/*
120 * Load a segment. Fall back on loading the zero
121 * segment if something goes wrong..
122 */
123#define loadsegment(seg, value) \
124 asm volatile("\n" \
125 "1:\t" \
126 "movl %k0,%%" #seg "\n" \
127 "2:\n" \
128 ".section .fixup,\"ax\"\n" \
129 "3:\t" \
130 "movl %k1, %%" #seg "\n\t" \
131 "jmp 2b\n" \
132 ".previous\n" \
133 ".section __ex_table,\"a\"\n\t" \
134 _ASM_ALIGN "\n\t" \
135 _ASM_PTR " 1b,3b\n" \
136 ".previous" \
137 : :"r" (value), "r" (0))
138
139
140/*
141 * Save a segment register away
142 */
143#define savesegment(seg, value) \
144 asm volatile("mov %%" #seg ",%0":"=rm" (value))
145
146static inline unsigned long get_limit(unsigned long segment)
147{
148 unsigned long __limit;
149 __asm__("lsll %1,%0"
150 :"=r" (__limit):"r" (segment));
151 return __limit+1;
152}
153
154static inline void native_clts(void)
155{
156 asm volatile ("clts");
157}
158
159/*
160 * Volatile isn't enough to prevent the compiler from reordering the
161 * read/write functions for the control registers and messing everything up.
162 * A memory clobber would solve the problem, but would prevent reordering of
163 * all loads stores around it, which can hurt performance. Solution is to
164 * use a variable and mimic reads and writes to it to enforce serialization
165 */
166static unsigned long __force_order;
167
168static inline unsigned long native_read_cr0(void)
169{
170 unsigned long val;
171 asm volatile("mov %%cr0,%0\n\t" :"=r" (val), "=m" (__force_order));
172 return val;
173}
174
175static inline void native_write_cr0(unsigned long val)
176{
177 asm volatile("mov %0,%%cr0": :"r" (val), "m" (__force_order));
178}
179
180static inline unsigned long native_read_cr2(void)
181{
182 unsigned long val;
183 asm volatile("mov %%cr2,%0\n\t" :"=r" (val), "=m" (__force_order));
184 return val;
185}
186
187static inline void native_write_cr2(unsigned long val)
188{
189 asm volatile("mov %0,%%cr2": :"r" (val), "m" (__force_order));
190}
191
192static inline unsigned long native_read_cr3(void)
193{
194 unsigned long val;
195 asm volatile("mov %%cr3,%0\n\t" :"=r" (val), "=m" (__force_order));
196 return val;
197}
198
199static inline void native_write_cr3(unsigned long val)
200{
201 asm volatile("mov %0,%%cr3": :"r" (val), "m" (__force_order));
202}
203
204static inline unsigned long native_read_cr4(void)
205{
206 unsigned long val;
207 asm volatile("mov %%cr4,%0\n\t" :"=r" (val), "=m" (__force_order));
208 return val;
209}
210
211static inline unsigned long native_read_cr4_safe(void)
212{
213 unsigned long val;
214 /* This could fault if %cr4 does not exist. In x86_64, a cr4 always
215 * exists, so it will never fail. */
216#ifdef CONFIG_X86_32
217 asm volatile("1: mov %%cr4, %0 \n"
218 "2: \n"
219 ".section __ex_table,\"a\" \n"
220 ".long 1b,2b \n"
221 ".previous \n"
222 : "=r" (val), "=m" (__force_order) : "0" (0));
223#else
224 val = native_read_cr4();
225#endif
226 return val;
227}
228
229static inline void native_write_cr4(unsigned long val)
230{
231 asm volatile("mov %0,%%cr4": :"r" (val), "m" (__force_order));
232}
233
234#ifdef CONFIG_X86_64
235static inline unsigned long native_read_cr8(void)
236{
237 unsigned long cr8;
238 asm volatile("movq %%cr8,%0" : "=r" (cr8));
239 return cr8;
240}
241
242static inline void native_write_cr8(unsigned long val)
243{
244 asm volatile("movq %0,%%cr8" :: "r" (val) : "memory");
245}
246#endif
247
248static inline void native_wbinvd(void)
249{
250 asm volatile("wbinvd": : :"memory");
251}
252#ifdef CONFIG_PARAVIRT
253#include <asm/paravirt.h>
254#else
255#define read_cr0() (native_read_cr0())
256#define write_cr0(x) (native_write_cr0(x))
257#define read_cr2() (native_read_cr2())
258#define write_cr2(x) (native_write_cr2(x))
259#define read_cr3() (native_read_cr3())
260#define write_cr3(x) (native_write_cr3(x))
261#define read_cr4() (native_read_cr4())
262#define read_cr4_safe() (native_read_cr4_safe())
263#define write_cr4(x) (native_write_cr4(x))
264#define wbinvd() (native_wbinvd())
265#ifdef CONFIG_X86_64
266#define read_cr8() (native_read_cr8())
267#define write_cr8(x) (native_write_cr8(x))
268#endif
269
270/* Clear the 'TS' bit */
271#define clts() (native_clts())
272
273#endif/* CONFIG_PARAVIRT */
274
275#define stts() write_cr0(8 | read_cr0())
276
277#endif /* __KERNEL__ */
278
279static inline void clflush(void *__p)
280{
281 asm volatile("clflush %0" : "+m" (*(char __force *)__p));
282}
283
284#define nop() __asm__ __volatile__ ("nop")
285
286void disable_hlt(void);
287void enable_hlt(void);
288
289extern int es7000_plat;
290void cpu_idle_wait(void);
291
292extern unsigned long arch_align_stack(unsigned long sp);
293extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
294
295void default_idle(void);
296
297/*
298 * Force strict CPU ordering.
299 * And yes, this is required on UP too when we're talking
300 * to devices.
301 */
1#ifdef CONFIG_X86_32 302#ifdef CONFIG_X86_32
2# include "system_32.h" 303/*
304 * For now, "wmb()" doesn't actually do anything, as all
305 * Intel CPU's follow what Intel calls a *Processor Order*,
306 * in which all writes are seen in the program order even
307 * outside the CPU.
308 *
309 * I expect future Intel CPU's to have a weaker ordering,
310 * but I'd also expect them to finally get their act together
311 * and add some real memory barriers if so.
312 *
 313 * Some non-Intel clones support out-of-order stores. wmb() ceases to be a
314 * nop for these.
315 */
316#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
317#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
318#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
3#else 319#else
4# include "system_64.h" 320#define mb() asm volatile("mfence":::"memory")
321#define rmb() asm volatile("lfence":::"memory")
322#define wmb() asm volatile("sfence" ::: "memory")
323#endif
324
325/**
 326 * read_barrier_depends - Flush all pending reads that subsequent reads
327 * depend on.
328 *
329 * No data-dependent reads from memory-like regions are ever reordered
330 * over this barrier. All reads preceding this primitive are guaranteed
331 * to access memory (but not necessarily other CPUs' caches) before any
 332 * reads following this primitive that depend on the data returned by
333 * any of the preceding reads. This primitive is much lighter weight than
334 * rmb() on most CPUs, and is never heavier weight than is
335 * rmb().
336 *
337 * These ordering constraints are respected by both the local CPU
338 * and the compiler.
339 *
340 * Ordering is not guaranteed by anything other than these primitives,
341 * not even by data dependencies. See the documentation for
342 * memory_barrier() for examples and URLs to more information.
343 *
344 * For example, the following code would force ordering (the initial
345 * value of "a" is zero, "b" is one, and "p" is "&a"):
346 *
347 * <programlisting>
348 * CPU 0 CPU 1
349 *
350 * b = 2;
351 * memory_barrier();
352 * p = &b; q = p;
353 * read_barrier_depends();
354 * d = *q;
355 * </programlisting>
356 *
357 * because the read of "*q" depends on the read of "p" and these
358 * two reads are separated by a read_barrier_depends(). However,
359 * the following code, with the same initial values for "a" and "b":
360 *
361 * <programlisting>
362 * CPU 0 CPU 1
363 *
364 * a = 2;
365 * memory_barrier();
366 * b = 3; y = b;
367 * read_barrier_depends();
368 * x = a;
369 * </programlisting>
370 *
371 * does not enforce ordering, since there is no data dependency between
372 * the read of "a" and the read of "b". Therefore, on some CPUs, such
373 * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb()
374 * in cases like this where there are no data dependencies.
375 **/
376
377#define read_barrier_depends() do { } while (0)
378
379#ifdef CONFIG_SMP
380#define smp_mb() mb()
381#ifdef CONFIG_X86_PPRO_FENCE
382# define smp_rmb() rmb()
383#else
384# define smp_rmb() barrier()
385#endif
386#ifdef CONFIG_X86_OOSTORE
387# define smp_wmb() wmb()
388#else
389# define smp_wmb() barrier()
390#endif
391#define smp_read_barrier_depends() read_barrier_depends()
392#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
393#else
394#define smp_mb() barrier()
395#define smp_rmb() barrier()
396#define smp_wmb() barrier()
397#define smp_read_barrier_depends() do { } while (0)
398#define set_mb(var, value) do { var = value; barrier(); } while (0)
399#endif
400
401/*
402 * Stop RDTSC speculation. This is needed when you need to use RDTSC
403 * (or get_cycles or vread that possibly accesses the TSC) in a defined
404 * code region.
405 *
 406 * (Could use a three-way alternative() for this if there was one.)
407 */
408static inline void rdtsc_barrier(void)
409{
410 alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
411 alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
412}
413
5#endif 414#endif
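
The read_barrier_depends() documentation above describes pointer publication: the writer orders its data store before publishing the pointer, and a reader that only follows the published pointer needs read_barrier_depends() (a no-op on x86, unlike rmb()). A minimal kernel-style sketch of that pairing follows; the variable and function names are illustrative, and smp_wmb() is used on the writer side as the conventional counterpart of the comment's memory_barrier().

#include <asm/system.h>

static int a;
static int b = 1;
static int *p = &a;

/* CPU 0: publish b through p; the write barrier orders the data store
 * before the pointer store, as in the comment's first example. */
static void demo_publisher(void)
{
        b = 2;
        smp_wmb();
        p = &b;
}

/* CPU 1: a data-dependent read; read_barrier_depends() (a no-op on x86,
 * but not on Alpha) is all that is needed between reading p and *q. */
static int demo_consumer(void)
{
        int *q = p;

        read_barrier_depends();
        return *q;
}
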
diff --git a/include/asm-x86/system_32.h b/include/asm-x86/system_32.h
deleted file mode 100644
index ef8468883bac..000000000000
--- a/include/asm-x86/system_32.h
+++ /dev/null
@@ -1,320 +0,0 @@
1#ifndef __ASM_SYSTEM_H
2#define __ASM_SYSTEM_H
3
4#include <linux/kernel.h>
5#include <asm/segment.h>
6#include <asm/cpufeature.h>
7#include <asm/cmpxchg.h>
8
9#ifdef __KERNEL__
10#define AT_VECTOR_SIZE_ARCH 2 /* entries in ARCH_DLINFO */
11
12struct task_struct; /* one of the stranger aspects of C forward declarations.. */
13extern struct task_struct * FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next));
14
15/*
16 * Saving eflags is important. It switches not only IOPL between tasks,
17 * it also protects other tasks from NT leaking through sysenter etc.
18 */
19#define switch_to(prev,next,last) do { \
20 unsigned long esi,edi; \
21 asm volatile("pushfl\n\t" /* Save flags */ \
22 "pushl %%ebp\n\t" \
23 "movl %%esp,%0\n\t" /* save ESP */ \
24 "movl %5,%%esp\n\t" /* restore ESP */ \
25 "movl $1f,%1\n\t" /* save EIP */ \
26 "pushl %6\n\t" /* restore EIP */ \
27 "jmp __switch_to\n" \
28 "1:\t" \
29 "popl %%ebp\n\t" \
30 "popfl" \
31 :"=m" (prev->thread.esp),"=m" (prev->thread.eip), \
32 "=a" (last),"=S" (esi),"=D" (edi) \
33 :"m" (next->thread.esp),"m" (next->thread.eip), \
34 "2" (prev), "d" (next)); \
35} while (0)
36
37#define _set_base(addr,base) do { unsigned long __pr; \
38__asm__ __volatile__ ("movw %%dx,%1\n\t" \
39 "rorl $16,%%edx\n\t" \
40 "movb %%dl,%2\n\t" \
41 "movb %%dh,%3" \
42 :"=&d" (__pr) \
43 :"m" (*((addr)+2)), \
44 "m" (*((addr)+4)), \
45 "m" (*((addr)+7)), \
46 "0" (base) \
47 ); } while(0)
48
49#define _set_limit(addr,limit) do { unsigned long __lr; \
50__asm__ __volatile__ ("movw %%dx,%1\n\t" \
51 "rorl $16,%%edx\n\t" \
52 "movb %2,%%dh\n\t" \
53 "andb $0xf0,%%dh\n\t" \
54 "orb %%dh,%%dl\n\t" \
55 "movb %%dl,%2" \
56 :"=&d" (__lr) \
57 :"m" (*(addr)), \
58 "m" (*((addr)+6)), \
59 "0" (limit) \
60 ); } while(0)
61
62#define set_base(ldt,base) _set_base( ((char *)&(ldt)) , (base) )
63#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , ((limit)-1) )
64
65/*
66 * Load a segment. Fall back on loading the zero
67 * segment if something goes wrong..
68 */
69#define loadsegment(seg,value) \
70 asm volatile("\n" \
71 "1:\t" \
72 "mov %0,%%" #seg "\n" \
73 "2:\n" \
74 ".section .fixup,\"ax\"\n" \
75 "3:\t" \
76 "pushl $0\n\t" \
77 "popl %%" #seg "\n\t" \
78 "jmp 2b\n" \
79 ".previous\n" \
80 ".section __ex_table,\"a\"\n\t" \
81 ".align 4\n\t" \
82 ".long 1b,3b\n" \
83 ".previous" \
84 : :"rm" (value))
85
86/*
87 * Save a segment register away
88 */
89#define savesegment(seg, value) \
90 asm volatile("mov %%" #seg ",%0":"=rm" (value))
91
92
93static inline void native_clts(void)
94{
95 asm volatile ("clts");
96}
97
98static inline unsigned long native_read_cr0(void)
99{
100 unsigned long val;
101 asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
102 return val;
103}
104
105static inline void native_write_cr0(unsigned long val)
106{
107 asm volatile("movl %0,%%cr0": :"r" (val));
108}
109
110static inline unsigned long native_read_cr2(void)
111{
112 unsigned long val;
113 asm volatile("movl %%cr2,%0\n\t" :"=r" (val));
114 return val;
115}
116
117static inline void native_write_cr2(unsigned long val)
118{
119 asm volatile("movl %0,%%cr2": :"r" (val));
120}
121
122static inline unsigned long native_read_cr3(void)
123{
124 unsigned long val;
125 asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
126 return val;
127}
128
129static inline void native_write_cr3(unsigned long val)
130{
131 asm volatile("movl %0,%%cr3": :"r" (val));
132}
133
134static inline unsigned long native_read_cr4(void)
135{
136 unsigned long val;
137 asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
138 return val;
139}
140
141static inline unsigned long native_read_cr4_safe(void)
142{
143 unsigned long val;
144 /* This could fault if %cr4 does not exist */
145 asm volatile("1: movl %%cr4, %0 \n"
146 "2: \n"
147 ".section __ex_table,\"a\" \n"
148 ".long 1b,2b \n"
149 ".previous \n"
150 : "=r" (val): "0" (0));
151 return val;
152}
153
154static inline void native_write_cr4(unsigned long val)
155{
156 asm volatile("movl %0,%%cr4": :"r" (val));
157}
158
159static inline void native_wbinvd(void)
160{
161 asm volatile("wbinvd": : :"memory");
162}
163
164static inline void clflush(volatile void *__p)
165{
166 asm volatile("clflush %0" : "+m" (*(char __force *)__p));
167}
168
169#ifdef CONFIG_PARAVIRT
170#include <asm/paravirt.h>
171#else
172#define read_cr0() (native_read_cr0())
173#define write_cr0(x) (native_write_cr0(x))
174#define read_cr2() (native_read_cr2())
175#define write_cr2(x) (native_write_cr2(x))
176#define read_cr3() (native_read_cr3())
177#define write_cr3(x) (native_write_cr3(x))
178#define read_cr4() (native_read_cr4())
179#define read_cr4_safe() (native_read_cr4_safe())
180#define write_cr4(x) (native_write_cr4(x))
181#define wbinvd() (native_wbinvd())
182
183/* Clear the 'TS' bit */
184#define clts() (native_clts())
185
186#endif/* CONFIG_PARAVIRT */
187
188/* Set the 'TS' bit */
189#define stts() write_cr0(8 | read_cr0())
190
191#endif /* __KERNEL__ */
192
193static inline unsigned long get_limit(unsigned long segment)
194{
195 unsigned long __limit;
196 __asm__("lsll %1,%0"
197 :"=r" (__limit):"r" (segment));
198 return __limit+1;
199}
200
201#define nop() __asm__ __volatile__ ("nop")
202
203/*
204 * Force strict CPU ordering.
205 * And yes, this is required on UP too when we're talking
206 * to devices.
207 *
208 * For now, "wmb()" doesn't actually do anything, as all
209 * Intel CPU's follow what Intel calls a *Processor Order*,
210 * in which all writes are seen in the program order even
211 * outside the CPU.
212 *
213 * I expect future Intel CPU's to have a weaker ordering,
214 * but I'd also expect them to finally get their act together
215 * and add some real memory barriers if so.
216 *
217 * Some non intel clones support out of order store. wmb() ceases to be a
218 * nop for these.
219 */
220
221
222#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
223#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
224#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
225
226/**
227 * read_barrier_depends - Flush all pending reads that subsequents reads
228 * depend on.
229 *
230 * No data-dependent reads from memory-like regions are ever reordered
231 * over this barrier. All reads preceding this primitive are guaranteed
232 * to access memory (but not necessarily other CPUs' caches) before any
233 * reads following this primitive that depend on the data return by
234 * any of the preceding reads. This primitive is much lighter weight than
235 * rmb() on most CPUs, and is never heavier weight than is
236 * rmb().
237 *
238 * These ordering constraints are respected by both the local CPU
239 * and the compiler.
240 *
241 * Ordering is not guaranteed by anything other than these primitives,
242 * not even by data dependencies. See the documentation for
243 * memory_barrier() for examples and URLs to more information.
244 *
245 * For example, the following code would force ordering (the initial
246 * value of "a" is zero, "b" is one, and "p" is "&a"):
247 *
248 * <programlisting>
249 * CPU 0 CPU 1
250 *
251 * b = 2;
252 * memory_barrier();
253 * p = &b; q = p;
254 * read_barrier_depends();
255 * d = *q;
256 * </programlisting>
257 *
258 * because the read of "*q" depends on the read of "p" and these
259 * two reads are separated by a read_barrier_depends(). However,
260 * the following code, with the same initial values for "a" and "b":
261 *
262 * <programlisting>
263 * CPU 0 CPU 1
264 *
265 * a = 2;
266 * memory_barrier();
267 * b = 3; y = b;
268 * read_barrier_depends();
269 * x = a;
270 * </programlisting>
271 *
272 * does not enforce ordering, since there is no data dependency between
273 * the read of "a" and the read of "b". Therefore, on some CPUs, such
274 * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb()
275 * in cases like this where there are no data dependencies.
276 **/
277
278#define read_barrier_depends() do { } while(0)
279
280#ifdef CONFIG_SMP
281#define smp_mb() mb()
282#ifdef CONFIG_X86_PPRO_FENCE
283# define smp_rmb() rmb()
284#else
285# define smp_rmb() barrier()
286#endif
287#ifdef CONFIG_X86_OOSTORE
288# define smp_wmb() wmb()
289#else
290# define smp_wmb() barrier()
291#endif
292#define smp_read_barrier_depends() read_barrier_depends()
293#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
294#else
295#define smp_mb() barrier()
296#define smp_rmb() barrier()
297#define smp_wmb() barrier()
298#define smp_read_barrier_depends() do { } while(0)
299#define set_mb(var, value) do { var = value; barrier(); } while (0)
300#endif
301
302#include <linux/irqflags.h>
303
304/*
305 * disable hlt during certain critical i/o operations
306 */
307#define HAVE_DISABLE_HLT
308void disable_hlt(void);
309void enable_hlt(void);
310
311extern int es7000_plat;
312void cpu_idle_wait(void);
313
314extern unsigned long arch_align_stack(unsigned long sp);
315extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
316
317void default_idle(void);
318void __show_registers(struct pt_regs *, int all);
319
320#endif
diff --git a/include/asm-x86/system_64.h b/include/asm-x86/system_64.h
index 6e9e4841a2da..97fa251ccb2b 100644
--- a/include/asm-x86/system_64.h
+++ b/include/asm-x86/system_64.h
@@ -1,126 +1,9 @@
1#ifndef __ASM_SYSTEM_H 1#ifndef __ASM_SYSTEM_H
2#define __ASM_SYSTEM_H 2#define __ASM_SYSTEM_H
3 3
4#include <linux/kernel.h>
5#include <asm/segment.h> 4#include <asm/segment.h>
6#include <asm/cmpxchg.h> 5#include <asm/cmpxchg.h>
7 6
8#ifdef __KERNEL__
9
10/* entries in ARCH_DLINFO: */
11#ifdef CONFIG_IA32_EMULATION
12# define AT_VECTOR_SIZE_ARCH 2
13#else
14# define AT_VECTOR_SIZE_ARCH 1
15#endif
16
17#define __SAVE(reg,offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
18#define __RESTORE(reg,offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
19
20/* frame pointer must be last for get_wchan */
21#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
22#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t"
23
24#define __EXTRA_CLOBBER \
25 ,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15"
26
27/* Save restore flags to clear handle leaking NT */
28#define switch_to(prev,next,last) \
29 asm volatile(SAVE_CONTEXT \
30 "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
31 "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
32 "call __switch_to\n\t" \
33 ".globl thread_return\n" \
34 "thread_return:\n\t" \
35 "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \
36 "movq %P[thread_info](%%rsi),%%r8\n\t" \
37 LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
38 "movq %%rax,%%rdi\n\t" \
39 "jc ret_from_fork\n\t" \
40 RESTORE_CONTEXT \
41 : "=a" (last) \
42 : [next] "S" (next), [prev] "D" (prev), \
43 [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \
44 [ti_flags] "i" (offsetof(struct thread_info, flags)),\
45 [tif_fork] "i" (TIF_FORK), \
46 [thread_info] "i" (offsetof(struct task_struct, stack)), \
47 [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
48 : "memory", "cc" __EXTRA_CLOBBER)
49
50extern void load_gs_index(unsigned);
51
52/*
53 * Load a segment. Fall back on loading the zero
54 * segment if something goes wrong..
55 */
56#define loadsegment(seg,value) \
57 asm volatile("\n" \
58 "1:\t" \
59 "movl %k0,%%" #seg "\n" \
60 "2:\n" \
61 ".section .fixup,\"ax\"\n" \
62 "3:\t" \
63 "movl %1,%%" #seg "\n\t" \
64 "jmp 2b\n" \
65 ".previous\n" \
66 ".section __ex_table,\"a\"\n\t" \
67 ".align 8\n\t" \
68 ".quad 1b,3b\n" \
69 ".previous" \
70 : :"r" (value), "r" (0))
71
72/*
73 * Clear and set 'TS' bit respectively
74 */
75#define clts() __asm__ __volatile__ ("clts")
76
77static inline unsigned long read_cr0(void)
78{
79 unsigned long cr0;
80 asm volatile("movq %%cr0,%0" : "=r" (cr0));
81 return cr0;
82}
83
84static inline void write_cr0(unsigned long val)
85{
86 asm volatile("movq %0,%%cr0" :: "r" (val));
87}
88
89static inline unsigned long read_cr2(void)
90{
91 unsigned long cr2;
92 asm volatile("movq %%cr2,%0" : "=r" (cr2));
93 return cr2;
94}
95
96static inline void write_cr2(unsigned long val)
97{
98 asm volatile("movq %0,%%cr2" :: "r" (val));
99}
100
101static inline unsigned long read_cr3(void)
102{
103 unsigned long cr3;
104 asm volatile("movq %%cr3,%0" : "=r" (cr3));
105 return cr3;
106}
107
108static inline void write_cr3(unsigned long val)
109{
110 asm volatile("movq %0,%%cr3" :: "r" (val) : "memory");
111}
112
113static inline unsigned long read_cr4(void)
114{
115 unsigned long cr4;
116 asm volatile("movq %%cr4,%0" : "=r" (cr4));
117 return cr4;
118}
119
120static inline void write_cr4(unsigned long val)
121{
122 asm volatile("movq %0,%%cr4" :: "r" (val) : "memory");
123}
124 7
125static inline unsigned long read_cr8(void) 8static inline unsigned long read_cr8(void)
126{ 9{
@@ -134,52 +17,6 @@ static inline void write_cr8(unsigned long val)
134 asm volatile("movq %0,%%cr8" :: "r" (val) : "memory"); 17 asm volatile("movq %0,%%cr8" :: "r" (val) : "memory");
135} 18}
136 19
137#define stts() write_cr0(8 | read_cr0())
138
139#define wbinvd() \
140 __asm__ __volatile__ ("wbinvd": : :"memory")
141
142#endif /* __KERNEL__ */
143
144static inline void clflush(volatile void *__p)
145{
146 asm volatile("clflush %0" : "+m" (*(char __force *)__p));
147}
148
149#define nop() __asm__ __volatile__ ("nop")
150
151#ifdef CONFIG_SMP
152#define smp_mb() mb()
153#define smp_rmb() barrier()
154#define smp_wmb() barrier()
155#define smp_read_barrier_depends() do {} while(0)
156#else
157#define smp_mb() barrier()
158#define smp_rmb() barrier()
159#define smp_wmb() barrier()
160#define smp_read_barrier_depends() do {} while(0)
161#endif
162
163
164/*
165 * Force strict CPU ordering.
166 * And yes, this is required on UP too when we're talking
167 * to devices.
168 */
169#define mb() asm volatile("mfence":::"memory")
170#define rmb() asm volatile("lfence":::"memory")
171#define wmb() asm volatile("sfence" ::: "memory")
172
173#define read_barrier_depends() do {} while(0)
174#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
175
176#define warn_if_not_ulong(x) do { unsigned long foo; (void) (&(x) == &foo); } while (0)
177
178#include <linux/irqflags.h> 20#include <linux/irqflags.h>
179 21
180void cpu_idle_wait(void);
181
182extern unsigned long arch_align_stack(unsigned long sp);
183extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
184
185#endif 22#endif
diff --git a/include/asm-x86/thread_info_32.h b/include/asm-x86/thread_info_32.h
index a516e9192f11..5bd508260ffb 100644
--- a/include/asm-x86/thread_info_32.h
+++ b/include/asm-x86/thread_info_32.h
@@ -138,6 +138,10 @@ static inline struct thread_info *current_thread_info(void)
138#define TIF_IO_BITMAP 18 /* uses I/O bitmap */ 138#define TIF_IO_BITMAP 18 /* uses I/O bitmap */
139#define TIF_FREEZE 19 /* is freezing for suspend */ 139#define TIF_FREEZE 19 /* is freezing for suspend */
140#define TIF_NOTSC 20 /* TSC is not accessible in userland */ 140#define TIF_NOTSC 20 /* TSC is not accessible in userland */
141#define TIF_FORCED_TF 21 /* true if TF in eflags artificially */
142#define TIF_DEBUGCTLMSR 22 /* uses thread_struct.debugctlmsr */
143#define TIF_DS_AREA_MSR 23 /* uses thread_struct.ds_area_msr */
144#define TIF_BTS_TRACE_TS 24 /* record scheduling event timestamps */
141 145
142#define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) 146#define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
143#define _TIF_SIGPENDING (1<<TIF_SIGPENDING) 147#define _TIF_SIGPENDING (1<<TIF_SIGPENDING)
@@ -153,6 +157,10 @@ static inline struct thread_info *current_thread_info(void)
153#define _TIF_IO_BITMAP (1<<TIF_IO_BITMAP) 157#define _TIF_IO_BITMAP (1<<TIF_IO_BITMAP)
154#define _TIF_FREEZE (1<<TIF_FREEZE) 158#define _TIF_FREEZE (1<<TIF_FREEZE)
155#define _TIF_NOTSC (1<<TIF_NOTSC) 159#define _TIF_NOTSC (1<<TIF_NOTSC)
160#define _TIF_FORCED_TF (1<<TIF_FORCED_TF)
161#define _TIF_DEBUGCTLMSR (1<<TIF_DEBUGCTLMSR)
162#define _TIF_DS_AREA_MSR (1<<TIF_DS_AREA_MSR)
163#define _TIF_BTS_TRACE_TS (1<<TIF_BTS_TRACE_TS)
156 164
157/* work to do on interrupt/exception return */ 165/* work to do on interrupt/exception return */
158#define _TIF_WORK_MASK \ 166#define _TIF_WORK_MASK \
@@ -162,8 +170,12 @@ static inline struct thread_info *current_thread_info(void)
162#define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP) 170#define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP)
163 171
164/* flags to check in __switch_to() */ 172/* flags to check in __switch_to() */
165#define _TIF_WORK_CTXSW_NEXT (_TIF_IO_BITMAP | _TIF_NOTSC | _TIF_DEBUG) 173#define _TIF_WORK_CTXSW \
166#define _TIF_WORK_CTXSW_PREV (_TIF_IO_BITMAP | _TIF_NOTSC) 174 (_TIF_IO_BITMAP | _TIF_NOTSC | _TIF_DEBUGCTLMSR | \
175 _TIF_DS_AREA_MSR | _TIF_BTS_TRACE_TS)
176#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
177#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW | _TIF_DEBUG)
178
167 179
168/* 180/*
169 * Thread-synchronous status. 181 * Thread-synchronous status.
diff --git a/include/asm-x86/thread_info_64.h b/include/asm-x86/thread_info_64.h
index 7f6ee68f0002..9b531ea015a8 100644
--- a/include/asm-x86/thread_info_64.h
+++ b/include/asm-x86/thread_info_64.h
@@ -21,7 +21,7 @@
21#ifndef __ASSEMBLY__ 21#ifndef __ASSEMBLY__
22struct task_struct; 22struct task_struct;
23struct exec_domain; 23struct exec_domain;
24#include <asm/mmsegment.h> 24#include <asm/processor.h>
25 25
26struct thread_info { 26struct thread_info {
27 struct task_struct *task; /* main task structure */ 27 struct task_struct *task; /* main task structure */
@@ -33,6 +33,9 @@ struct thread_info {
33 33
34 mm_segment_t addr_limit; 34 mm_segment_t addr_limit;
35 struct restart_block restart_block; 35 struct restart_block restart_block;
36#ifdef CONFIG_IA32_EMULATION
37 void __user *sysenter_return;
38#endif
36}; 39};
37#endif 40#endif
38 41
@@ -74,20 +77,14 @@ static inline struct thread_info *stack_thread_info(void)
74 77
75/* thread information allocation */ 78/* thread information allocation */
76#ifdef CONFIG_DEBUG_STACK_USAGE 79#ifdef CONFIG_DEBUG_STACK_USAGE
77#define alloc_thread_info(tsk) \ 80#define THREAD_FLAGS (GFP_KERNEL | __GFP_ZERO)
78 ({ \
79 struct thread_info *ret; \
80 \
81 ret = ((struct thread_info *) __get_free_pages(GFP_KERNEL,THREAD_ORDER)); \
82 if (ret) \
83 memset(ret, 0, THREAD_SIZE); \
84 ret; \
85 })
86#else 81#else
87#define alloc_thread_info(tsk) \ 82#define THREAD_FLAGS GFP_KERNEL
88 ((struct thread_info *) __get_free_pages(GFP_KERNEL,THREAD_ORDER))
89#endif 83#endif
90 84
85#define alloc_thread_info(tsk) \
86 ((struct thread_info *) __get_free_pages(THREAD_FLAGS, THREAD_ORDER))
87
91#define free_thread_info(ti) free_pages((unsigned long) (ti), THREAD_ORDER) 88#define free_thread_info(ti) free_pages((unsigned long) (ti), THREAD_ORDER)
92 89
93#else /* !__ASSEMBLY__ */ 90#else /* !__ASSEMBLY__ */
@@ -124,6 +121,10 @@ static inline struct thread_info *stack_thread_info(void)
124#define TIF_DEBUG 21 /* uses debug registers */ 121#define TIF_DEBUG 21 /* uses debug registers */
125#define TIF_IO_BITMAP 22 /* uses I/O bitmap */ 122#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
126#define TIF_FREEZE 23 /* is freezing for suspend */ 123#define TIF_FREEZE 23 /* is freezing for suspend */
124#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
125#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */
 126#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
 127#define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */
127 128
128#define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) 129#define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
129#define _TIF_SIGPENDING (1<<TIF_SIGPENDING) 130#define _TIF_SIGPENDING (1<<TIF_SIGPENDING)
@@ -141,6 +142,10 @@ static inline struct thread_info *stack_thread_info(void)
141#define _TIF_DEBUG (1<<TIF_DEBUG) 142#define _TIF_DEBUG (1<<TIF_DEBUG)
142#define _TIF_IO_BITMAP (1<<TIF_IO_BITMAP) 143#define _TIF_IO_BITMAP (1<<TIF_IO_BITMAP)
143#define _TIF_FREEZE (1<<TIF_FREEZE) 144#define _TIF_FREEZE (1<<TIF_FREEZE)
145#define _TIF_FORCED_TF (1<<TIF_FORCED_TF)
146#define _TIF_DEBUGCTLMSR (1<<TIF_DEBUGCTLMSR)
147#define _TIF_DS_AREA_MSR (1<<TIF_DS_AREA_MSR)
148#define _TIF_BTS_TRACE_TS (1<<TIF_BTS_TRACE_TS)
144 149
145/* work to do on interrupt/exception return */ 150/* work to do on interrupt/exception return */
146#define _TIF_WORK_MASK \ 151#define _TIF_WORK_MASK \
@@ -152,7 +157,10 @@ static inline struct thread_info *stack_thread_info(void)
152 (_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY|_TIF_HRTICK_RESCHED) 157 (_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY|_TIF_HRTICK_RESCHED)
153 158
154/* flags to check in __switch_to() */ 159/* flags to check in __switch_to() */
155#define _TIF_WORK_CTXSW (_TIF_DEBUG|_TIF_IO_BITMAP) 160#define _TIF_WORK_CTXSW \
161 (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS)
162#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
163#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
156 164
157#define PREEMPT_ACTIVE 0x10000000 165#define PREEMPT_ACTIVE 0x10000000
158 166
diff --git a/include/asm-x86/time.h b/include/asm-x86/time.h
index eac011366dc2..68779b048a3e 100644
--- a/include/asm-x86/time.h
+++ b/include/asm-x86/time.h
@@ -1,8 +1,12 @@
1#ifndef _ASMi386_TIME_H 1#ifndef _ASMX86_TIME_H
2#define _ASMi386_TIME_H 2#define _ASMX86_TIME_H
3 3
4extern void (*late_time_init)(void);
5extern void hpet_time_init(void);
6
7#include <asm/mc146818rtc.h>
8#ifdef CONFIG_X86_32
4#include <linux/efi.h> 9#include <linux/efi.h>
5#include "mach_time.h"
6 10
7static inline unsigned long native_get_wallclock(void) 11static inline unsigned long native_get_wallclock(void)
8{ 12{
@@ -28,8 +32,20 @@ static inline int native_set_wallclock(unsigned long nowtime)
28 return retval; 32 return retval;
29} 33}
30 34
31extern void (*late_time_init)(void); 35#else
32extern void hpet_time_init(void); 36extern void native_time_init_hook(void);
37
38static inline unsigned long native_get_wallclock(void)
39{
40 return mach_get_cmos_time();
41}
42
43static inline int native_set_wallclock(unsigned long nowtime)
44{
45 return mach_set_rtc_mmss(nowtime);
46}
47
48#endif
33 49
34#ifdef CONFIG_PARAVIRT 50#ifdef CONFIG_PARAVIRT
35#include <asm/paravirt.h> 51#include <asm/paravirt.h>
diff --git a/include/asm-x86/timer.h b/include/asm-x86/timer.h
index 0db7e994fb8b..4f6fcb050c11 100644
--- a/include/asm-x86/timer.h
+++ b/include/asm-x86/timer.h
@@ -2,6 +2,7 @@
2#define _ASMi386_TIMER_H 2#define _ASMi386_TIMER_H
3#include <linux/init.h> 3#include <linux/init.h>
4#include <linux/pm.h> 4#include <linux/pm.h>
5#include <linux/percpu.h>
5 6
6#define TICK_SIZE (tick_nsec / 1000) 7#define TICK_SIZE (tick_nsec / 1000)
7 8
@@ -16,7 +17,7 @@ extern int recalibrate_cpu_khz(void);
16#define calculate_cpu_khz() native_calculate_cpu_khz() 17#define calculate_cpu_khz() native_calculate_cpu_khz()
17#endif 18#endif
18 19
19/* Accellerators for sched_clock() 20/* Accelerators for sched_clock()
20 * convert from cycles(64bits) => nanoseconds (64bits) 21 * convert from cycles(64bits) => nanoseconds (64bits)
21 * basic equation: 22 * basic equation:
22 * ns = cycles / (freq / ns_per_sec) 23 * ns = cycles / (freq / ns_per_sec)
@@ -31,20 +32,32 @@ extern int recalibrate_cpu_khz(void);
31 * And since SC is a constant power of two, we can convert the div 32 * And since SC is a constant power of two, we can convert the div
32 * into a shift. 33 * into a shift.
33 * 34 *
34 * We can use khz divisor instead of mhz to keep a better percision, since 35 * We can use khz divisor instead of mhz to keep a better precision, since
35 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. 36 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
36 * (mathieu.desnoyers@polymtl.ca) 37 * (mathieu.desnoyers@polymtl.ca)
37 * 38 *
38 * -johnstul@us.ibm.com "math is hard, lets go shopping!" 39 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
39 */ 40 */
40extern unsigned long cyc2ns_scale __read_mostly; 41
42DECLARE_PER_CPU(unsigned long, cyc2ns);
41 43
42#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ 44#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
43 45
44static inline unsigned long long cycles_2_ns(unsigned long long cyc) 46static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
45{ 47{
46 return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; 48 return cyc * per_cpu(cyc2ns, smp_processor_id()) >> CYC2NS_SCALE_FACTOR;
47} 49}
48 50
51static inline unsigned long long cycles_2_ns(unsigned long long cyc)
52{
53 unsigned long long ns;
54 unsigned long flags;
55
56 local_irq_save(flags);
57 ns = __cycles_2_ns(cyc);
58 local_irq_restore(flags);
59
60 return ns;
61}
49 62
50#endif 63#endif
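
The timer.h comment above reduces cycles-to-nanoseconds conversion to ns = cycles * (10^6 * 2^10 / cpu_khz) >> 10. A small stand-alone check of that arithmetic for a hypothetical 2 GHz CPU follows; the cpu_khz value and the demo program are assumptions for illustration only.

#include <stdio.h>

#define CYC2NS_SCALE_FACTOR 10                  /* 2^10, as in the header */

int main(void)
{
        unsigned long cpu_khz = 2000000;        /* hypothetical 2 GHz CPU */
        unsigned long cyc2ns_scale =
                (1000000UL << CYC2NS_SCALE_FACTOR) / cpu_khz;   /* = 512 */
        unsigned long long cycles = 4000000000ULL;              /* ~2 s of cycles */

        /* ns = cycles * scale >> 10;  4e9 * 512 / 1024 = 2e9 ns = 2 s */
        unsigned long long ns =
                (cycles * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;

        printf("%llu cycles -> %llu ns\n", cycles, ns);
        return 0;
}
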
diff --git a/include/asm-x86/timex.h b/include/asm-x86/timex.h
index 39a21ab030f0..27cfd6c599ba 100644
--- a/include/asm-x86/timex.h
+++ b/include/asm-x86/timex.h
@@ -7,6 +7,8 @@
7 7
8#ifdef CONFIG_X86_ELAN 8#ifdef CONFIG_X86_ELAN
9# define PIT_TICK_RATE 1189200 /* AMD Elan has different frequency! */ 9# define PIT_TICK_RATE 1189200 /* AMD Elan has different frequency! */
10#elif defined(CONFIG_X86_RDC321X)
11# define PIT_TICK_RATE 1041667 /* Underlying HZ for R8610 */
10#else 12#else
11# define PIT_TICK_RATE 1193182 /* Underlying HZ */ 13# define PIT_TICK_RATE 1193182 /* Underlying HZ */
12#endif 14#endif
diff --git a/include/asm-x86/tlbflush.h b/include/asm-x86/tlbflush.h
index 9af4cc83a1af..3998709ed637 100644
--- a/include/asm-x86/tlbflush.h
+++ b/include/asm-x86/tlbflush.h
@@ -1,5 +1,158 @@
1#ifndef _ASM_X86_TLBFLUSH_H
2#define _ASM_X86_TLBFLUSH_H
3
4#include <linux/mm.h>
5#include <linux/sched.h>
6
7#include <asm/processor.h>
8#include <asm/system.h>
9
10#ifdef CONFIG_PARAVIRT
11#include <asm/paravirt.h>
12#else
13#define __flush_tlb() __native_flush_tlb()
14#define __flush_tlb_global() __native_flush_tlb_global()
15#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
16#endif
17
18static inline void __native_flush_tlb(void)
19{
20 write_cr3(read_cr3());
21}
22
23static inline void __native_flush_tlb_global(void)
24{
25 unsigned long cr4 = read_cr4();
26
27 /* clear PGE */
28 write_cr4(cr4 & ~X86_CR4_PGE);
29 /* write old PGE again and flush TLBs */
30 write_cr4(cr4);
31}
32
33static inline void __native_flush_tlb_single(unsigned long addr)
34{
35 __asm__ __volatile__("invlpg (%0)" ::"r" (addr) : "memory");
36}
37
38static inline void __flush_tlb_all(void)
39{
40 if (cpu_has_pge)
41 __flush_tlb_global();
42 else
43 __flush_tlb();
44}
45
46static inline void __flush_tlb_one(unsigned long addr)
47{
48 if (cpu_has_invlpg)
49 __flush_tlb_single(addr);
50 else
51 __flush_tlb();
52}
53
1#ifdef CONFIG_X86_32 54#ifdef CONFIG_X86_32
2# include "tlbflush_32.h" 55# define TLB_FLUSH_ALL 0xffffffff
3#else 56#else
4# include "tlbflush_64.h" 57# define TLB_FLUSH_ALL -1ULL
58#endif
59
60/*
61 * TLB flushing:
62 *
63 * - flush_tlb() flushes the current mm struct TLBs
64 * - flush_tlb_all() flushes all processes TLBs
65 * - flush_tlb_mm(mm) flushes the specified mm context TLB's
66 * - flush_tlb_page(vma, vmaddr) flushes one page
67 * - flush_tlb_range(vma, start, end) flushes a range of pages
68 * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
69 * - flush_tlb_others(cpumask, mm, va) flushes TLBs on other cpus
70 *
71 * ..but the i386 has somewhat limited tlb flushing capabilities,
72 * and page-granular flushes are available only on i486 and up.
73 *
74 * x86-64 can only flush individual pages or full VMs. For a range flush
75 * we always do the full VM. Might be worth trying if for a small
76 * range a few INVLPGs in a row are a win.
77 */
78
79#ifndef CONFIG_SMP
80
81#define flush_tlb() __flush_tlb()
82#define flush_tlb_all() __flush_tlb_all()
83#define local_flush_tlb() __flush_tlb()
84
85static inline void flush_tlb_mm(struct mm_struct *mm)
86{
87 if (mm == current->active_mm)
88 __flush_tlb();
89}
90
91static inline void flush_tlb_page(struct vm_area_struct *vma,
92 unsigned long addr)
93{
94 if (vma->vm_mm == current->active_mm)
95 __flush_tlb_one(addr);
96}
97
98static inline void flush_tlb_range(struct vm_area_struct *vma,
99 unsigned long start, unsigned long end)
100{
101 if (vma->vm_mm == current->active_mm)
102 __flush_tlb();
103}
104
105static inline void native_flush_tlb_others(const cpumask_t *cpumask,
106 struct mm_struct *mm,
107 unsigned long va)
108{
109}
110
111#else /* SMP */
112
113#include <asm/smp.h>
114
115#define local_flush_tlb() __flush_tlb()
116
117extern void flush_tlb_all(void);
118extern void flush_tlb_current_task(void);
119extern void flush_tlb_mm(struct mm_struct *);
120extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
121
122#define flush_tlb() flush_tlb_current_task()
123
124static inline void flush_tlb_range(struct vm_area_struct *vma,
125 unsigned long start, unsigned long end)
126{
127 flush_tlb_mm(vma->vm_mm);
128}
129
130void native_flush_tlb_others(const cpumask_t *cpumask, struct mm_struct *mm,
131 unsigned long va);
132
133#define TLBSTATE_OK 1
134#define TLBSTATE_LAZY 2
135
136#ifdef CONFIG_X86_32
137struct tlb_state
138{
139 struct mm_struct *active_mm;
140 int state;
141 char __cacheline_padding[L1_CACHE_BYTES-8];
142};
143DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
144#endif
145
146#endif /* SMP */
147
148#ifndef CONFIG_PARAVIRT
149#define flush_tlb_others(mask, mm, va) native_flush_tlb_others(&mask, mm, va)
5#endif 150#endif
151
152static inline void flush_tlb_kernel_range(unsigned long start,
153 unsigned long end)
154{
155 flush_tlb_all();
156}
157
158#endif /* _ASM_X86_TLBFLUSH_H */
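
The unified tlbflush.h above enumerates the flushing entry points: flush_tlb_page() for a single user page, flush_tlb_mm() and flush_tlb_range() for a whole address space, and flush_tlb_kernel_range() for kernel mappings. A hedged sketch of typical call sites follows; the demo_* helpers are illustrative and not part of this patch.

#include <linux/mm.h>
#include <asm/tlbflush.h>

/* Sketch: after changing one user PTE, only that page's translation
 * needs to be invalidated on the CPUs using this mm. */
static void demo_update_one_pte(struct vm_area_struct *vma, unsigned long addr)
{
        /* ... modify the page-table entry for addr here ... */
        flush_tlb_page(vma, addr);
}

/* Sketch: after remapping a range of kernel virtual addresses, use the
 * range helper, which on x86 falls back to flush_tlb_all(). */
static void demo_update_kernel_range(unsigned long start, unsigned long end)
{
        /* ... update kernel page tables for [start, end) here ... */
        flush_tlb_kernel_range(start, end);
}
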
diff --git a/include/asm-x86/tlbflush_32.h b/include/asm-x86/tlbflush_32.h
deleted file mode 100644
index 2bd5b95e2048..000000000000
--- a/include/asm-x86/tlbflush_32.h
+++ /dev/null
@@ -1,168 +0,0 @@
1#ifndef _I386_TLBFLUSH_H
2#define _I386_TLBFLUSH_H
3
4#include <linux/mm.h>
5#include <asm/processor.h>
6
7#ifdef CONFIG_PARAVIRT
8#include <asm/paravirt.h>
9#else
10#define __flush_tlb() __native_flush_tlb()
11#define __flush_tlb_global() __native_flush_tlb_global()
12#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
13#endif
14
15#define __native_flush_tlb() \
16 do { \
17 unsigned int tmpreg; \
18 \
19 __asm__ __volatile__( \
20 "movl %%cr3, %0; \n" \
21 "movl %0, %%cr3; # flush TLB \n" \
22 : "=r" (tmpreg) \
23 :: "memory"); \
24 } while (0)
25
26/*
27 * Global pages have to be flushed a bit differently. Not a real
28 * performance problem because this does not happen often.
29 */
30#define __native_flush_tlb_global() \
31 do { \
32 unsigned int tmpreg, cr4, cr4_orig; \
33 \
34 __asm__ __volatile__( \
35 "movl %%cr4, %2; # turn off PGE \n" \
36 "movl %2, %1; \n" \
37 "andl %3, %1; \n" \
38 "movl %1, %%cr4; \n" \
39 "movl %%cr3, %0; \n" \
40 "movl %0, %%cr3; # flush TLB \n" \
41 "movl %2, %%cr4; # turn PGE back on \n" \
42 : "=&r" (tmpreg), "=&r" (cr4), "=&r" (cr4_orig) \
43 : "i" (~X86_CR4_PGE) \
44 : "memory"); \
45 } while (0)
46
47#define __native_flush_tlb_single(addr) \
48 __asm__ __volatile__("invlpg (%0)" ::"r" (addr) : "memory")
49
50# define __flush_tlb_all() \
51 do { \
52 if (cpu_has_pge) \
53 __flush_tlb_global(); \
54 else \
55 __flush_tlb(); \
56 } while (0)
57
58#define cpu_has_invlpg (boot_cpu_data.x86 > 3)
59
60#ifdef CONFIG_X86_INVLPG
61# define __flush_tlb_one(addr) __flush_tlb_single(addr)
62#else
63# define __flush_tlb_one(addr) \
64 do { \
65 if (cpu_has_invlpg) \
66 __flush_tlb_single(addr); \
67 else \
68 __flush_tlb(); \
69 } while (0)
70#endif
71
72/*
73 * TLB flushing:
74 *
75 * - flush_tlb() flushes the current mm struct TLBs
76 * - flush_tlb_all() flushes all processes TLBs
77 * - flush_tlb_mm(mm) flushes the specified mm context TLB's
78 * - flush_tlb_page(vma, vmaddr) flushes one page
79 * - flush_tlb_range(vma, start, end) flushes a range of pages
80 * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
81 * - flush_tlb_others(cpumask, mm, va) flushes a TLBs on other cpus
82 *
83 * ..but the i386 has somewhat limited tlb flushing capabilities,
84 * and page-granular flushes are available only on i486 and up.
85 */
86
87#define TLB_FLUSH_ALL 0xffffffff
88
89
90#ifndef CONFIG_SMP
91
92#include <linux/sched.h>
93
94#define flush_tlb() __flush_tlb()
95#define flush_tlb_all() __flush_tlb_all()
96#define local_flush_tlb() __flush_tlb()
97
98static inline void flush_tlb_mm(struct mm_struct *mm)
99{
100 if (mm == current->active_mm)
101 __flush_tlb();
102}
103
104static inline void flush_tlb_page(struct vm_area_struct *vma,
105 unsigned long addr)
106{
107 if (vma->vm_mm == current->active_mm)
108 __flush_tlb_one(addr);
109}
110
111static inline void flush_tlb_range(struct vm_area_struct *vma,
112 unsigned long start, unsigned long end)
113{
114 if (vma->vm_mm == current->active_mm)
115 __flush_tlb();
116}
117
118static inline void native_flush_tlb_others(const cpumask_t *cpumask,
119 struct mm_struct *mm, unsigned long va)
120{
121}
122
123#else /* SMP */
124
125#include <asm/smp.h>
126
127#define local_flush_tlb() \
128 __flush_tlb()
129
130extern void flush_tlb_all(void);
131extern void flush_tlb_current_task(void);
132extern void flush_tlb_mm(struct mm_struct *);
133extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
134
135#define flush_tlb() flush_tlb_current_task()
136
137static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
138{
139 flush_tlb_mm(vma->vm_mm);
140}
141
142void native_flush_tlb_others(const cpumask_t *cpumask, struct mm_struct *mm,
143 unsigned long va);
144
145#define TLBSTATE_OK 1
146#define TLBSTATE_LAZY 2
147
148struct tlb_state
149{
150 struct mm_struct *active_mm;
151 int state;
152 char __cacheline_padding[L1_CACHE_BYTES-8];
153};
154DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
155#endif /* SMP */
156
157#ifndef CONFIG_PARAVIRT
158#define flush_tlb_others(mask, mm, va) \
159 native_flush_tlb_others(&mask, mm, va)
160#endif
161
162static inline void flush_tlb_kernel_range(unsigned long start,
163 unsigned long end)
164{
165 flush_tlb_all();
166}
167
168#endif /* _I386_TLBFLUSH_H */
diff --git a/include/asm-x86/tlbflush_64.h b/include/asm-x86/tlbflush_64.h
deleted file mode 100644
index 7731fd23d572..000000000000
--- a/include/asm-x86/tlbflush_64.h
+++ /dev/null
@@ -1,100 +0,0 @@
1#ifndef _X8664_TLBFLUSH_H
2#define _X8664_TLBFLUSH_H
3
4#include <linux/mm.h>
5#include <linux/sched.h>
6#include <asm/processor.h>
7#include <asm/system.h>
8
9static inline void __flush_tlb(void)
10{
11 write_cr3(read_cr3());
12}
13
14static inline void __flush_tlb_all(void)
15{
16 unsigned long cr4 = read_cr4();
17 write_cr4(cr4 & ~X86_CR4_PGE); /* clear PGE */
18 write_cr4(cr4); /* write old PGE again and flush TLBs */
19}
20
21#define __flush_tlb_one(addr) \
22 __asm__ __volatile__("invlpg (%0)" :: "r" (addr) : "memory")
23
24
25/*
26 * TLB flushing:
27 *
28 * - flush_tlb() flushes the current mm struct TLBs
29 * - flush_tlb_all() flushes all processes TLBs
30 * - flush_tlb_mm(mm) flushes the specified mm context TLB's
31 * - flush_tlb_page(vma, vmaddr) flushes one page
32 * - flush_tlb_range(vma, start, end) flushes a range of pages
33 * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
34 *
35 * x86-64 can only flush individual pages or full VMs. For a range flush
36 * we always do the full VM. Might be worth trying if for a small
37 * range a few INVLPGs in a row are a win.
38 */
39
40#ifndef CONFIG_SMP
41
42#define flush_tlb() __flush_tlb()
43#define flush_tlb_all() __flush_tlb_all()
44#define local_flush_tlb() __flush_tlb()
45
46static inline void flush_tlb_mm(struct mm_struct *mm)
47{
48 if (mm == current->active_mm)
49 __flush_tlb();
50}
51
52static inline void flush_tlb_page(struct vm_area_struct *vma,
53 unsigned long addr)
54{
55 if (vma->vm_mm == current->active_mm)
56 __flush_tlb_one(addr);
57}
58
59static inline void flush_tlb_range(struct vm_area_struct *vma,
60 unsigned long start, unsigned long end)
61{
62 if (vma->vm_mm == current->active_mm)
63 __flush_tlb();
64}
65
66#else
67
68#include <asm/smp.h>
69
70#define local_flush_tlb() \
71 __flush_tlb()
72
73extern void flush_tlb_all(void);
74extern void flush_tlb_current_task(void);
75extern void flush_tlb_mm(struct mm_struct *);
76extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
77
78#define flush_tlb() flush_tlb_current_task()
79
80static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
81{
82 flush_tlb_mm(vma->vm_mm);
83}
84
85#define TLBSTATE_OK 1
86#define TLBSTATE_LAZY 2
87
88/* Roughly an IPI every 20MB with 4k pages for freeing page table
89 ranges. Cost is about 42k of memory for each CPU. */
90#define ARCH_FREE_PTE_NR 5350
91
92#endif
93
94static inline void flush_tlb_kernel_range(unsigned long start,
95 unsigned long end)
96{
97 flush_tlb_all();
98}
99
100#endif /* _X8664_TLBFLUSH_H */
diff --git a/include/asm-x86/topology.h b/include/asm-x86/topology.h
index b10fde9798ea..8af05a93f097 100644
--- a/include/asm-x86/topology.h
+++ b/include/asm-x86/topology.h
@@ -1,5 +1,188 @@
1/*
2 * Written by: Matthew Dobson, IBM Corporation
3 *
4 * Copyright (C) 2002, IBM Corp.
5 *
6 * All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
16 * NON INFRINGEMENT. See the GNU General Public License for more
17 * details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 *
23 * Send feedback to <colpatch@us.ibm.com>
24 */
25#ifndef _ASM_X86_TOPOLOGY_H
26#define _ASM_X86_TOPOLOGY_H
27
28#ifdef CONFIG_NUMA
29#include <linux/cpumask.h>
30#include <asm/mpspec.h>
31
32/* Mappings between logical cpu number and node number */
1#ifdef CONFIG_X86_32 33#ifdef CONFIG_X86_32
2# include "topology_32.h" 34extern int cpu_to_node_map[];
35
3#else 36#else
4# include "topology_64.h" 37DECLARE_PER_CPU(int, x86_cpu_to_node_map);
38extern int x86_cpu_to_node_map_init[];
39extern void *x86_cpu_to_node_map_early_ptr;
40/* Returns the number of the current Node. */
41#define numa_node_id() (early_cpu_to_node(raw_smp_processor_id()))
42#endif
43
44extern cpumask_t node_to_cpumask_map[];
45
46#define NUMA_NO_NODE (-1)
47
48/* Returns the number of the node containing CPU 'cpu' */
49#ifdef CONFIG_X86_32
50#define early_cpu_to_node(cpu) cpu_to_node(cpu)
51static inline int cpu_to_node(int cpu)
52{
53 return cpu_to_node_map[cpu];
54}
55
56#else /* CONFIG_X86_64 */
57static inline int early_cpu_to_node(int cpu)
58{
59 int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr;
60
61 if (cpu_to_node_map)
62 return cpu_to_node_map[cpu];
63 else if (per_cpu_offset(cpu))
64 return per_cpu(x86_cpu_to_node_map, cpu);
65 else
66 return NUMA_NO_NODE;
67}
68
69static inline int cpu_to_node(int cpu)
70{
71#ifdef CONFIG_DEBUG_PER_CPU_MAPS
72 if (x86_cpu_to_node_map_early_ptr) {
 73 printk(KERN_NOTICE "cpu_to_node(%d): usage too early!\n",
74 (int)cpu);
75 dump_stack();
76 return ((int *)x86_cpu_to_node_map_early_ptr)[cpu];
77 }
78#endif
79 if (per_cpu_offset(cpu))
80 return per_cpu(x86_cpu_to_node_map, cpu);
81 else
82 return NUMA_NO_NODE;
83}
84#endif /* CONFIG_X86_64 */
85
86/*
87 * Returns the number of the node containing Node 'node'. This
88 * architecture is flat, so it is a pretty simple function!
89 */
90#define parent_node(node) (node)
91
92/* Returns a bitmask of CPUs on Node 'node'. */
93static inline cpumask_t node_to_cpumask(int node)
94{
95 return node_to_cpumask_map[node];
96}
97
98/* Returns the number of the first CPU on Node 'node'. */
99static inline int node_to_first_cpu(int node)
100{
101 cpumask_t mask = node_to_cpumask(node);
102
103 return first_cpu(mask);
104}
105
106#define pcibus_to_node(bus) __pcibus_to_node(bus)
107#define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus)
108
109#ifdef CONFIG_X86_32
110extern unsigned long node_start_pfn[];
111extern unsigned long node_end_pfn[];
112extern unsigned long node_remap_size[];
113#define node_has_online_mem(nid) (node_start_pfn[nid] != node_end_pfn[nid])
114
115# ifdef CONFIG_X86_HT
116# define ENABLE_TOPO_DEFINES
117# endif
118
119# define SD_CACHE_NICE_TRIES 1
120# define SD_IDLE_IDX 1
121# define SD_NEWIDLE_IDX 2
122# define SD_FORKEXEC_IDX 0
123
124#else
125
126# ifdef CONFIG_SMP
127# define ENABLE_TOPO_DEFINES
128# endif
129
130# define SD_CACHE_NICE_TRIES 2
131# define SD_IDLE_IDX 2
132# define SD_NEWIDLE_IDX 0
133# define SD_FORKEXEC_IDX 1
134
135#endif
136
137/* sched_domains SD_NODE_INIT for NUMAQ machines */
138#define SD_NODE_INIT (struct sched_domain) { \
139 .span = CPU_MASK_NONE, \
140 .parent = NULL, \
141 .child = NULL, \
142 .groups = NULL, \
143 .min_interval = 8, \
144 .max_interval = 32, \
145 .busy_factor = 32, \
146 .imbalance_pct = 125, \
147 .cache_nice_tries = SD_CACHE_NICE_TRIES, \
148 .busy_idx = 3, \
149 .idle_idx = SD_IDLE_IDX, \
150 .newidle_idx = SD_NEWIDLE_IDX, \
151 .wake_idx = 1, \
152 .forkexec_idx = SD_FORKEXEC_IDX, \
153 .flags = SD_LOAD_BALANCE \
154 | SD_BALANCE_EXEC \
155 | SD_BALANCE_FORK \
156 | SD_SERIALIZE \
157 | SD_WAKE_BALANCE, \
158 .last_balance = jiffies, \
159 .balance_interval = 1, \
160 .nr_balance_failed = 0, \
161}
162
163#ifdef CONFIG_X86_64_ACPI_NUMA
164extern int __node_distance(int, int);
165#define node_distance(a, b) __node_distance(a, b)
166#endif
167
168#else /* CONFIG_NUMA */
169
170#include <asm-generic/topology.h>
171
172#endif
173
174extern cpumask_t cpu_coregroup_map(int cpu);
175
176#ifdef ENABLE_TOPO_DEFINES
177#define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id)
178#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id)
179#define topology_core_siblings(cpu) (per_cpu(cpu_core_map, cpu))
180#define topology_thread_siblings(cpu) (per_cpu(cpu_sibling_map, cpu))
181#endif
182
183#ifdef CONFIG_SMP
184#define mc_capable() (boot_cpu_data.x86_max_cores > 1)
185#define smt_capable() (smp_num_siblings > 1)
186#endif
187
5#endif 188#endif
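
A minimal usage sketch of the unified NUMA helpers above (illustrative only, not part of the patch; assumes CONFIG_NUMA and that 'cpu' is a valid CPU number):

	int node = cpu_to_node(cpu);			/* node owning 'cpu' */
	cpumask_t cpus_on_node = node_to_cpumask(node);	/* all CPUs on that node */
	int first = node_to_first_cpu(node);		/* == first_cpu(cpus_on_node) */
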
diff --git a/include/asm-x86/topology_32.h b/include/asm-x86/topology_32.h
deleted file mode 100644
index 9040f5a61278..000000000000
--- a/include/asm-x86/topology_32.h
+++ /dev/null
@@ -1,121 +0,0 @@
1/*
2 * linux/include/asm-i386/topology.h
3 *
4 * Written by: Matthew Dobson, IBM Corporation
5 *
6 * Copyright (C) 2002, IBM Corp.
7 *
8 * All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
18 * NON INFRINGEMENT. See the GNU General Public License for more
19 * details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 * Send feedback to <colpatch@us.ibm.com>
26 */
27#ifndef _ASM_I386_TOPOLOGY_H
28#define _ASM_I386_TOPOLOGY_H
29
30#ifdef CONFIG_X86_HT
31#define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id)
32#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id)
33#define topology_core_siblings(cpu) (per_cpu(cpu_core_map, cpu))
34#define topology_thread_siblings(cpu) (per_cpu(cpu_sibling_map, cpu))
35#endif
36
37#ifdef CONFIG_NUMA
38
39#include <asm/mpspec.h>
40
41#include <linux/cpumask.h>
42
43/* Mappings between logical cpu number and node number */
44extern cpumask_t node_2_cpu_mask[];
45extern int cpu_2_node[];
46
47/* Returns the number of the node containing CPU 'cpu' */
48static inline int cpu_to_node(int cpu)
49{
50 return cpu_2_node[cpu];
51}
52
53/* Returns the number of the node containing Node 'node'. This architecture is flat,
54 so it is a pretty simple function! */
55#define parent_node(node) (node)
56
57/* Returns a bitmask of CPUs on Node 'node'. */
58static inline cpumask_t node_to_cpumask(int node)
59{
60 return node_2_cpu_mask[node];
61}
62
63/* Returns the number of the first CPU on Node 'node'. */
64static inline int node_to_first_cpu(int node)
65{
66 cpumask_t mask = node_to_cpumask(node);
67 return first_cpu(mask);
68}
69
70#define pcibus_to_node(bus) ((struct pci_sysdata *)((bus)->sysdata))->node
71#define pcibus_to_cpumask(bus) node_to_cpumask(pcibus_to_node(bus))
72
73/* sched_domains SD_NODE_INIT for NUMAQ machines */
74#define SD_NODE_INIT (struct sched_domain) { \
75 .span = CPU_MASK_NONE, \
76 .parent = NULL, \
77 .child = NULL, \
78 .groups = NULL, \
79 .min_interval = 8, \
80 .max_interval = 32, \
81 .busy_factor = 32, \
82 .imbalance_pct = 125, \
83 .cache_nice_tries = 1, \
84 .busy_idx = 3, \
85 .idle_idx = 1, \
86 .newidle_idx = 2, \
87 .wake_idx = 1, \
88 .flags = SD_LOAD_BALANCE \
89 | SD_BALANCE_EXEC \
90 | SD_BALANCE_FORK \
91 | SD_SERIALIZE \
92 | SD_WAKE_BALANCE, \
93 .last_balance = jiffies, \
94 .balance_interval = 1, \
95 .nr_balance_failed = 0, \
96}
97
98extern unsigned long node_start_pfn[];
99extern unsigned long node_end_pfn[];
100extern unsigned long node_remap_size[];
101
102#define node_has_online_mem(nid) (node_start_pfn[nid] != node_end_pfn[nid])
103
104#else /* !CONFIG_NUMA */
105/*
106 * Other i386 platforms should define their own version of the
107 * above macros here.
108 */
109
110#include <asm-generic/topology.h>
111
112#endif /* CONFIG_NUMA */
113
114extern cpumask_t cpu_coregroup_map(int cpu);
115
116#ifdef CONFIG_SMP
117#define mc_capable() (boot_cpu_data.x86_max_cores > 1)
118#define smt_capable() (smp_num_siblings > 1)
119#endif
120
121#endif /* _ASM_I386_TOPOLOGY_H */
diff --git a/include/asm-x86/topology_64.h b/include/asm-x86/topology_64.h
deleted file mode 100644
index a718dda037e0..000000000000
--- a/include/asm-x86/topology_64.h
+++ /dev/null
@@ -1,71 +0,0 @@
1#ifndef _ASM_X86_64_TOPOLOGY_H
2#define _ASM_X86_64_TOPOLOGY_H
3
4
5#ifdef CONFIG_NUMA
6
7#include <asm/mpspec.h>
8#include <linux/bitops.h>
9
10extern cpumask_t cpu_online_map;
11
12extern unsigned char cpu_to_node[];
13extern cpumask_t node_to_cpumask[];
14
15#ifdef CONFIG_ACPI_NUMA
16extern int __node_distance(int, int);
17#define node_distance(a,b) __node_distance(a,b)
18/* #else fallback version */
19#endif
20
21#define cpu_to_node(cpu) (cpu_to_node[cpu])
22#define parent_node(node) (node)
23#define node_to_first_cpu(node) (first_cpu(node_to_cpumask[node]))
24#define node_to_cpumask(node) (node_to_cpumask[node])
25#define pcibus_to_node(bus) ((struct pci_sysdata *)((bus)->sysdata))->node
26#define pcibus_to_cpumask(bus) node_to_cpumask(pcibus_to_node(bus));
27
28#define numa_node_id() read_pda(nodenumber)
29
30/* sched_domains SD_NODE_INIT for x86_64 machines */
31#define SD_NODE_INIT (struct sched_domain) { \
32 .span = CPU_MASK_NONE, \
33 .parent = NULL, \
34 .child = NULL, \
35 .groups = NULL, \
36 .min_interval = 8, \
37 .max_interval = 32, \
38 .busy_factor = 32, \
39 .imbalance_pct = 125, \
40 .cache_nice_tries = 2, \
41 .busy_idx = 3, \
42 .idle_idx = 2, \
43 .newidle_idx = 0, \
44 .wake_idx = 1, \
45 .forkexec_idx = 1, \
46 .flags = SD_LOAD_BALANCE \
47 | SD_BALANCE_FORK \
48 | SD_BALANCE_EXEC \
49 | SD_SERIALIZE \
50 | SD_WAKE_BALANCE, \
51 .last_balance = jiffies, \
52 .balance_interval = 1, \
53 .nr_balance_failed = 0, \
54}
55
56#endif
57
58#ifdef CONFIG_SMP
59#define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id)
60#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id)
61#define topology_core_siblings(cpu) (per_cpu(cpu_core_map, cpu))
62#define topology_thread_siblings(cpu) (per_cpu(cpu_sibling_map, cpu))
63#define mc_capable() (boot_cpu_data.x86_max_cores > 1)
64#define smt_capable() (smp_num_siblings > 1)
65#endif
66
67#include <asm-generic/topology.h>
68
69extern cpumask_t cpu_coregroup_map(int cpu);
70
71#endif
diff --git a/include/asm-x86/tsc.h b/include/asm-x86/tsc.h
index 6baab30dc2c8..7d3e27f7d484 100644
--- a/include/asm-x86/tsc.h
+++ b/include/asm-x86/tsc.h
@@ -17,6 +17,8 @@ typedef unsigned long long cycles_t;
17extern unsigned int cpu_khz; 17extern unsigned int cpu_khz;
18extern unsigned int tsc_khz; 18extern unsigned int tsc_khz;
19 19
20extern void disable_TSC(void);
21
20static inline cycles_t get_cycles(void) 22static inline cycles_t get_cycles(void)
21{ 23{
22 unsigned long long ret = 0; 24 unsigned long long ret = 0;
@@ -25,39 +27,22 @@ static inline cycles_t get_cycles(void)
25 if (!cpu_has_tsc) 27 if (!cpu_has_tsc)
26 return 0; 28 return 0;
27#endif 29#endif
28
29#if defined(CONFIG_X86_GENERIC) || defined(CONFIG_X86_TSC)
30 rdtscll(ret); 30 rdtscll(ret);
31#endif 31
32 return ret; 32 return ret;
33} 33}
34 34
35/* Like get_cycles, but make sure the CPU is synchronized. */ 35static inline cycles_t vget_cycles(void)
36static __always_inline cycles_t get_cycles_sync(void)
37{ 36{
38 unsigned long long ret;
39 unsigned eax, edx;
40
41 /*
42 * Use RDTSCP if possible; it is guaranteed to be synchronous
43 * and doesn't cause a VMEXIT on Hypervisors
44 */
45 alternative_io(ASM_NOP3, ".byte 0x0f,0x01,0xf9", X86_FEATURE_RDTSCP,
46 ASM_OUTPUT2("=a" (eax), "=d" (edx)),
47 "a" (0U), "d" (0U) : "ecx", "memory");
48 ret = (((unsigned long long)edx) << 32) | ((unsigned long long)eax);
49 if (ret)
50 return ret;
51
52 /* 37 /*
 53 * Don't do an additional sync on CPUs where we know 38 * We only do VDSOs on TSC capable CPUs, so this shouldn't
54 * RDTSC is already synchronous: 39 * access boot_cpu_data (which is not VDSO-safe):
55 */ 40 */
56 alternative_io("cpuid", ASM_NOP2, X86_FEATURE_SYNC_RDTSC, 41#ifndef CONFIG_X86_TSC
57 "=a" (eax), "0" (1) : "ebx","ecx","edx","memory"); 42 if (!cpu_has_tsc)
58 rdtscll(ret); 43 return 0;
59 44#endif
60 return ret; 45 return (cycles_t) __native_read_tsc();
61} 46}
62 47
63extern void tsc_init(void); 48extern void tsc_init(void);
@@ -73,8 +58,7 @@ int check_tsc_unstable(void);
73extern void check_tsc_sync_source(int cpu); 58extern void check_tsc_sync_source(int cpu);
74extern void check_tsc_sync_target(void); 59extern void check_tsc_sync_target(void);
75 60
76#ifdef CONFIG_X86_64
77extern void tsc_calibrate(void); 61extern void tsc_calibrate(void);
78#endif 62extern int notsc_setup(char *);
79 63
80#endif 64#endif
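
A rough sketch of the simplified get_cycles()/vget_cycles() split above (illustrative; do_work() is a placeholder). get_cycles() is for general kernel code, while vget_cycles() is intended for VDSO/vsyscall contexts that must not touch boot_cpu_data:

	cycles_t t0, t1;

	t0 = get_cycles();	/* returns 0 when the CPU has no TSC on generic configs */
	do_work();
	t1 = get_cycles();
	/* t1 - t0 is a raw TSC delta; scale with cpu_khz/tsc_khz if needed. */
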
diff --git a/include/asm-x86/uaccess_64.h b/include/asm-x86/uaccess_64.h
index f4ce8768ad44..31d794702719 100644
--- a/include/asm-x86/uaccess_64.h
+++ b/include/asm-x86/uaccess_64.h
@@ -65,6 +65,8 @@ struct exception_table_entry
65 unsigned long insn, fixup; 65 unsigned long insn, fixup;
66}; 66};
67 67
68extern int fixup_exception(struct pt_regs *regs);
69
68#define ARCH_HAS_SEARCH_EXTABLE 70#define ARCH_HAS_SEARCH_EXTABLE
69 71
70/* 72/*
diff --git a/include/asm-x86/unistd_32.h b/include/asm-x86/unistd_32.h
index 9b15545eb9b5..8d8f9b5adbb9 100644
--- a/include/asm-x86/unistd_32.h
+++ b/include/asm-x86/unistd_32.h
@@ -333,8 +333,6 @@
333 333
334#ifdef __KERNEL__ 334#ifdef __KERNEL__
335 335
336#define NR_syscalls 325
337
338#define __ARCH_WANT_IPC_PARSE_VERSION 336#define __ARCH_WANT_IPC_PARSE_VERSION
339#define __ARCH_WANT_OLD_READDIR 337#define __ARCH_WANT_OLD_READDIR
340#define __ARCH_WANT_OLD_STAT 338#define __ARCH_WANT_OLD_STAT
diff --git a/include/asm-x86/user_32.h b/include/asm-x86/user_32.h
index 0e85d2a5e33a..ed8b8fc6906c 100644
--- a/include/asm-x86/user_32.h
+++ b/include/asm-x86/user_32.h
@@ -75,13 +75,23 @@ struct user_fxsr_struct {
75 * doesn't use the extra segment registers) 75 * doesn't use the extra segment registers)
76 */ 76 */
77struct user_regs_struct { 77struct user_regs_struct {
78 long ebx, ecx, edx, esi, edi, ebp, eax; 78 unsigned long bx;
79 unsigned short ds, __ds, es, __es; 79 unsigned long cx;
80 unsigned short fs, __fs, gs, __gs; 80 unsigned long dx;
81 long orig_eax, eip; 81 unsigned long si;
82 unsigned short cs, __cs; 82 unsigned long di;
83 long eflags, esp; 83 unsigned long bp;
84 unsigned short ss, __ss; 84 unsigned long ax;
85 unsigned long ds;
86 unsigned long es;
87 unsigned long fs;
88 unsigned long gs;
89 unsigned long orig_ax;
90 unsigned long ip;
91 unsigned long cs;
92 unsigned long flags;
93 unsigned long sp;
94 unsigned long ss;
85}; 95};
86 96
87/* When the kernel dumps core, it starts by dumping the user struct - 97/* When the kernel dumps core, it starts by dumping the user struct -
diff --git a/include/asm-x86/user_64.h b/include/asm-x86/user_64.h
index 12785c649ac5..a5449d456cc0 100644
--- a/include/asm-x86/user_64.h
+++ b/include/asm-x86/user_64.h
@@ -40,13 +40,13 @@
40 * and both the standard and SIMD floating point data can be accessed via 40 * and both the standard and SIMD floating point data can be accessed via
41 * the new ptrace requests. In either case, changes to the FPU environment 41 * the new ptrace requests. In either case, changes to the FPU environment
42 * will be reflected in the task's state as expected. 42 * will be reflected in the task's state as expected.
43 * 43 *
44 * x86-64 support by Andi Kleen. 44 * x86-64 support by Andi Kleen.
45 */ 45 */
46 46
47/* This matches the 64bit FXSAVE format as defined by AMD. It is the same 47/* This matches the 64bit FXSAVE format as defined by AMD. It is the same
48 as the 32bit format defined by Intel, except that the selector:offset pairs for 48 as the 32bit format defined by Intel, except that the selector:offset pairs for
49 data and eip are replaced with flat 64bit pointers. */ 49 data and eip are replaced with flat 64bit pointers. */
50struct user_i387_struct { 50struct user_i387_struct {
51 unsigned short cwd; 51 unsigned short cwd;
52 unsigned short swd; 52 unsigned short swd;
@@ -65,13 +65,34 @@ struct user_i387_struct {
65 * Segment register layout in coredumps. 65 * Segment register layout in coredumps.
66 */ 66 */
67struct user_regs_struct { 67struct user_regs_struct {
68 unsigned long r15,r14,r13,r12,rbp,rbx,r11,r10; 68 unsigned long r15;
69 unsigned long r9,r8,rax,rcx,rdx,rsi,rdi,orig_rax; 69 unsigned long r14;
70 unsigned long rip,cs,eflags; 70 unsigned long r13;
71 unsigned long rsp,ss; 71 unsigned long r12;
72 unsigned long fs_base, gs_base; 72 unsigned long bp;
73 unsigned long ds,es,fs,gs; 73 unsigned long bx;
74}; 74 unsigned long r11;
75 unsigned long r10;
76 unsigned long r9;
77 unsigned long r8;
78 unsigned long ax;
79 unsigned long cx;
80 unsigned long dx;
81 unsigned long si;
82 unsigned long di;
83 unsigned long orig_ax;
84 unsigned long ip;
85 unsigned long cs;
86 unsigned long flags;
87 unsigned long sp;
88 unsigned long ss;
89 unsigned long fs_base;
90 unsigned long gs_base;
91 unsigned long ds;
92 unsigned long es;
93 unsigned long fs;
94 unsigned long gs;
95};
75 96
76/* When the kernel dumps core, it starts by dumping the user struct - 97/* When the kernel dumps core, it starts by dumping the user struct -
77 this will be used by gdb to figure out where the data and stack segments 98 this will be used by gdb to figure out where the data and stack segments
@@ -94,7 +115,7 @@ struct user{
94 This is actually the bottom of the stack, 115 This is actually the bottom of the stack,
95 the top of the stack is always found in the 116 the top of the stack is always found in the
96 esp register. */ 117 esp register. */
97 long int signal; /* Signal that caused the core dump. */ 118 long int signal; /* Signal that caused the core dump. */
98 int reserved; /* No longer used */ 119 int reserved; /* No longer used */
99 int pad1; 120 int pad1;
100 struct user_pt_regs * u_ar0; /* Used by gdb to help find the values for */ 121 struct user_pt_regs * u_ar0; /* Used by gdb to help find the values for */
diff --git a/include/asm-x86/vdso.h b/include/asm-x86/vdso.h
new file mode 100644
index 000000000000..629bcb6e8e45
--- /dev/null
+++ b/include/asm-x86/vdso.h
@@ -0,0 +1,28 @@
1#ifndef _ASM_X86_VDSO_H
2#define _ASM_X86_VDSO_H 1
3
4#ifdef CONFIG_X86_64
5extern const char VDSO64_PRELINK[];
6
7/*
8 * Given a pointer to the vDSO image, find the pointer to VDSO64_name
9 * as that symbol is defined in the vDSO sources or linker script.
10 */
11#define VDSO64_SYMBOL(base, name) ({ \
12 extern const char VDSO64_##name[]; \
13 (void *) (VDSO64_##name - VDSO64_PRELINK + (unsigned long) (base)); })
14#endif
15
16#if defined CONFIG_X86_32 || defined CONFIG_COMPAT
17extern const char VDSO32_PRELINK[];
18
19/*
20 * Given a pointer to the vDSO image, find the pointer to VDSO32_name
21 * as that symbol is defined in the vDSO sources or linker script.
22 */
23#define VDSO32_SYMBOL(base, name) ({ \
24 extern const char VDSO32_##name[]; \
25 (void *) (VDSO32_##name - VDSO32_PRELINK + (unsigned long) (base)); })
26#endif
27
28#endif /* asm-x86/vdso.h */
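
The VDSO*_SYMBOL() macros above rebase a prelinked vDSO symbol against the address the image was actually mapped at. A hypothetical lookup (the 'sigreturn' symbol name and 'vdso_base' pointer are illustrative; the real users live in the arch vDSO setup code):

	void *addr = VDSO32_SYMBOL(vdso_base, sigreturn);
	/* expands to: VDSO32_sigreturn - VDSO32_PRELINK + (unsigned long)vdso_base */
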
diff --git a/include/asm-x86/vsyscall.h b/include/asm-x86/vsyscall.h
index f01c49f5d108..17b3700949bf 100644
--- a/include/asm-x86/vsyscall.h
+++ b/include/asm-x86/vsyscall.h
@@ -36,6 +36,8 @@ extern volatile unsigned long __jiffies;
36extern int vgetcpu_mode; 36extern int vgetcpu_mode;
37extern struct timezone sys_tz; 37extern struct timezone sys_tz;
38 38
39extern void map_vsyscall(void);
40
39#endif /* __KERNEL__ */ 41#endif /* __KERNEL__ */
40 42
41#endif /* _ASM_X86_64_VSYSCALL_H_ */ 43#endif /* _ASM_X86_64_VSYSCALL_H_ */
diff --git a/include/asm-x86/vsyscall32.h b/include/asm-x86/vsyscall32.h
deleted file mode 100644
index c631c082f8f7..000000000000
--- a/include/asm-x86/vsyscall32.h
+++ /dev/null
@@ -1,20 +0,0 @@
1#ifndef _ASM_VSYSCALL32_H
2#define _ASM_VSYSCALL32_H 1
3
4/* Values need to match arch/x86_64/ia32/vsyscall.lds */
5
6#ifdef __ASSEMBLY__
7#define VSYSCALL32_BASE 0xffffe000
8#define VSYSCALL32_SYSEXIT (VSYSCALL32_BASE + 0x410)
9#else
10#define VSYSCALL32_BASE 0xffffe000UL
11#define VSYSCALL32_END (VSYSCALL32_BASE + PAGE_SIZE)
12#define VSYSCALL32_EHDR ((const struct elf32_hdr *) VSYSCALL32_BASE)
13
14#define VSYSCALL32_VSYSCALL ((void *)VSYSCALL32_BASE + 0x400)
15#define VSYSCALL32_SYSEXIT ((void *)VSYSCALL32_BASE + 0x410)
16#define VSYSCALL32_SIGRETURN ((void __user *)VSYSCALL32_BASE + 0x500)
17#define VSYSCALL32_RTSIGRETURN ((void __user *)VSYSCALL32_BASE + 0x600)
18#endif
19
20#endif
diff --git a/include/asm-x86/xor_32.h b/include/asm-x86/xor_32.h
index 23c86cef3b25..a41ef1bdd424 100644
--- a/include/asm-x86/xor_32.h
+++ b/include/asm-x86/xor_32.h
@@ -1,6 +1,4 @@
1/* 1/*
2 * include/asm-i386/xor.h
3 *
4 * Optimized RAID-5 checksumming functions for MMX and SSE. 2 * Optimized RAID-5 checksumming functions for MMX and SSE.
5 * 3 *
6 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
diff --git a/include/asm-x86/xor_64.h b/include/asm-x86/xor_64.h
index f942fcc21831..1eee7fcb2420 100644
--- a/include/asm-x86/xor_64.h
+++ b/include/asm-x86/xor_64.h
@@ -1,6 +1,4 @@
1/* 1/*
2 * include/asm-x86_64/xor.h
3 *
4 * Optimized RAID-5 checksumming functions for MMX and SSE. 2 * Optimized RAID-5 checksumming functions for MMX and SSE.
5 * 3 *
6 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 27b9350052b4..85b2482cc736 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -100,7 +100,6 @@ header-y += iso_fs.h
100header-y += ixjuser.h 100header-y += ixjuser.h
101header-y += jffs2.h 101header-y += jffs2.h
102header-y += keyctl.h 102header-y += keyctl.h
103header-y += kvm.h
104header-y += limits.h 103header-y += limits.h
105header-y += lock_dlm_plock.h 104header-y += lock_dlm_plock.h
106header-y += magic.h 105header-y += magic.h
@@ -256,6 +255,7 @@ unifdef-y += kd.h
256unifdef-y += kernelcapi.h 255unifdef-y += kernelcapi.h
257unifdef-y += kernel.h 256unifdef-y += kernel.h
258unifdef-y += keyboard.h 257unifdef-y += keyboard.h
258unifdef-$(CONFIG_HAVE_KVM) += kvm.h
259unifdef-y += llc.h 259unifdef-y += llc.h
260unifdef-y += loop.h 260unifdef-y += loop.h
261unifdef-y += lp.h 261unifdef-y += lp.h
diff --git a/include/linux/acpi_pmtmr.h b/include/linux/acpi_pmtmr.h
index 1d0ef1ae8036..7e3d2859be50 100644
--- a/include/linux/acpi_pmtmr.h
+++ b/include/linux/acpi_pmtmr.h
@@ -25,6 +25,8 @@ static inline u32 acpi_pm_read_early(void)
25 return acpi_pm_read_verified() & ACPI_PM_MASK; 25 return acpi_pm_read_verified() & ACPI_PM_MASK;
26} 26}
27 27
28extern void pmtimer_wait(unsigned);
29
28#else 30#else
29 31
30static inline u32 acpi_pm_read_early(void) 32static inline u32 acpi_pm_read_early(void)
diff --git a/include/linux/audit.h b/include/linux/audit.h
index c68781692838..bdd6f5de5fc4 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -115,6 +115,8 @@
115#define AUDIT_MAC_IPSEC_ADDSPD 1413 /* Not used */ 115#define AUDIT_MAC_IPSEC_ADDSPD 1413 /* Not used */
116#define AUDIT_MAC_IPSEC_DELSPD 1414 /* Not used */ 116#define AUDIT_MAC_IPSEC_DELSPD 1414 /* Not used */
117#define AUDIT_MAC_IPSEC_EVENT 1415 /* Audit an IPSec event */ 117#define AUDIT_MAC_IPSEC_EVENT 1415 /* Audit an IPSec event */
118#define AUDIT_MAC_UNLBL_STCADD 1416 /* NetLabel: add a static label */
119#define AUDIT_MAC_UNLBL_STCDEL 1417 /* NetLabel: del a static label */
118 120
119#define AUDIT_FIRST_KERN_ANOM_MSG 1700 121#define AUDIT_FIRST_KERN_ANOM_MSG 1700
120#define AUDIT_LAST_KERN_ANOM_MSG 1799 122#define AUDIT_LAST_KERN_ANOM_MSG 1799
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 107787aacb64..85778a4b1209 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -103,7 +103,7 @@ struct clocksource {
103#define CLOCK_SOURCE_VALID_FOR_HRES 0x20 103#define CLOCK_SOURCE_VALID_FOR_HRES 0x20
104 104
105/* simplify initialization of mask field */ 105/* simplify initialization of mask field */
106#define CLOCKSOURCE_MASK(bits) (cycle_t)(bits<64 ? ((1ULL<<bits)-1) : -1) 106#define CLOCKSOURCE_MASK(bits) (cycle_t)((bits) < 64 ? ((1ULL<<(bits))-1) : -1)
107 107
108/** 108/**
109 * clocksource_khz2mult - calculates mult from khz and shift 109 * clocksource_khz2mult - calculates mult from khz and shift
@@ -215,6 +215,7 @@ static inline void clocksource_calculate_interval(struct clocksource *c,
215 215
216/* used to install a new clocksource */ 216/* used to install a new clocksource */
217extern int clocksource_register(struct clocksource*); 217extern int clocksource_register(struct clocksource*);
218extern void clocksource_unregister(struct clocksource*);
218extern struct clocksource* clocksource_get_next(void); 219extern struct clocksource* clocksource_get_next(void);
219extern void clocksource_change_rating(struct clocksource *cs, int rating); 220extern void clocksource_change_rating(struct clocksource *cs, int rating);
220extern void clocksource_resume(void); 221extern void clocksource_resume(void);
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 0e69d2cf14aa..d38655f2be70 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -191,6 +191,10 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
191 compat_ulong_t __user *outp, compat_ulong_t __user *exp, 191 compat_ulong_t __user *outp, compat_ulong_t __user *exp,
192 struct compat_timeval __user *tvp); 192 struct compat_timeval __user *tvp);
193 193
194asmlinkage long compat_sys_wait4(compat_pid_t pid,
195 compat_uint_t *stat_addr, int options,
196 struct compat_rusage *ru);
197
194#define BITS_PER_COMPAT_LONG (8*sizeof(compat_long_t)) 198#define BITS_PER_COMPAT_LONG (8*sizeof(compat_long_t))
195 199
196#define BITS_TO_COMPAT_LONGS(bits) \ 200#define BITS_TO_COMPAT_LONGS(bits) \
@@ -239,6 +243,17 @@ asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
239 compat_ulong_t maxnode, const compat_ulong_t __user *old_nodes, 243 compat_ulong_t maxnode, const compat_ulong_t __user *old_nodes,
240 const compat_ulong_t __user *new_nodes); 244 const compat_ulong_t __user *new_nodes);
241 245
246extern int compat_ptrace_request(struct task_struct *child,
247 compat_long_t request,
248 compat_ulong_t addr, compat_ulong_t data);
249
250#ifdef __ARCH_WANT_COMPAT_SYS_PTRACE
251extern long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
252 compat_ulong_t addr, compat_ulong_t data);
253asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
254 compat_long_t addr, compat_long_t data);
255#endif /* __ARCH_WANT_COMPAT_SYS_PTRACE */
256
242/* 257/*
243 * epoll (fs/eventpoll.c) compat bits follow ... 258 * epoll (fs/eventpoll.c) compat bits follow ...
244 */ 259 */
diff --git a/include/linux/const.h b/include/linux/const.h
index 07b300bfe34b..c22c707c455d 100644
--- a/include/linux/const.h
+++ b/include/linux/const.h
@@ -7,13 +7,18 @@
7 * C code. Therefore we cannot annotate them always with 7 * C code. Therefore we cannot annotate them always with
8 * 'UL' and other type specifiers unilaterally. We 8 * 'UL' and other type specifiers unilaterally. We
9 * use the following macros to deal with this. 9 * use the following macros to deal with this.
10 *
11 * Similarly, _AT() will cast an expression with a type in C, but
12 * leave it unchanged in asm.
10 */ 13 */
11 14
12#ifdef __ASSEMBLY__ 15#ifdef __ASSEMBLY__
13#define _AC(X,Y) X 16#define _AC(X,Y) X
17#define _AT(T,X) X
14#else 18#else
15#define __AC(X,Y) (X##Y) 19#define __AC(X,Y) (X##Y)
16#define _AC(X,Y) __AC(X,Y) 20#define _AC(X,Y) __AC(X,Y)
21#define _AT(T,X) ((T)(X))
17#endif 22#endif
18 23
19#endif /* !(_LINUX_CONST_H) */ 24#endif /* !(_LINUX_CONST_H) */
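
The _AC()/_AT() helpers above let one definition serve both C and assembly: _AC() appends a type suffix only in C, and _AT() applies a cast only in C. Typical uses look roughly like this (EXAMPLE_BASE and EXAMPLE_MASK are made-up names):

	#define EXAMPLE_BASE	_AC(0xffff810000000000, UL)	/* 0x...UL in C, bare constant in asm */
	#define EXAMPLE_MASK	_AT(u64, ~0)			/* ((u64)(~0)) in C, plain ~0 in asm */
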
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 85bd790c201e..7047f58306a7 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -218,8 +218,8 @@ int __first_cpu(const cpumask_t *srcp);
218int __next_cpu(int n, const cpumask_t *srcp); 218int __next_cpu(int n, const cpumask_t *srcp);
219#define next_cpu(n, src) __next_cpu((n), &(src)) 219#define next_cpu(n, src) __next_cpu((n), &(src))
220#else 220#else
221#define first_cpu(src) 0 221#define first_cpu(src) ({ (void)(src); 0; })
222#define next_cpu(n, src) 1 222#define next_cpu(n, src) ({ (void)(src); 1; })
223#endif 223#endif
224 224
225#define cpumask_of_cpu(cpu) \ 225#define cpumask_of_cpu(cpu) \
diff --git a/include/linux/device.h b/include/linux/device.h
index 1880208964d6..db375be333c7 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -84,6 +84,9 @@ int bus_for_each_dev(struct bus_type *bus, struct device *start, void *data,
84struct device *bus_find_device(struct bus_type *bus, struct device *start, 84struct device *bus_find_device(struct bus_type *bus, struct device *start,
85 void *data, 85 void *data,
86 int (*match)(struct device *dev, void *data)); 86 int (*match)(struct device *dev, void *data));
87struct device *bus_find_device_by_name(struct bus_type *bus,
88 struct device *start,
89 const char *name);
87 90
88int __must_check bus_for_each_drv(struct bus_type *bus, 91int __must_check bus_for_each_drv(struct bus_type *bus,
89 struct device_driver *start, void *data, 92 struct device_driver *start, void *data,
diff --git a/include/linux/elf.h b/include/linux/elf.h
index 576e83bd6d88..7ceb24d87c1a 100644
--- a/include/linux/elf.h
+++ b/include/linux/elf.h
@@ -355,6 +355,7 @@ typedef struct elf64_shdr {
355#define NT_AUXV 6 355#define NT_AUXV 6
356#define NT_PRXFPREG 0x46e62b7f /* copied from gdb5.1/include/elf/common.h */ 356#define NT_PRXFPREG 0x46e62b7f /* copied from gdb5.1/include/elf/common.h */
357#define NT_PPC_VMX 0x100 /* PowerPC Altivec/VMX registers */ 357#define NT_PPC_VMX 0x100 /* PowerPC Altivec/VMX registers */
358#define NT_386_TLS 0x200 /* i386 TLS slots (struct user_desc) */
358 359
359 360
360/* Note header in a PT_NOTE section */ 361/* Note header in a PT_NOTE section */
diff --git a/include/linux/hpet.h b/include/linux/hpet.h
index 707f7cb9e795..9cd94bfd07e5 100644
--- a/include/linux/hpet.h
+++ b/include/linux/hpet.h
@@ -64,7 +64,7 @@ struct hpet {
64 */ 64 */
65 65
66#define Tn_INT_ROUTE_CAP_MASK (0xffffffff00000000ULL) 66#define Tn_INT_ROUTE_CAP_MASK (0xffffffff00000000ULL)
67#define Tn_INI_ROUTE_CAP_SHIFT (32UL) 67#define Tn_INT_ROUTE_CAP_SHIFT (32UL)
68#define Tn_FSB_INT_DELCAP_MASK (0x8000UL) 68#define Tn_FSB_INT_DELCAP_MASK (0x8000UL)
69#define Tn_FSB_INT_DELCAP_SHIFT (15) 69#define Tn_FSB_INT_DELCAP_SHIFT (15)
70#define Tn_FSB_EN_CNF_MASK (0x4000UL) 70#define Tn_FSB_EN_CNF_MASK (0x4000UL)
@@ -115,9 +115,6 @@ static inline void hpet_reserve_timer(struct hpet_data *hd, int timer)
115} 115}
116 116
117int hpet_alloc(struct hpet_data *); 117int hpet_alloc(struct hpet_data *);
118int hpet_register(struct hpet_task *, int);
119int hpet_unregister(struct hpet_task *);
120int hpet_control(struct hpet_task *, unsigned int, unsigned long);
121 118
122#endif /* __KERNEL__ */ 119#endif /* __KERNEL__ */
123 120
diff --git a/include/linux/init_ohci1394_dma.h b/include/linux/init_ohci1394_dma.h
new file mode 100644
index 000000000000..3c03a4bba5e4
--- /dev/null
+++ b/include/linux/init_ohci1394_dma.h
@@ -0,0 +1,4 @@
1#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
2extern int __initdata init_ohci1394_dma_early;
3extern void __init init_ohci1394_dma_on_all_controllers(void);
4#endif
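
These declarations back the early OHCI-1394 remote-debugging support; the expected call pattern from the arch setup path looks roughly like this (sketch only, the exact call site is not shown in this hunk):

	#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
		if (init_ohci1394_dma_early)
			init_ohci1394_dma_on_all_controllers();
	#endif
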
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 6187a8567bc7..605d237364d2 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -8,6 +8,7 @@
8#ifndef _LINUX_IOPORT_H 8#ifndef _LINUX_IOPORT_H
9#define _LINUX_IOPORT_H 9#define _LINUX_IOPORT_H
10 10
11#ifndef __ASSEMBLY__
11#include <linux/compiler.h> 12#include <linux/compiler.h>
12#include <linux/types.h> 13#include <linux/types.h>
13/* 14/*
@@ -153,4 +154,5 @@ extern struct resource * __devm_request_region(struct device *dev,
153extern void __devm_release_region(struct device *dev, struct resource *parent, 154extern void __devm_release_region(struct device *dev, struct resource *parent,
154 resource_size_t start, resource_size_t n); 155 resource_size_t start, resource_size_t n);
155 156
157#endif /* __ASSEMBLY__ */
156#endif /* _LINUX_IOPORT_H */ 158#endif /* _LINUX_IOPORT_H */
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index a7283c9beadf..ff356b2ee478 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -194,6 +194,9 @@ static inline int log_buf_read(int idx) { return 0; }
194static inline int log_buf_copy(char *dest, int idx, int len) { return 0; } 194static inline int log_buf_copy(char *dest, int idx, int len) { return 0; }
195#endif 195#endif
196 196
197extern void __attribute__((format(printf, 1, 2)))
198 early_printk(const char *fmt, ...);
199
197unsigned long int_sqrt(unsigned long); 200unsigned long int_sqrt(unsigned long);
198 201
199extern int printk_ratelimit(void); 202extern int printk_ratelimit(void);
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 81891581e89b..6168c0a44172 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -182,6 +182,15 @@ static inline void kretprobe_assert(struct kretprobe_instance *ri,
182 } 182 }
183} 183}
184 184
185#ifdef CONFIG_KPROBES_SANITY_TEST
186extern int init_test_probes(void);
187#else
188static inline int init_test_probes(void)
189{
190 return 0;
191}
192#endif /* CONFIG_KPROBES_SANITY_TEST */
193
185extern spinlock_t kretprobe_lock; 194extern spinlock_t kretprobe_lock;
186extern struct mutex kprobe_mutex; 195extern struct mutex kprobe_mutex;
187extern int arch_prepare_kprobe(struct kprobe *p); 196extern int arch_prepare_kprobe(struct kprobe *p);
@@ -227,6 +236,7 @@ void unregister_kretprobe(struct kretprobe *rp);
227 236
228void kprobe_flush_task(struct task_struct *tk); 237void kprobe_flush_task(struct task_struct *tk);
229void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head); 238void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head);
239
230#else /* CONFIG_KPROBES */ 240#else /* CONFIG_KPROBES */
231 241
232#define __kprobes /**/ 242#define __kprobes /**/
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 057a7f34ee36..4de4fd2d8607 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -9,12 +9,10 @@
9 9
10#include <asm/types.h> 10#include <asm/types.h>
11#include <linux/ioctl.h> 11#include <linux/ioctl.h>
12#include <asm/kvm.h>
12 13
13#define KVM_API_VERSION 12 14#define KVM_API_VERSION 12
14 15
15/* Architectural interrupt line count. */
16#define KVM_NR_INTERRUPTS 256
17
18/* for KVM_CREATE_MEMORY_REGION */ 16/* for KVM_CREATE_MEMORY_REGION */
19struct kvm_memory_region { 17struct kvm_memory_region {
20 __u32 slot; 18 __u32 slot;
@@ -23,17 +21,19 @@ struct kvm_memory_region {
23 __u64 memory_size; /* bytes */ 21 __u64 memory_size; /* bytes */
24}; 22};
25 23
26/* for kvm_memory_region::flags */ 24/* for KVM_SET_USER_MEMORY_REGION */
27#define KVM_MEM_LOG_DIRTY_PAGES 1UL 25struct kvm_userspace_memory_region {
28 26 __u32 slot;
29struct kvm_memory_alias {
30 __u32 slot; /* this has a different namespace than memory slots */
31 __u32 flags; 27 __u32 flags;
32 __u64 guest_phys_addr; 28 __u64 guest_phys_addr;
33 __u64 memory_size; 29 __u64 memory_size; /* bytes */
34 __u64 target_phys_addr; 30 __u64 userspace_addr; /* start of the userspace allocated memory */
35}; 31};
36 32
33/* for kvm_memory_region::flags */
34#define KVM_MEM_LOG_DIRTY_PAGES 1UL
35
36
37/* for KVM_IRQ_LINE */ 37/* for KVM_IRQ_LINE */
38struct kvm_irq_level { 38struct kvm_irq_level {
39 /* 39 /*
@@ -45,62 +45,18 @@ struct kvm_irq_level {
45 __u32 level; 45 __u32 level;
46}; 46};
47 47
48/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */
49struct kvm_pic_state {
50 __u8 last_irr; /* edge detection */
51 __u8 irr; /* interrupt request register */
52 __u8 imr; /* interrupt mask register */
53 __u8 isr; /* interrupt service register */
54 __u8 priority_add; /* highest irq priority */
55 __u8 irq_base;
56 __u8 read_reg_select;
57 __u8 poll;
58 __u8 special_mask;
59 __u8 init_state;
60 __u8 auto_eoi;
61 __u8 rotate_on_auto_eoi;
62 __u8 special_fully_nested_mode;
63 __u8 init4; /* true if 4 byte init */
64 __u8 elcr; /* PIIX edge/trigger selection */
65 __u8 elcr_mask;
66};
67
68#define KVM_IOAPIC_NUM_PINS 24
69struct kvm_ioapic_state {
70 __u64 base_address;
71 __u32 ioregsel;
72 __u32 id;
73 __u32 irr;
74 __u32 pad;
75 union {
76 __u64 bits;
77 struct {
78 __u8 vector;
79 __u8 delivery_mode:3;
80 __u8 dest_mode:1;
81 __u8 delivery_status:1;
82 __u8 polarity:1;
83 __u8 remote_irr:1;
84 __u8 trig_mode:1;
85 __u8 mask:1;
86 __u8 reserve:7;
87 __u8 reserved[4];
88 __u8 dest_id;
89 } fields;
90 } redirtbl[KVM_IOAPIC_NUM_PINS];
91};
92
93#define KVM_IRQCHIP_PIC_MASTER 0
94#define KVM_IRQCHIP_PIC_SLAVE 1
95#define KVM_IRQCHIP_IOAPIC 2
96 48
97struct kvm_irqchip { 49struct kvm_irqchip {
98 __u32 chip_id; 50 __u32 chip_id;
99 __u32 pad; 51 __u32 pad;
100 union { 52 union {
101 char dummy[512]; /* reserving space */ 53 char dummy[512]; /* reserving space */
54#ifdef CONFIG_X86
102 struct kvm_pic_state pic; 55 struct kvm_pic_state pic;
56#endif
57#if defined(CONFIG_X86) || defined(CONFIG_IA64)
103 struct kvm_ioapic_state ioapic; 58 struct kvm_ioapic_state ioapic;
59#endif
104 } chip; 60 } chip;
105}; 61};
106 62
@@ -116,6 +72,7 @@ struct kvm_irqchip {
116#define KVM_EXIT_FAIL_ENTRY 9 72#define KVM_EXIT_FAIL_ENTRY 9
117#define KVM_EXIT_INTR 10 73#define KVM_EXIT_INTR 10
118#define KVM_EXIT_SET_TPR 11 74#define KVM_EXIT_SET_TPR 11
75#define KVM_EXIT_TPR_ACCESS 12
119 76
120/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ 77/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
121struct kvm_run { 78struct kvm_run {
@@ -174,90 +131,17 @@ struct kvm_run {
174 __u32 longmode; 131 __u32 longmode;
175 __u32 pad; 132 __u32 pad;
176 } hypercall; 133 } hypercall;
134 /* KVM_EXIT_TPR_ACCESS */
135 struct {
136 __u64 rip;
137 __u32 is_write;
138 __u32 pad;
139 } tpr_access;
177 /* Fix the size of the union. */ 140 /* Fix the size of the union. */
178 char padding[256]; 141 char padding[256];
179 }; 142 };
180}; 143};
181 144
182/* for KVM_GET_REGS and KVM_SET_REGS */
183struct kvm_regs {
184 /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
185 __u64 rax, rbx, rcx, rdx;
186 __u64 rsi, rdi, rsp, rbp;
187 __u64 r8, r9, r10, r11;
188 __u64 r12, r13, r14, r15;
189 __u64 rip, rflags;
190};
191
192/* for KVM_GET_FPU and KVM_SET_FPU */
193struct kvm_fpu {
194 __u8 fpr[8][16];
195 __u16 fcw;
196 __u16 fsw;
197 __u8 ftwx; /* in fxsave format */
198 __u8 pad1;
199 __u16 last_opcode;
200 __u64 last_ip;
201 __u64 last_dp;
202 __u8 xmm[16][16];
203 __u32 mxcsr;
204 __u32 pad2;
205};
206
207/* for KVM_GET_LAPIC and KVM_SET_LAPIC */
208#define KVM_APIC_REG_SIZE 0x400
209struct kvm_lapic_state {
210 char regs[KVM_APIC_REG_SIZE];
211};
212
213struct kvm_segment {
214 __u64 base;
215 __u32 limit;
216 __u16 selector;
217 __u8 type;
218 __u8 present, dpl, db, s, l, g, avl;
219 __u8 unusable;
220 __u8 padding;
221};
222
223struct kvm_dtable {
224 __u64 base;
225 __u16 limit;
226 __u16 padding[3];
227};
228
229/* for KVM_GET_SREGS and KVM_SET_SREGS */
230struct kvm_sregs {
231 /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */
232 struct kvm_segment cs, ds, es, fs, gs, ss;
233 struct kvm_segment tr, ldt;
234 struct kvm_dtable gdt, idt;
235 __u64 cr0, cr2, cr3, cr4, cr8;
236 __u64 efer;
237 __u64 apic_base;
238 __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
239};
240
241struct kvm_msr_entry {
242 __u32 index;
243 __u32 reserved;
244 __u64 data;
245};
246
247/* for KVM_GET_MSRS and KVM_SET_MSRS */
248struct kvm_msrs {
249 __u32 nmsrs; /* number of msrs in entries */
250 __u32 pad;
251
252 struct kvm_msr_entry entries[0];
253};
254
255/* for KVM_GET_MSR_INDEX_LIST */
256struct kvm_msr_list {
257 __u32 nmsrs; /* number of msrs in entries */
258 __u32 indices[0];
259};
260
261/* for KVM_TRANSLATE */ 145/* for KVM_TRANSLATE */
262struct kvm_translation { 146struct kvm_translation {
263 /* in */ 147 /* in */
@@ -302,28 +186,24 @@ struct kvm_dirty_log {
302 }; 186 };
303}; 187};
304 188
305struct kvm_cpuid_entry {
306 __u32 function;
307 __u32 eax;
308 __u32 ebx;
309 __u32 ecx;
310 __u32 edx;
311 __u32 padding;
312};
313
314/* for KVM_SET_CPUID */
315struct kvm_cpuid {
316 __u32 nent;
317 __u32 padding;
318 struct kvm_cpuid_entry entries[0];
319};
320
321/* for KVM_SET_SIGNAL_MASK */ 189/* for KVM_SET_SIGNAL_MASK */
322struct kvm_signal_mask { 190struct kvm_signal_mask {
323 __u32 len; 191 __u32 len;
324 __u8 sigset[0]; 192 __u8 sigset[0];
325}; 193};
326 194
195/* for KVM_TPR_ACCESS_REPORTING */
196struct kvm_tpr_access_ctl {
197 __u32 enabled;
198 __u32 flags;
199 __u32 reserved[8];
200};
201
202/* for KVM_SET_VAPIC_ADDR */
203struct kvm_vapic_addr {
204 __u64 vapic_addr;
205};
206
327#define KVMIO 0xAE 207#define KVMIO 0xAE
328 208
329/* 209/*
@@ -347,11 +227,21 @@ struct kvm_signal_mask {
347 */ 227 */
348#define KVM_CAP_IRQCHIP 0 228#define KVM_CAP_IRQCHIP 0
349#define KVM_CAP_HLT 1 229#define KVM_CAP_HLT 1
230#define KVM_CAP_MMU_SHADOW_CACHE_CONTROL 2
231#define KVM_CAP_USER_MEMORY 3
232#define KVM_CAP_SET_TSS_ADDR 4
233#define KVM_CAP_EXT_CPUID 5
234#define KVM_CAP_VAPIC 6
350 235
351/* 236/*
352 * ioctls for VM fds 237 * ioctls for VM fds
353 */ 238 */
354#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region) 239#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region)
240#define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44)
241#define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45)
242#define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46,\
243 struct kvm_userspace_memory_region)
244#define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47)
355/* 245/*
356 * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns 246 * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns
357 * a vcpu fd. 247 * a vcpu fd.
@@ -359,6 +249,7 @@ struct kvm_signal_mask {
359#define KVM_CREATE_VCPU _IO(KVMIO, 0x41) 249#define KVM_CREATE_VCPU _IO(KVMIO, 0x41)
360#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) 250#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log)
361#define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias) 251#define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias)
252#define KVM_GET_SUPPORTED_CPUID _IOWR(KVMIO, 0x48, struct kvm_cpuid2)
362/* Device model IOC */ 253/* Device model IOC */
363#define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60) 254#define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60)
364#define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level) 255#define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level)
@@ -384,5 +275,11 @@ struct kvm_signal_mask {
384#define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu) 275#define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu)
385#define KVM_GET_LAPIC _IOR(KVMIO, 0x8e, struct kvm_lapic_state) 276#define KVM_GET_LAPIC _IOR(KVMIO, 0x8e, struct kvm_lapic_state)
386#define KVM_SET_LAPIC _IOW(KVMIO, 0x8f, struct kvm_lapic_state) 277#define KVM_SET_LAPIC _IOW(KVMIO, 0x8f, struct kvm_lapic_state)
278#define KVM_SET_CPUID2 _IOW(KVMIO, 0x90, struct kvm_cpuid2)
279#define KVM_GET_CPUID2 _IOWR(KVMIO, 0x91, struct kvm_cpuid2)
280/* Available with KVM_CAP_VAPIC */
281#define KVM_TPR_ACCESS_REPORTING _IOWR(KVMIO, 0x92, struct kvm_tpr_access_ctl)
282/* Available with KVM_CAP_VAPIC */
283#define KVM_SET_VAPIC_ADDR _IOW(KVMIO, 0x93, struct kvm_vapic_addr)
387 284
388#endif 285#endif
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
new file mode 100644
index 000000000000..ea4764b0a2f4
--- /dev/null
+++ b/include/linux/kvm_host.h
@@ -0,0 +1,299 @@
1#ifndef __KVM_HOST_H
2#define __KVM_HOST_H
3
4/*
5 * This work is licensed under the terms of the GNU GPL, version 2. See
6 * the COPYING file in the top-level directory.
7 */
8
9#include <linux/types.h>
10#include <linux/hardirq.h>
11#include <linux/list.h>
12#include <linux/mutex.h>
13#include <linux/spinlock.h>
14#include <linux/signal.h>
15#include <linux/sched.h>
16#include <linux/mm.h>
17#include <linux/preempt.h>
18#include <asm/signal.h>
19
20#include <linux/kvm.h>
21#include <linux/kvm_para.h>
22
23#include <linux/kvm_types.h>
24
25#include <asm/kvm_host.h>
26
27#define KVM_MAX_VCPUS 4
28#define KVM_MEMORY_SLOTS 8
 29/* memory slots that are not exposed to userspace */
30#define KVM_PRIVATE_MEM_SLOTS 4
31
32#define KVM_PIO_PAGE_OFFSET 1
33
34/*
35 * vcpu->requests bit members
36 */
37#define KVM_REQ_TLB_FLUSH 0
38#define KVM_REQ_MIGRATE_TIMER 1
39#define KVM_REQ_REPORT_TPR_ACCESS 2
40
41struct kvm_vcpu;
42extern struct kmem_cache *kvm_vcpu_cache;
43
44struct kvm_guest_debug {
45 int enabled;
46 unsigned long bp[4];
47 int singlestep;
48};
49
50/*
51 * It would be nice to use something smarter than a linear search, TBD...
 52 * Thankfully we don't expect many devices to register (famous last words :),
 53 * so until then it will suffice. At least it's abstracted so we can change
54 * in one place.
55 */
56struct kvm_io_bus {
57 int dev_count;
58#define NR_IOBUS_DEVS 6
59 struct kvm_io_device *devs[NR_IOBUS_DEVS];
60};
61
62void kvm_io_bus_init(struct kvm_io_bus *bus);
63void kvm_io_bus_destroy(struct kvm_io_bus *bus);
64struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr);
65void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
66 struct kvm_io_device *dev);
67
68struct kvm_vcpu {
69 struct kvm *kvm;
70 struct preempt_notifier preempt_notifier;
71 int vcpu_id;
72 struct mutex mutex;
73 int cpu;
74 struct kvm_run *run;
75 int guest_mode;
76 unsigned long requests;
77 struct kvm_guest_debug guest_debug;
78 int fpu_active;
79 int guest_fpu_loaded;
80 wait_queue_head_t wq;
81 int sigset_active;
82 sigset_t sigset;
83 struct kvm_vcpu_stat stat;
84
85#ifdef CONFIG_HAS_IOMEM
86 int mmio_needed;
87 int mmio_read_completed;
88 int mmio_is_write;
89 int mmio_size;
90 unsigned char mmio_data[8];
91 gpa_t mmio_phys_addr;
92#endif
93
94 struct kvm_vcpu_arch arch;
95};
96
97struct kvm_memory_slot {
98 gfn_t base_gfn;
99 unsigned long npages;
100 unsigned long flags;
101 unsigned long *rmap;
102 unsigned long *dirty_bitmap;
103 unsigned long userspace_addr;
104 int user_alloc;
105};
106
107struct kvm {
108 struct mutex lock; /* protects the vcpus array and APIC accesses */
109 spinlock_t mmu_lock;
110 struct mm_struct *mm; /* userspace tied to this vm */
111 int nmemslots;
112 struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS +
113 KVM_PRIVATE_MEM_SLOTS];
114 struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
115 struct list_head vm_list;
116 struct file *filp;
117 struct kvm_io_bus mmio_bus;
118 struct kvm_io_bus pio_bus;
119 struct kvm_vm_stat stat;
120 struct kvm_arch arch;
121};
122
123/* The guest did something we don't support. */
124#define pr_unimpl(vcpu, fmt, ...) \
125 do { \
126 if (printk_ratelimit()) \
127 printk(KERN_ERR "kvm: %i: cpu%i " fmt, \
128 current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \
129 } while (0)
130
131#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
132#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
133
134int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
135void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
136
137void vcpu_load(struct kvm_vcpu *vcpu);
138void vcpu_put(struct kvm_vcpu *vcpu);
139
140void decache_vcpus_on_cpu(int cpu);
141
142
143int kvm_init(void *opaque, unsigned int vcpu_size,
144 struct module *module);
145void kvm_exit(void);
146
147#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
148#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
149static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
150struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva);
151
152extern struct page *bad_page;
153
154int is_error_page(struct page *page);
155int kvm_is_error_hva(unsigned long addr);
156int kvm_set_memory_region(struct kvm *kvm,
157 struct kvm_userspace_memory_region *mem,
158 int user_alloc);
159int __kvm_set_memory_region(struct kvm *kvm,
160 struct kvm_userspace_memory_region *mem,
161 int user_alloc);
162int kvm_arch_set_memory_region(struct kvm *kvm,
163 struct kvm_userspace_memory_region *mem,
164 struct kvm_memory_slot old,
165 int user_alloc);
166gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
167struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
168void kvm_release_page_clean(struct page *page);
169void kvm_release_page_dirty(struct page *page);
170int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
171 int len);
172int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
173 unsigned long len);
174int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
175int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
176 int offset, int len);
177int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
178 unsigned long len);
179int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
180int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
181struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
182int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
183void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
184
185void kvm_vcpu_block(struct kvm_vcpu *vcpu);
186void kvm_resched(struct kvm_vcpu *vcpu);
187void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
188void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
189void kvm_flush_remote_tlbs(struct kvm *kvm);
190
191long kvm_arch_dev_ioctl(struct file *filp,
192 unsigned int ioctl, unsigned long arg);
193long kvm_arch_vcpu_ioctl(struct file *filp,
194 unsigned int ioctl, unsigned long arg);
195void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
196void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
197
198int kvm_dev_ioctl_check_extension(long ext);
199
200int kvm_get_dirty_log(struct kvm *kvm,
201 struct kvm_dirty_log *log, int *is_dirty);
202int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
203 struct kvm_dirty_log *log);
204
205int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
206 struct
207 kvm_userspace_memory_region *mem,
208 int user_alloc);
209long kvm_arch_vm_ioctl(struct file *filp,
210 unsigned int ioctl, unsigned long arg);
211void kvm_arch_destroy_vm(struct kvm *kvm);
212
213int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
214int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
215
216int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
217 struct kvm_translation *tr);
218
219int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
220int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
221int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
222 struct kvm_sregs *sregs);
223int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
224 struct kvm_sregs *sregs);
225int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
226 struct kvm_debug_guest *dbg);
227int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
228
229int kvm_arch_init(void *opaque);
230void kvm_arch_exit(void);
231
232int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu);
233void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu);
234
235void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu);
236void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
237void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
238struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id);
239int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu);
240void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
241
242int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu);
243void kvm_arch_hardware_enable(void *garbage);
244void kvm_arch_hardware_disable(void *garbage);
245int kvm_arch_hardware_setup(void);
246void kvm_arch_hardware_unsetup(void);
247void kvm_arch_check_processor_compat(void *rtn);
248int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
249
250void kvm_free_physmem(struct kvm *kvm);
251
252struct kvm *kvm_arch_create_vm(void);
253void kvm_arch_destroy_vm(struct kvm *kvm);
254
255int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
256int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
257void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
258
259static inline void kvm_guest_enter(void)
260{
261 account_system_vtime(current);
262 current->flags |= PF_VCPU;
263}
264
265static inline void kvm_guest_exit(void)
266{
267 account_system_vtime(current);
268 current->flags &= ~PF_VCPU;
269}
270
271static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot)
272{
273 return slot - kvm->memslots;
274}
275
276static inline gpa_t gfn_to_gpa(gfn_t gfn)
277{
278 return (gpa_t)gfn << PAGE_SHIFT;
279}
280
281static inline void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
282{
283 set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests);
284}
285
286enum kvm_stat_kind {
287 KVM_STAT_VM,
288 KVM_STAT_VCPU,
289};
290
291struct kvm_stats_debugfs_item {
292 const char *name;
293 int offset;
294 enum kvm_stat_kind kind;
295 struct dentry *dentry;
296};
297extern struct kvm_stats_debugfs_item debugfs_entries[];
298
299#endif
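
A couple of the inline helpers declared above in use (purely illustrative values and context; the real callers sit in the KVM vcpu run path):

	gfn_t gfn = 0x1234;
	gpa_t gpa = gfn_to_gpa(gfn);	/* 0x1234 << PAGE_SHIFT */

	kvm_guest_enter();		/* account time and flag the task as running guest code */
	/* ... enter and run the guest ... */
	kvm_guest_exit();
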
diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h
index 3b292565a693..5497aac0d2f8 100644
--- a/include/linux/kvm_para.h
+++ b/include/linux/kvm_para.h
@@ -2,72 +2,30 @@
2#define __LINUX_KVM_PARA_H 2#define __LINUX_KVM_PARA_H
3 3
4/* 4/*
5 * Guest OS interface for KVM paravirtualization 5 * This header file provides a method for making a hypercall to the host
6 * 6 * Architectures should define:
7 * Note: this interface is totally experimental, and is certain to change 7 * - kvm_hypercall0, kvm_hypercall1...
8 * as we make progress. 8 * - kvm_arch_para_features
9 * - kvm_para_available
9 */ 10 */
10 11
11/* 12/* Return values for hypercalls */
12 * Per-VCPU descriptor area shared between guest and host. Writable to 13#define KVM_ENOSYS 1000
13 * both guest and host. Registered with the host by the guest when
14 * a guest acknowledges paravirtual mode.
15 *
16 * NOTE: all addresses are guest-physical addresses (gpa), to make it
17 * easier for the hypervisor to map between the various addresses.
18 */
19struct kvm_vcpu_para_state {
20 /*
21 * API version information for compatibility. If there's any support
22 * mismatch (too old host trying to execute too new guest) then
23 * the host will deny entry into paravirtual mode. Any other
24 * combination (new host + old guest and new host + new guest)
25 * is supposed to work - new host versions will support all old
26 * guest API versions.
27 */
28 u32 guest_version;
29 u32 host_version;
30 u32 size;
31 u32 ret;
32
33 /*
34 * The address of the vm exit instruction (VMCALL or VMMCALL),
35 * which the host will patch according to the CPU model the
36 * VM runs on:
37 */
38 u64 hypercall_gpa;
39
40} __attribute__ ((aligned(PAGE_SIZE)));
41
42#define KVM_PARA_API_VERSION 1
43
44/*
45 * This is used for an RDMSR's ECX parameter to probe for a KVM host.
46 * Hopefully no CPU vendor will use up this number. This is placed well
47 * out of way of the typical space occupied by CPU vendors' MSR indices,
48 * and we think (or at least hope) it wont be occupied in the future
49 * either.
50 */
51#define MSR_KVM_API_MAGIC 0x87655678
52 14
53#define KVM_EINVAL 1 15#define KVM_HC_VAPIC_POLL_IRQ 1
54 16
55/* 17/*
56 * Hypercall calling convention: 18 * hypercalls use architecture specific
57 *
58 * Each hypercall may have 0-6 parameters.
59 *
60 * 64-bit hypercall index is in RAX, goes from 0 to __NR_hypercalls-1
61 *
62 * 64-bit parameters 1-6 are in the standard gcc x86_64 calling convention
63 * order: RDI, RSI, RDX, RCX, R8, R9.
64 *
65 * 32-bit index is EBX, parameters are: EAX, ECX, EDX, ESI, EDI, EBP.
66 * (the first 3 are according to the gcc regparm calling convention)
67 *
68 * No registers are clobbered by the hypercall, except that the
69 * return value is in RAX.
70 */ 19 */
71#define __NR_hypercalls 0 20#include <asm/kvm_para.h>
21
22#ifdef __KERNEL__
23static inline int kvm_para_has_feature(unsigned int feature)
24{
25 if (kvm_arch_para_features() & (1UL << feature))
26 return 1;
27 return 0;
28}
29#endif /* __KERNEL__ */
30#endif /* __LINUX_KVM_PARA_H */
72 31
73#endif
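
With the slimmed-down kvm_para.h above, a guest probes for paravirtual support roughly like this (KVM_FEATURE_EXAMPLE and use_paravirt_path() are placeholders; no generic feature bits are defined in this header):

	if (kvm_para_available() &&
	    kvm_para_has_feature(KVM_FEATURE_EXAMPLE))
		use_paravirt_path();
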
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
new file mode 100644
index 000000000000..1c4e46decb22
--- /dev/null
+++ b/include/linux/kvm_types.h
@@ -0,0 +1,54 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License.
5 *
6 * This program is distributed in the hope that it will be useful,
7 * but WITHOUT ANY WARRANTY; without even the implied warranty of
8 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9 * GNU General Public License for more details.
10 *
11 * You should have received a copy of the GNU General Public License
12 * along with this program; if not, write to the Free Software
13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
14 *
15 */
16
17#ifndef __KVM_TYPES_H__
18#define __KVM_TYPES_H__
19
20#include <asm/types.h>
21
22/*
23 * Address types:
24 *
25 * gva - guest virtual address
26 * gpa - guest physical address
27 * gfn - guest frame number
28 * hva - host virtual address
29 * hpa - host physical address
30 * hfn - host frame number
31 */
32
33typedef unsigned long gva_t;
34typedef u64 gpa_t;
35typedef unsigned long gfn_t;
36
37typedef unsigned long hva_t;
38typedef u64 hpa_t;
39typedef unsigned long hfn_t;
40
41struct kvm_pio_request {
42 unsigned long count;
43 int cur_count;
44 struct page *guest_pages[2];
45 unsigned guest_page_offset;
46 int in;
47 int port;
48 int size;
49 int string;
50 int down;
51 int rep;
52};
53
54#endif /* __KVM_TYPES_H__ */
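
Purely for illustration (not in the patch), the gpa/gfn types above relate through the page size in the obvious way, assuming the usual PAGE_SHIFT definition; the helper names are made up:

static inline gfn_t example_gpa_to_gfn(gpa_t gpa)
{
	return (gfn_t)(gpa >> PAGE_SHIFT);	/* physical address -> frame number */
}

static inline gpa_t example_gfn_to_gpa(gfn_t gfn)
{
	return (gpa_t)gfn << PAGE_SHIFT;	/* frame number -> physical address */
}
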
diff --git a/include/linux/linkage.h b/include/linux/linkage.h
index ff203dd02919..3faf599ea58e 100644
--- a/include/linux/linkage.h
+++ b/include/linux/linkage.h
@@ -13,6 +13,10 @@
13#define asmlinkage CPP_ASMLINKAGE 13#define asmlinkage CPP_ASMLINKAGE
14#endif 14#endif
15 15
16#ifndef asmregparm
17# define asmregparm
18#endif
19
16#ifndef prevent_tail_call 20#ifndef prevent_tail_call
17# define prevent_tail_call(ret) do { } while (0) 21# define prevent_tail_call(ret) do { } while (0)
18#endif 22#endif
@@ -53,6 +57,10 @@
53 .size name, .-name 57 .size name, .-name
54#endif 58#endif
55 59
60/* If symbol 'name' is treated as a subroutine (gets called, and returns)
61 * then please use ENDPROC to mark 'name' as STT_FUNC for the benefit of
62 * static analysis tools such as stack depth analyzer.
63 */
56#ifndef ENDPROC 64#ifndef ENDPROC
57#define ENDPROC(name) \ 65#define ENDPROC(name) \
58 .type name, @function; \ 66 .type name, @function; \
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1897ca223eca..1bba6789a50a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1118,9 +1118,21 @@ static inline void vm_stat_account(struct mm_struct *mm,
1118} 1118}
1119#endif /* CONFIG_PROC_FS */ 1119#endif /* CONFIG_PROC_FS */
1120 1120
1121#ifndef CONFIG_DEBUG_PAGEALLOC 1121#ifdef CONFIG_DEBUG_PAGEALLOC
1122extern int debug_pagealloc_enabled;
1123
1124extern void kernel_map_pages(struct page *page, int numpages, int enable);
1125
1126static inline void enable_debug_pagealloc(void)
1127{
1128 debug_pagealloc_enabled = 1;
1129}
1130#else
1122static inline void 1131static inline void
1123kernel_map_pages(struct page *page, int numpages, int enable) {} 1132kernel_map_pages(struct page *page, int numpages, int enable) {}
1133static inline void enable_debug_pagealloc(void)
1134{
1135}
1124#endif 1136#endif
1125 1137
1126extern struct vm_area_struct *get_gate_vma(struct task_struct *tsk); 1138extern struct vm_area_struct *get_gate_vma(struct task_struct *tsk);
@@ -1146,6 +1158,7 @@ extern int randomize_va_space;
1146#endif 1158#endif
1147 1159
1148const char * arch_vma_name(struct vm_area_struct *vma); 1160const char * arch_vma_name(struct vm_area_struct *vma);
1161void print_vma_addr(char *prefix, unsigned long rip);
1149 1162
1150struct page *sparse_mem_map_populate(unsigned long pnum, int nid); 1163struct page *sparse_mem_map_populate(unsigned long pnum, int nid);
1151pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); 1164pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
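
A minimal sketch of how the DEBUG_PAGEALLOC hooks above are intended to be used; example_free_path() is hypothetical, and the real callers live in the page allocator and in early init code that calls enable_debug_pagealloc() once the command line has been parsed:

static void example_free_path(struct page *page, int numpages)
{
	/*
	 * Unmap the pages from the kernel linear mapping so a
	 * use-after-free faults immediately; this is a no-op stub
	 * when CONFIG_DEBUG_PAGEALLOC is not set.
	 */
	kernel_map_pages(page, numpages, 0);
}
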
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index c69531348363..41f6f28690f6 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2085,6 +2085,13 @@
2085#define PCI_VENDOR_ID_BELKIN 0x1799 2085#define PCI_VENDOR_ID_BELKIN 0x1799
2086#define PCI_DEVICE_ID_BELKIN_F5D7010V7 0x701f 2086#define PCI_DEVICE_ID_BELKIN_F5D7010V7 0x701f
2087 2087
2088#define PCI_VENDOR_ID_RDC 0x17f3
2089#define PCI_DEVICE_ID_RDC_R6020 0x6020
2090#define PCI_DEVICE_ID_RDC_R6030 0x6030
2091#define PCI_DEVICE_ID_RDC_R6040 0x6040
2092#define PCI_DEVICE_ID_RDC_R6060 0x6060
2093#define PCI_DEVICE_ID_RDC_R6061 0x6061
2094
2088#define PCI_VENDOR_ID_LENOVO 0x17aa 2095#define PCI_VENDOR_ID_LENOVO 0x17aa
2089 2096
2090#define PCI_VENDOR_ID_ARECA 0x17d3 2097#define PCI_VENDOR_ID_ARECA 0x17d3
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 926adaae0f96..00412bb494c4 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -9,6 +9,30 @@
9 9
10#include <asm/percpu.h> 10#include <asm/percpu.h>
11 11
12#ifndef PER_CPU_ATTRIBUTES
13#define PER_CPU_ATTRIBUTES
14#endif
15
16#ifdef CONFIG_SMP
17#define DEFINE_PER_CPU(type, name) \
18 __attribute__((__section__(".data.percpu"))) \
19 PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
20
21#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
22 __attribute__((__section__(".data.percpu.shared_aligned"))) \
23 PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name \
24 ____cacheline_aligned_in_smp
25#else
26#define DEFINE_PER_CPU(type, name) \
27 PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
28
29#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
30 DEFINE_PER_CPU(type, name)
31#endif
32
33#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
34#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
35
12/* Enough to cover all DEFINE_PER_CPUs in kernel, including modules. */ 36/* Enough to cover all DEFINE_PER_CPUs in kernel, including modules. */
13#ifndef PERCPU_ENOUGH_ROOM 37#ifndef PERCPU_ENOUGH_ROOM
14#ifdef CONFIG_MODULES 38#ifdef CONFIG_MODULES
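
As a usage sketch of the relocated per-cpu macros (the variable name is made up), a per-CPU counter is declared, bumped on the local CPU, and exported like this:

DEFINE_PER_CPU(unsigned long, example_hits);

static void example_count_hit(void)
{
	__get_cpu_var(example_hits)++;		/* this CPU's instance */
}

EXPORT_PER_CPU_SYMBOL(example_hits);
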
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 3ea5750a0f7e..515bff053de8 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -129,6 +129,81 @@ int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data);
129#define force_successful_syscall_return() do { } while (0) 129#define force_successful_syscall_return() do { } while (0)
130#endif 130#endif
131 131
132/*
133 * <asm/ptrace.h> should define the following things inside #ifdef __KERNEL__.
134 *
135 * These do-nothing inlines are used when the arch does not
136 * implement single-step. The kerneldoc comments are here
137 * to document the interface for all arch definitions.
138 */
139
140#ifndef arch_has_single_step
141/**
142 * arch_has_single_step - does this CPU support user-mode single-step?
143 *
144 * If this is defined, then there must be function declarations or
145 * inlines for user_enable_single_step() and user_disable_single_step().
146 * arch_has_single_step() should evaluate to nonzero iff the machine
147 * supports instruction single-step for user mode.
148 * It can be a constant or it can test a CPU feature bit.
149 */
150#define arch_has_single_step() (0)
151
152/**
153 * user_enable_single_step - single-step in user-mode task
154 * @task: either current or a task stopped in %TASK_TRACED
155 *
156 * This can only be called when arch_has_single_step() has returned nonzero.
157 * Set @task so that when it returns to user mode, it will trap after the
158 * next single instruction executes. If arch_has_block_step() is defined,
159 * this must clear the effects of user_enable_block_step() too.
160 */
161static inline void user_enable_single_step(struct task_struct *task)
162{
163 BUG(); /* This can never be called. */
164}
165
166/**
167 * user_disable_single_step - cancel user-mode single-step
168 * @task: either current or a task stopped in %TASK_TRACED
169 *
170 * Clear @task of the effects of user_enable_single_step() and
171 * user_enable_block_step(). This can be called whether or not either
172 * of those was ever called on @task, and even if arch_has_single_step()
173 * returned zero.
174 */
175static inline void user_disable_single_step(struct task_struct *task)
176{
177}
178#endif /* arch_has_single_step */
179
180#ifndef arch_has_block_step
181/**
182 * arch_has_block_step - does this CPU support user-mode block-step?
183 *
184 * If this is defined, then there must be a function declaration or inline
185 * for user_enable_block_step(), and arch_has_single_step() must be defined
186 * too. arch_has_block_step() should evaluate to nonzero iff the machine
187 * supports step-until-branch for user mode. It can be a constant or it
188 * can test a CPU feature bit.
189 */
190#define arch_has_block_step() (0)
191
192/**
193 * user_enable_block_step - step until branch in user-mode task
194 * @task: either current or a task stopped in %TASK_TRACED
195 *
196 * This can only be called when arch_has_block_step() has returned nonzero,
197 * and will never be called when single-instruction stepping is being used.
198 * Set @task so that when it returns to user mode, it will trap after the
199 * next branch or trap taken.
200 */
201static inline void user_enable_block_step(struct task_struct *task)
202{
203 BUG(); /* This can never be called. */
204}
205#endif /* arch_has_block_step */
206
132#endif 207#endif
133 208
134#endif 209#endif
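
A sketch (not from this patch) of how arch-independent ptrace code is expected to consume the interface documented above when resuming a traced child for PTRACE_SINGLESTEP; the function name is invented:

static int example_resume_singlestep(struct task_struct *child)
{
	if (!arch_has_single_step())
		return -EIO;			/* hardware cannot single-step */

	user_enable_single_step(child);		/* trap after one instruction */
	wake_up_process(child);
	return 0;
}
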
diff --git a/include/linux/regset.h b/include/linux/regset.h
new file mode 100644
index 000000000000..8abee6556223
--- /dev/null
+++ b/include/linux/regset.h
@@ -0,0 +1,368 @@
1/*
2 * User-mode machine state access
3 *
4 * Copyright (C) 2007 Red Hat, Inc. All rights reserved.
5 *
6 * This copyrighted material is made available to anyone wishing to use,
7 * modify, copy, or redistribute it subject to the terms and conditions
8 * of the GNU General Public License v.2.
9 *
10 * Red Hat Author: Roland McGrath.
11 */
12
13#ifndef _LINUX_REGSET_H
14#define _LINUX_REGSET_H 1
15
16#include <linux/compiler.h>
17#include <linux/types.h>
18#include <linux/uaccess.h>
19struct task_struct;
20struct user_regset;
21
22
23/**
24 * user_regset_active_fn - type of @active function in &struct user_regset
25 * @target: thread being examined
26 * @regset: regset being examined
27 *
28 * Return -%ENODEV if not available on the hardware found.
29 * Return %0 if no interesting state in this thread.
30 * Return >%0 number of @size units of interesting state.
31 * Any get call fetching state beyond that number will
32 * see the default initialization state for this data,
33 * so a caller that knows what the default state is need
34 * not copy it all out.
35 * This call is optional; the pointer is %NULL if there
36 * is no inexpensive check to yield a value < @n.
37 */
38typedef int user_regset_active_fn(struct task_struct *target,
39 const struct user_regset *regset);
40
41/**
42 * user_regset_get_fn - type of @get function in &struct user_regset
43 * @target: thread being examined
44 * @regset: regset being examined
45 * @pos: offset into the regset data to access, in bytes
46 * @count: amount of data to copy, in bytes
47 * @kbuf: if not %NULL, a kernel-space pointer to copy into
48 * @ubuf: if @kbuf is %NULL, a user-space pointer to copy into
49 *
50 * Fetch register values. Return %0 on success; -%EIO or -%ENODEV
51 * are usual failure returns. The @pos and @count values are in
52 * bytes, but must be properly aligned. If @kbuf is non-null, that
53 * buffer is used and @ubuf is ignored. If @kbuf is %NULL, then
54 * ubuf gives a userland pointer to access directly, and an -%EFAULT
55 * return value is possible.
56 */
57typedef int user_regset_get_fn(struct task_struct *target,
58 const struct user_regset *regset,
59 unsigned int pos, unsigned int count,
60 void *kbuf, void __user *ubuf);
61
62/**
63 * user_regset_set_fn - type of @set function in &struct user_regset
64 * @target: thread being examined
65 * @regset: regset being examined
66 * @pos: offset into the regset data to access, in bytes
67 * @count: amount of data to copy, in bytes
68 * @kbuf: if not %NULL, a kernel-space pointer to copy from
69 * @ubuf: if @kbuf is %NULL, a user-space pointer to copy from
70 *
71 * Store register values. Return %0 on success; -%EIO or -%ENODEV
72 * are usual failure returns. The @pos and @count values are in
73 * bytes, but must be properly aligned. If @kbuf is non-null, that
74 * buffer is used and @ubuf is ignored. If @kbuf is %NULL, then
75 * ubuf gives a userland pointer to access directly, and an -%EFAULT
76 * return value is possible.
77 */
78typedef int user_regset_set_fn(struct task_struct *target,
79 const struct user_regset *regset,
80 unsigned int pos, unsigned int count,
81 const void *kbuf, const void __user *ubuf);
82
83/**
84 * user_regset_writeback_fn - type of @writeback function in &struct user_regset
85 * @target: thread being examined
86 * @regset: regset being examined
87 * @immediate: zero if writeback at completion of next context switch is OK
88 *
89 * This call is optional; usually the pointer is %NULL. When
90 * provided, there is some user memory associated with this regset's
91 * hardware, such as memory backing cached register data on register
92 * window machines; the regset's data controls what user memory is
93 * used (e.g. via the stack pointer value).
94 *
95 * Write register data back to user memory. If the @immediate flag
96 * is nonzero, it must be written to the user memory so uaccess or
97 * access_process_vm() can see it when this call returns; if zero,
98 * then it must be written back by the time the task completes a
99 * context switch (as synchronized with wait_task_inactive()).
100 * Return %0 on success or if there was nothing to do, -%EFAULT for
101 * a memory problem (bad stack pointer or whatever), or -%EIO for a
102 * hardware problem.
103 */
104typedef int user_regset_writeback_fn(struct task_struct *target,
105 const struct user_regset *regset,
106 int immediate);
107
108/**
109 * struct user_regset - accessible thread CPU state
110 * @n: Number of slots (registers).
111 * @size: Size in bytes of a slot (register).
112 * @align: Required alignment, in bytes.
113 * @bias: Bias from natural indexing.
114 * @core_note_type: ELF note @n_type value used in core dumps.
115 * @get: Function to fetch values.
116 * @set: Function to store values.
117 * @active: Function to report if regset is active, or %NULL.
118 * @writeback: Function to write data back to user memory, or %NULL.
119 *
120 * This data structure describes a machine resource we call a register set.
121 * This is part of the state of an individual thread, not necessarily
122 * actual CPU registers per se. A register set consists of a number of
123 * similar slots, given by @n. Each slot is @size bytes, and aligned to
124 * @align bytes (which is at least @size).
125 *
126 * These functions must be called only on the current thread or on a
127 * thread that is in %TASK_STOPPED or %TASK_TRACED state, that we are
128 * guaranteed will not be woken up and return to user mode, and that we
129 * have called wait_task_inactive() on. (The target thread always might
130 * wake up for SIGKILL while these functions are working, in which case
131 * that thread's user_regset state might be scrambled.)
132 *
133 * The @pos argument must be aligned according to @align; the @count
134 * argument must be a multiple of @size. These functions are not
135 * responsible for checking for invalid arguments.
136 *
137 * When there is a natural value to use as an index, @bias gives the
138 * difference between the natural index and the slot index for the
139 * register set. For example, x86 GDT segment descriptors form a regset;
140 * the segment selector produces a natural index, but only a subset of
141 * that index space is available as a regset (the TLS slots); subtracting
142 * @bias from a segment selector index value computes the regset slot.
143 *
144 * If nonzero, @core_note_type gives the n_type field (NT_* value)
145 * of the core file note in which this regset's data appears.
146 * NT_PRSTATUS is a special case in that the regset data starts at
147 * offsetof(struct elf_prstatus, pr_reg) into the note data; that is
148 * part of the per-machine ELF formats userland knows about. In
149 * other cases, the core file note contains exactly the whole regset
150 * (@n * @size) and nothing else. The core file note is normally
151 * omitted when there is an @active function and it returns zero.
152 */
153struct user_regset {
154 user_regset_get_fn *get;
155 user_regset_set_fn *set;
156 user_regset_active_fn *active;
157 user_regset_writeback_fn *writeback;
158 unsigned int n;
159 unsigned int size;
160 unsigned int align;
161 unsigned int bias;
162 unsigned int core_note_type;
163};
164
165/**
166 * struct user_regset_view - available regsets
167 * @name: Identifier, e.g. UTS_MACHINE string.
168 * @regsets: Array of @n regsets available in this view.
169 * @n: Number of elements in @regsets.
170 * @e_machine: ELF header @e_machine %EM_* value written in core dumps.
171 * @e_flags: ELF header @e_flags value written in core dumps.
172 * @ei_osabi: ELF header @e_ident[%EI_OSABI] value written in core dumps.
173 *
174 * A regset view is a collection of regsets (&struct user_regset,
175 * above). This describes all the state of a thread that can be seen
176 * from a given architecture/ABI environment. More than one view might
177 * refer to the same &struct user_regset, or more than one regset
178 * might refer to the same machine-specific state in the thread. For
179 * example, a 32-bit thread's state could be examined from the 32-bit
180 * view or from the 64-bit view. Either method reaches the same thread
181 * register state, doing appropriate widening or truncation.
182 */
183struct user_regset_view {
184 const char *name;
185 const struct user_regset *regsets;
186 unsigned int n;
187 u32 e_flags;
188 u16 e_machine;
189 u8 ei_osabi;
190};
191
192/*
193 * This is documented here rather than at the definition sites because its
194 * implementation is machine-dependent but its interface is universal.
195 */
196/**
197 * task_user_regset_view - Return the process's native regset view.
198 * @tsk: a thread of the process in question
199 *
200 * Return the &struct user_regset_view that is native for the given process.
201 * For example, what it would access when it called ptrace().
202 * Throughout the life of the process, this only changes at exec.
203 */
204const struct user_regset_view *task_user_regset_view(struct task_struct *tsk);
205
206
207/*
208 * These are helpers for writing regset get/set functions in arch code.
209 * Because @start_pos and @end_pos are always compile-time constants,
210 * these are inlined into very little code though they look large.
211 *
212 * Use one or more calls sequentially for each chunk of regset data stored
213 * contiguously in memory. Call with constants for @start_pos and @end_pos,
214 * giving the range of byte positions in the regset that data corresponds
215 * to; @end_pos can be -1 if this chunk is at the end of the regset layout.
216 * Each call updates the arguments to point past its chunk.
217 */
218
219static inline int user_regset_copyout(unsigned int *pos, unsigned int *count,
220 void **kbuf,
221 void __user **ubuf, const void *data,
222 const int start_pos, const int end_pos)
223{
224 if (*count == 0)
225 return 0;
226 BUG_ON(*pos < start_pos);
227 if (end_pos < 0 || *pos < end_pos) {
228 unsigned int copy = (end_pos < 0 ? *count
229 : min(*count, end_pos - *pos));
230 data += *pos - start_pos;
231 if (*kbuf) {
232 memcpy(*kbuf, data, copy);
233 *kbuf += copy;
234 } else if (__copy_to_user(*ubuf, data, copy))
235 return -EFAULT;
236 else
237 *ubuf += copy;
238 *pos += copy;
239 *count -= copy;
240 }
241 return 0;
242}
243
244static inline int user_regset_copyin(unsigned int *pos, unsigned int *count,
245 const void **kbuf,
246 const void __user **ubuf, void *data,
247 const int start_pos, const int end_pos)
248{
249 if (*count == 0)
250 return 0;
251 BUG_ON(*pos < start_pos);
252 if (end_pos < 0 || *pos < end_pos) {
253 unsigned int copy = (end_pos < 0 ? *count
254 : min(*count, end_pos - *pos));
255 data += *pos - start_pos;
256 if (*kbuf) {
257 memcpy(data, *kbuf, copy);
258 *kbuf += copy;
259 } else if (__copy_from_user(data, *ubuf, copy))
260 return -EFAULT;
261 else
262 *ubuf += copy;
263 *pos += copy;
264 *count -= copy;
265 }
266 return 0;
267}
268
269/*
270 * These two parallel the two above, but for portions of a regset layout
271 * that always read as all-zero or for which writes are ignored.
272 */
273static inline int user_regset_copyout_zero(unsigned int *pos,
274 unsigned int *count,
275 void **kbuf, void __user **ubuf,
276 const int start_pos,
277 const int end_pos)
278{
279 if (*count == 0)
280 return 0;
281 BUG_ON(*pos < start_pos);
282 if (end_pos < 0 || *pos < end_pos) {
283 unsigned int copy = (end_pos < 0 ? *count
284 : min(*count, end_pos - *pos));
285 if (*kbuf) {
286 memset(*kbuf, 0, copy);
287 *kbuf += copy;
288 } else if (__clear_user(*ubuf, copy))
289 return -EFAULT;
290 else
291 *ubuf += copy;
292 *pos += copy;
293 *count -= copy;
294 }
295 return 0;
296}
297
298static inline int user_regset_copyin_ignore(unsigned int *pos,
299 unsigned int *count,
300 const void **kbuf,
301 const void __user **ubuf,
302 const int start_pos,
303 const int end_pos)
304{
305 if (*count == 0)
306 return 0;
307 BUG_ON(*pos < start_pos);
308 if (end_pos < 0 || *pos < end_pos) {
309 unsigned int copy = (end_pos < 0 ? *count
310 : min(*count, end_pos - *pos));
311 if (*kbuf)
312 *kbuf += copy;
313 else
314 *ubuf += copy;
315 *pos += copy;
316 *count -= copy;
317 }
318 return 0;
319}
320
321/**
322 * copy_regset_to_user - fetch a thread's user_regset data into user memory
323 * @target: thread to be examined
324 * @view: &struct user_regset_view describing user thread machine state
325 * @setno: index in @view->regsets
326 * @offset: offset into the regset data, in bytes
327 * @size: amount of data to copy, in bytes
328 * @data: user-mode pointer to copy into
329 */
330static inline int copy_regset_to_user(struct task_struct *target,
331 const struct user_regset_view *view,
332 unsigned int setno,
333 unsigned int offset, unsigned int size,
334 void __user *data)
335{
336 const struct user_regset *regset = &view->regsets[setno];
337
338 if (!access_ok(VERIFY_WRITE, data, size))
339 return -EIO;
340
341 return regset->get(target, regset, offset, size, NULL, data);
342}
343
344/**
345 * copy_regset_from_user - store into thread's user_regset data from user memory
346 * @target: thread to be examined
347 * @view: &struct user_regset_view describing user thread machine state
348 * @setno: index in @view->regsets
349 * @offset: offset into the regset data, in bytes
350 * @size: amount of data to copy, in bytes
351 * @data: user-mode pointer to copy from
352 */
353static inline int copy_regset_from_user(struct task_struct *target,
354 const struct user_regset_view *view,
355 unsigned int setno,
356 unsigned int offset, unsigned int size,
357 const void __user *data)
358{
359 const struct user_regset *regset = &view->regsets[setno];
360
361 if (!access_ok(VERIFY_READ, data, size))
362 return -EIO;
363
364 return regset->set(target, regset, offset, size, NULL, data);
365}
366
367
368#endif /* <linux/regset.h> */
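
To make the helper usage above concrete, here is a hedged sketch of an architecture ->get() handler built from user_regset_copyout(); struct example_regs and example_fetch_regs() are invented for illustration:

struct example_regs {
	unsigned long r[16];
};

static int example_regs_get(struct task_struct *target,
			    const struct user_regset *regset,
			    unsigned int pos, unsigned int count,
			    void *kbuf, void __user *ubuf)
{
	struct example_regs regs;

	example_fetch_regs(target, &regs);	/* fill from saved thread state */

	/* One contiguous chunk covering the whole regset layout. */
	return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
				   &regs, 0, sizeof(regs));
}
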
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2d0546e884ea..9d4797609aa5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1922,23 +1922,16 @@ extern int cond_resched_softirq(void);
1922 1922
1923/* 1923/*
1924 * Does a critical section need to be broken due to another 1924 * Does a critical section need to be broken due to another
1925 * task waiting?: 1925 * task waiting?: (technically does not depend on CONFIG_PREEMPT,
1926 * but a general need for low latency)
1926 */ 1927 */
1927#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) 1928static inline int spin_needbreak(spinlock_t *lock)
1928# define need_lockbreak(lock) ((lock)->break_lock)
1929#else
1930# define need_lockbreak(lock) 0
1931#endif
1932
1933/*
1934 * Does a critical section need to be broken due to another
1935 * task waiting or preemption being signalled:
1936 */
1937static inline int lock_need_resched(spinlock_t *lock)
1938{ 1929{
1939 if (need_lockbreak(lock) || need_resched()) 1930#ifdef CONFIG_PREEMPT
1940 return 1; 1931 return spin_is_contended(lock);
1932#else
1941 return 0; 1933 return 0;
1934#endif
1942} 1935}
1943 1936
1944/* 1937/*
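
The new spin_needbreak() helper is aimed at long lock-holding loops that want to yield the lock to a contended waiter; a sketch of the pattern follows, with the example_* helpers being hypothetical:

static void example_long_scan(spinlock_t *lock)
{
	spin_lock(lock);
	while (example_more_work()) {
		example_process_one_item();
		if (spin_needbreak(lock)) {
			/* someone is spinning on the lock: let them in */
			spin_unlock(lock);
			cpu_relax();
			spin_lock(lock);
		}
	}
	spin_unlock(lock);
}
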
diff --git a/include/linux/selinux.h b/include/linux/selinux.h
index 6080f73fc85f..8c2cc4c02526 100644
--- a/include/linux/selinux.h
+++ b/include/linux/selinux.h
@@ -120,16 +120,35 @@ void selinux_get_task_sid(struct task_struct *tsk, u32 *sid);
120int selinux_string_to_sid(char *str, u32 *sid); 120int selinux_string_to_sid(char *str, u32 *sid);
121 121
122/** 122/**
123 * selinux_relabel_packet_permission - check permission to relabel a packet 123 * selinux_secmark_relabel_packet_permission - secmark permission check
124 * @sid: ID value to be applied to network packet (via SECMARK, most likely) 124 * @sid: SECMARK ID value to be applied to network packet
125 * 125 *
126 * Returns 0 if the current task is allowed to label packets with the 126 * Returns 0 if the current task is allowed to set the SECMARK label of
127 * supplied security ID. Note that it is implicit that the packet is always 127 * packets with the supplied security ID. Note that it is implicit that
128 * being relabeled from the default unlabled value, and that the access 128 * the packet is always being relabeled from the default unlabeled value,
129 * control decision is made in the AVC. 129 * and that the access control decision is made in the AVC.
130 */ 130 */
131int selinux_relabel_packet_permission(u32 sid); 131int selinux_secmark_relabel_packet_permission(u32 sid);
132 132
133/**
134 * selinux_secmark_refcount_inc - increments the secmark use counter
135 *
136 * SELinux keeps track of the current SECMARK targets in use so it knows
137 * when to apply SECMARK label access checks to network packets. This
138 * function increments this reference count to indicate that a new SECMARK
139 * target has been configured.
140 */
141void selinux_secmark_refcount_inc(void);
142
143/**
144 * selinux_secmark_refcount_dec - decrements the secmark use counter
145 *
146 * SELinux keeps track of the current SECMARK targets in use so it knows
147 * when to apply SECMARK label access checks to network packets. This
148 * function decrements this reference count to indicate that one of the
149 * existing SECMARK targets has been removed/flushed.
150 */
151void selinux_secmark_refcount_dec(void);
133#else 152#else
134 153
135static inline int selinux_audit_rule_init(u32 field, u32 op, 154static inline int selinux_audit_rule_init(u32 field, u32 op,
@@ -184,11 +203,21 @@ static inline int selinux_string_to_sid(const char *str, u32 *sid)
184 return 0; 203 return 0;
185} 204}
186 205
187static inline int selinux_relabel_packet_permission(u32 sid) 206static inline int selinux_secmark_relabel_packet_permission(u32 sid)
188{ 207{
189 return 0; 208 return 0;
190} 209}
191 210
211static inline void selinux_secmark_refcount_inc(void)
212{
213 return;
214}
215
216static inline void selinux_secmark_refcount_dec(void)
217{
218 return;
219}
220
192#endif /* CONFIG_SECURITY_SELINUX */ 221#endif /* CONFIG_SECURITY_SELINUX */
193 222
194#endif /* _LINUX_SELINUX_H */ 223#endif /* _LINUX_SELINUX_H */
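
A sketch of how the new SECMARK hooks above are meant to be paired by a packet-labeling target; the example_* wrappers are hypothetical:

static int example_secmark_target_add(u32 secid)
{
	int err;

	err = selinux_secmark_relabel_packet_permission(secid);
	if (err)
		return err;

	selinux_secmark_refcount_inc();		/* one more active target */
	return 0;
}

static void example_secmark_target_remove(void)
{
	selinux_secmark_refcount_dec();		/* target removed/flushed */
}
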
diff --git a/include/linux/smp.h b/include/linux/smp.h
index c25e66bcecf3..55232ccf9cfd 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -78,6 +78,8 @@ int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait);
78 */ 78 */
79void smp_prepare_boot_cpu(void); 79void smp_prepare_boot_cpu(void);
80 80
81extern unsigned int setup_max_cpus;
82
81#else /* !SMP */ 83#else /* !SMP */
82 84
83/* 85/*
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index c376f3b36c89..124449733c55 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -120,6 +120,12 @@ do { \
120 120
121#define spin_is_locked(lock) __raw_spin_is_locked(&(lock)->raw_lock) 121#define spin_is_locked(lock) __raw_spin_is_locked(&(lock)->raw_lock)
122 122
123#ifdef CONFIG_GENERIC_LOCKBREAK
124#define spin_is_contended(lock) ((lock)->break_lock)
125#else
126#define spin_is_contended(lock) __raw_spin_is_contended(&(lock)->raw_lock)
127#endif
128
123/** 129/**
124 * spin_unlock_wait - wait until the spinlock gets unlocked 130 * spin_unlock_wait - wait until the spinlock gets unlocked
125 * @lock: the spinlock in question. 131 * @lock: the spinlock in question.
diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h
index f6a3a951b79e..68d88f71f1a2 100644
--- a/include/linux/spinlock_types.h
+++ b/include/linux/spinlock_types.h
@@ -19,7 +19,7 @@
19 19
20typedef struct { 20typedef struct {
21 raw_spinlock_t raw_lock; 21 raw_spinlock_t raw_lock;
22#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) 22#ifdef CONFIG_GENERIC_LOCKBREAK
23 unsigned int break_lock; 23 unsigned int break_lock;
24#endif 24#endif
25#ifdef CONFIG_DEBUG_SPINLOCK 25#ifdef CONFIG_DEBUG_SPINLOCK
@@ -35,7 +35,7 @@ typedef struct {
35 35
36typedef struct { 36typedef struct {
37 raw_rwlock_t raw_lock; 37 raw_rwlock_t raw_lock;
38#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) 38#ifdef CONFIG_GENERIC_LOCKBREAK
39 unsigned int break_lock; 39 unsigned int break_lock;
40#endif 40#endif
41#ifdef CONFIG_DEBUG_SPINLOCK 41#ifdef CONFIG_DEBUG_SPINLOCK
diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h
index ea54c4c9a4ec..938234c4a996 100644
--- a/include/linux/spinlock_up.h
+++ b/include/linux/spinlock_up.h
@@ -64,6 +64,8 @@ static inline void __raw_spin_unlock(raw_spinlock_t *lock)
64# define __raw_spin_trylock(lock) ({ (void)(lock); 1; }) 64# define __raw_spin_trylock(lock) ({ (void)(lock); 1; })
65#endif /* DEBUG_SPINLOCK */ 65#endif /* DEBUG_SPINLOCK */
66 66
67#define __raw_spin_is_contended(lock) (((void)(lock), 0))
68
67#define __raw_read_can_lock(lock) (((void)(lock), 1)) 69#define __raw_read_can_lock(lock) (((void)(lock), 1))
68#define __raw_write_can_lock(lock) (((void)(lock), 1)) 70#define __raw_write_can_lock(lock) (((void)(lock), 1))
69 71
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 4360e0816956..40280df2a3db 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -211,9 +211,6 @@ static inline int hibernate(void) { return -ENOSYS; }
211#ifdef CONFIG_PM_SLEEP 211#ifdef CONFIG_PM_SLEEP
212void save_processor_state(void); 212void save_processor_state(void);
213void restore_processor_state(void); 213void restore_processor_state(void);
214struct saved_context;
215void __save_processor_state(struct saved_context *ctxt);
216void __restore_processor_state(struct saved_context *ctxt);
217 214
218/* kernel/power/main.c */ 215/* kernel/power/main.c */
219extern struct blocking_notifier_head pm_chain_head; 216extern struct blocking_notifier_head pm_chain_head;
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 4f3838adbb30..2c3ce4c69b25 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -6,6 +6,7 @@
6#include <linux/mmzone.h> 6#include <linux/mmzone.h>
7#include <linux/list.h> 7#include <linux/list.h>
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/pagemap.h>
9 10
10#include <asm/atomic.h> 11#include <asm/atomic.h>
11#include <asm/page.h> 12#include <asm/page.h>
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 9c4ad755d7e5..dfbdfb9836f4 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -42,27 +42,27 @@ extern long do_no_restart_syscall(struct restart_block *parm);
42 42
43static inline void set_ti_thread_flag(struct thread_info *ti, int flag) 43static inline void set_ti_thread_flag(struct thread_info *ti, int flag)
44{ 44{
45 set_bit(flag,&ti->flags); 45 set_bit(flag, (unsigned long *)&ti->flags);
46} 46}
47 47
48static inline void clear_ti_thread_flag(struct thread_info *ti, int flag) 48static inline void clear_ti_thread_flag(struct thread_info *ti, int flag)
49{ 49{
50 clear_bit(flag,&ti->flags); 50 clear_bit(flag, (unsigned long *)&ti->flags);
51} 51}
52 52
53static inline int test_and_set_ti_thread_flag(struct thread_info *ti, int flag) 53static inline int test_and_set_ti_thread_flag(struct thread_info *ti, int flag)
54{ 54{
55 return test_and_set_bit(flag,&ti->flags); 55 return test_and_set_bit(flag, (unsigned long *)&ti->flags);
56} 56}
57 57
58static inline int test_and_clear_ti_thread_flag(struct thread_info *ti, int flag) 58static inline int test_and_clear_ti_thread_flag(struct thread_info *ti, int flag)
59{ 59{
60 return test_and_clear_bit(flag,&ti->flags); 60 return test_and_clear_bit(flag, (unsigned long *)&ti->flags);
61} 61}
62 62
63static inline int test_ti_thread_flag(struct thread_info *ti, int flag) 63static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
64{ 64{
65 return test_bit(flag,&ti->flags); 65 return test_bit(flag, (unsigned long *)&ti->flags);
66} 66}
67 67
68#define set_thread_flag(flag) \ 68#define set_thread_flag(flag) \
diff --git a/include/linux/tick.h b/include/linux/tick.h
index f4a1395e05ff..0fadf95debe1 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -51,8 +51,10 @@ struct tick_sched {
51 unsigned long idle_jiffies; 51 unsigned long idle_jiffies;
52 unsigned long idle_calls; 52 unsigned long idle_calls;
53 unsigned long idle_sleeps; 53 unsigned long idle_sleeps;
54 int idle_active;
54 ktime_t idle_entrytime; 55 ktime_t idle_entrytime;
55 ktime_t idle_sleeptime; 56 ktime_t idle_sleeptime;
57 ktime_t idle_lastupdate;
56 ktime_t sleep_length; 58 ktime_t sleep_length;
57 unsigned long last_jiffies; 59 unsigned long last_jiffies;
58 unsigned long next_jiffies; 60 unsigned long next_jiffies;
@@ -103,6 +105,8 @@ extern void tick_nohz_stop_sched_tick(void);
103extern void tick_nohz_restart_sched_tick(void); 105extern void tick_nohz_restart_sched_tick(void);
104extern void tick_nohz_update_jiffies(void); 106extern void tick_nohz_update_jiffies(void);
105extern ktime_t tick_nohz_get_sleep_length(void); 107extern ktime_t tick_nohz_get_sleep_length(void);
108extern void tick_nohz_stop_idle(int cpu);
109extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
106# else 110# else
107static inline void tick_nohz_stop_sched_tick(void) { } 111static inline void tick_nohz_stop_sched_tick(void) { }
108static inline void tick_nohz_restart_sched_tick(void) { } 112static inline void tick_nohz_restart_sched_tick(void) { }
@@ -113,6 +117,8 @@ static inline ktime_t tick_nohz_get_sleep_length(void)
113 117
114 return len; 118 return len;
115} 119}
120static inline void tick_nohz_stop_idle(int cpu) { }
121static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return 0; }
116# endif /* !NO_HZ */ 122# endif /* !NO_HZ */
117 123
118#endif 124#endif
diff --git a/include/linux/timer.h b/include/linux/timer.h
index 78cf899b4409..de0e71359ede 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -5,7 +5,7 @@
5#include <linux/ktime.h> 5#include <linux/ktime.h>
6#include <linux/stddef.h> 6#include <linux/stddef.h>
7 7
8struct tvec_t_base_s; 8struct tvec_base;
9 9
10struct timer_list { 10struct timer_list {
11 struct list_head entry; 11 struct list_head entry;
@@ -14,7 +14,7 @@ struct timer_list {
14 void (*function)(unsigned long); 14 void (*function)(unsigned long);
15 unsigned long data; 15 unsigned long data;
16 16
17 struct tvec_t_base_s *base; 17 struct tvec_base *base;
18#ifdef CONFIG_TIMER_STATS 18#ifdef CONFIG_TIMER_STATS
19 void *start_site; 19 void *start_site;
20 char start_comm[16]; 20 char start_comm[16];
@@ -22,7 +22,7 @@ struct timer_list {
22#endif 22#endif
23}; 23};
24 24
25extern struct tvec_t_base_s boot_tvec_bases; 25extern struct tvec_base boot_tvec_bases;
26 26
27#define TIMER_INITIALIZER(_function, _expires, _data) { \ 27#define TIMER_INITIALIZER(_function, _expires, _data) { \
28 .function = (_function), \ 28 .function = (_function), \
diff --git a/include/net/netlabel.h b/include/net/netlabel.h
index 2e5b2f6f9fa0..b3213c7c5309 100644
--- a/include/net/netlabel.h
+++ b/include/net/netlabel.h
@@ -67,7 +67,11 @@
67 * NetLabel NETLINK protocol 67 * NetLabel NETLINK protocol
68 */ 68 */
69 69
70#define NETLBL_PROTO_VERSION 1 70/* NetLabel NETLINK protocol version
71 * 1: initial version
72 * 2: added static labels for unlabeled connections
73 */
74#define NETLBL_PROTO_VERSION 2
71 75
72/* NetLabel NETLINK types/families */ 76/* NetLabel NETLINK types/families */
73#define NETLBL_NLTYPE_NONE 0 77#define NETLBL_NLTYPE_NONE 0
@@ -105,17 +109,49 @@ struct netlbl_dom_map;
105/* Domain mapping operations */ 109/* Domain mapping operations */
106int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info); 110int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info);
107 111
108/* LSM security attributes */ 112/*
113 * LSM security attributes
114 */
115
116/**
117 * struct netlbl_lsm_cache - NetLabel LSM security attribute cache
118 * @refcount: atomic reference counter
119 * @free: LSM supplied function to free the cache data
120 * @data: LSM supplied cache data
121 *
122 * Description:
123 * This structure is provided for LSMs which wish to make use of the NetLabel
124 * caching mechanism to store LSM specific data/attributes in the NetLabel
125 * cache. If the LSM has to perform a lot of translation from the NetLabel
126 * security attributes into its own internal representation then the cache
127 * mechanism can provide a way to eliminate some or all of that translation
128 * overhead on a cache hit.
129 *
130 */
109struct netlbl_lsm_cache { 131struct netlbl_lsm_cache {
110 atomic_t refcount; 132 atomic_t refcount;
111 void (*free) (const void *data); 133 void (*free) (const void *data);
112 void *data; 134 void *data;
113}; 135};
114/* The catmap bitmap field MUST be a power of two in length and large 136
137/**
138 * struct netlbl_lsm_secattr_catmap - NetLabel LSM secattr category bitmap
139 * @startbit: the value of the lowest order bit in the bitmap
140 * @bitmap: the category bitmap
141 * @next: pointer to the next bitmap "node" or NULL
142 *
143 * Description:
144 * This structure is used to represent category bitmaps. Due to the large
145 * number of categories supported by most labeling protocols it is not
146 * practical to transfer a full bitmap internally so NetLabel adopts a sparse
147 * bitmap structure modeled after SELinux's ebitmap structure.
148 * The catmap bitmap field MUST be a power of two in length and large
115 * enough to hold at least 240 bits. Special care (i.e. check the code!) 149 * enough to hold at least 240 bits. Special care (i.e. check the code!)
116 * should be used when changing these values as the LSM implementation 150 * should be used when changing these values as the LSM implementation
117 * probably has functions which rely on the sizes of these types to speed 151 * probably has functions which rely on the sizes of these types to speed
118 * processing. */ 152 * processing.
153 *
154 */
119#define NETLBL_CATMAP_MAPTYPE u64 155#define NETLBL_CATMAP_MAPTYPE u64
120#define NETLBL_CATMAP_MAPCNT 4 156#define NETLBL_CATMAP_MAPCNT 4
121#define NETLBL_CATMAP_MAPSIZE (sizeof(NETLBL_CATMAP_MAPTYPE) * 8) 157#define NETLBL_CATMAP_MAPSIZE (sizeof(NETLBL_CATMAP_MAPTYPE) * 8)
@@ -127,22 +163,48 @@ struct netlbl_lsm_secattr_catmap {
127 NETLBL_CATMAP_MAPTYPE bitmap[NETLBL_CATMAP_MAPCNT]; 163 NETLBL_CATMAP_MAPTYPE bitmap[NETLBL_CATMAP_MAPCNT];
128 struct netlbl_lsm_secattr_catmap *next; 164 struct netlbl_lsm_secattr_catmap *next;
129}; 165};
166
167/**
168 * struct netlbl_lsm_secattr - NetLabel LSM security attributes
169 * @flags: indicate which attributes are contained in this structure
170 * @type: indicate the NLTYPE of the attributes
171 * @domain: the NetLabel LSM domain
172 * @cache: NetLabel LSM specific cache
173 * @attr.mls: MLS sensitivity label
174 * @attr.mls.cat: MLS category bitmap
175 * @attr.mls.lvl: MLS sensitivity level
176 * @attr.secid: LSM specific secid token
177 *
178 * Description:
179 * This structure is used to pass security attributes between NetLabel and the
180 * LSM modules. The flags field is used to specify which fields within the
181 * struct are valid and valid values can be created by bitwise OR'ing the
182 * NETLBL_SECATTR_* defines. The domain field is typically set by the LSM to
183 * specify domain specific configuration settings and is not usually used by
184 * NetLabel itself when returning security attributes to the LSM.
185 *
186 */
130#define NETLBL_SECATTR_NONE 0x00000000 187#define NETLBL_SECATTR_NONE 0x00000000
131#define NETLBL_SECATTR_DOMAIN 0x00000001 188#define NETLBL_SECATTR_DOMAIN 0x00000001
132#define NETLBL_SECATTR_CACHE 0x00000002 189#define NETLBL_SECATTR_CACHE 0x00000002
133#define NETLBL_SECATTR_MLS_LVL 0x00000004 190#define NETLBL_SECATTR_MLS_LVL 0x00000004
134#define NETLBL_SECATTR_MLS_CAT 0x00000008 191#define NETLBL_SECATTR_MLS_CAT 0x00000008
192#define NETLBL_SECATTR_SECID 0x00000010
135#define NETLBL_SECATTR_CACHEABLE (NETLBL_SECATTR_MLS_LVL | \ 193#define NETLBL_SECATTR_CACHEABLE (NETLBL_SECATTR_MLS_LVL | \
136 NETLBL_SECATTR_MLS_CAT) 194 NETLBL_SECATTR_MLS_CAT | \
195 NETLBL_SECATTR_SECID)
137struct netlbl_lsm_secattr { 196struct netlbl_lsm_secattr {
138 u32 flags; 197 u32 flags;
139 198 u32 type;
140 char *domain; 199 char *domain;
141
142 u32 mls_lvl;
143 struct netlbl_lsm_secattr_catmap *mls_cat;
144
145 struct netlbl_lsm_cache *cache; 200 struct netlbl_lsm_cache *cache;
201 union {
202 struct {
203 struct netlbl_lsm_secattr_catmap *cat;
204 u32 lvl;
205 } mls;
206 u32 secid;
207 } attr;
146}; 208};
147 209
148/* 210/*
@@ -231,10 +293,7 @@ static inline void netlbl_secattr_catmap_free(
231 */ 293 */
232static inline void netlbl_secattr_init(struct netlbl_lsm_secattr *secattr) 294static inline void netlbl_secattr_init(struct netlbl_lsm_secattr *secattr)
233{ 295{
234 secattr->flags = 0; 296 memset(secattr, 0, sizeof(*secattr));
235 secattr->domain = NULL;
236 secattr->mls_cat = NULL;
237 secattr->cache = NULL;
238} 297}
239 298
240/** 299/**
@@ -248,11 +307,11 @@ static inline void netlbl_secattr_init(struct netlbl_lsm_secattr *secattr)
248 */ 307 */
249static inline void netlbl_secattr_destroy(struct netlbl_lsm_secattr *secattr) 308static inline void netlbl_secattr_destroy(struct netlbl_lsm_secattr *secattr)
250{ 309{
251 if (secattr->cache)
252 netlbl_secattr_cache_free(secattr->cache);
253 kfree(secattr->domain); 310 kfree(secattr->domain);
254 if (secattr->mls_cat) 311 if (secattr->flags & NETLBL_SECATTR_CACHE)
255 netlbl_secattr_catmap_free(secattr->mls_cat); 312 netlbl_secattr_cache_free(secattr->cache);
313 if (secattr->flags & NETLBL_SECATTR_MLS_CAT)
314 netlbl_secattr_catmap_free(secattr->attr.mls.cat);
256} 315}
257 316
258/** 317/**
@@ -300,7 +359,7 @@ int netlbl_secattr_catmap_setrng(struct netlbl_lsm_secattr_catmap *catmap,
300 gfp_t flags); 359 gfp_t flags);
301 360
302/* 361/*
303 * LSM protocol operations 362 * LSM protocol operations (NetLabel LSM/kernel API)
304 */ 363 */
305int netlbl_enabled(void); 364int netlbl_enabled(void);
306int netlbl_sock_setattr(struct sock *sk, 365int netlbl_sock_setattr(struct sock *sk,
@@ -308,6 +367,7 @@ int netlbl_sock_setattr(struct sock *sk,
308int netlbl_sock_getattr(struct sock *sk, 367int netlbl_sock_getattr(struct sock *sk,
309 struct netlbl_lsm_secattr *secattr); 368 struct netlbl_lsm_secattr *secattr);
310int netlbl_skbuff_getattr(const struct sk_buff *skb, 369int netlbl_skbuff_getattr(const struct sk_buff *skb,
370 u16 family,
311 struct netlbl_lsm_secattr *secattr); 371 struct netlbl_lsm_secattr *secattr);
312void netlbl_skbuff_err(struct sk_buff *skb, int error); 372void netlbl_skbuff_err(struct sk_buff *skb, int error);
313 373
@@ -360,6 +420,7 @@ static inline int netlbl_sock_getattr(struct sock *sk,
360 return -ENOSYS; 420 return -ENOSYS;
361} 421}
362static inline int netlbl_skbuff_getattr(const struct sk_buff *skb, 422static inline int netlbl_skbuff_getattr(const struct sk_buff *skb,
423 u16 family,
363 struct netlbl_lsm_secattr *secattr) 424 struct netlbl_lsm_secattr *secattr)
364{ 425{
365 return -ENOSYS; 426 return -ENOSYS;
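
As a hedged illustration of the reworked attribute structure (the function name and values are made up), an LSM passing only a secid to NetLabel would set the matching flag alongside the union member:

static void example_fill_secattr(struct netlbl_lsm_secattr *secattr, u32 secid)
{
	netlbl_secattr_init(secattr);		/* zeroes flags and the union */
	secattr->attr.secid = secid;
	secattr->flags |= NETLBL_SECATTR_SECID;
}
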
diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h
index 702fcfeb37f1..82251575a9b4 100644
--- a/include/scsi/scsi.h
+++ b/include/scsi/scsi.h
@@ -11,6 +11,25 @@
11#include <linux/types.h> 11#include <linux/types.h>
12 12
13/* 13/*
14 * The maximum number of SG segments that we will put inside a
15 * scatterlist (unless chaining is used). Should ideally fit inside a
16 * single page, to avoid a higher order allocation. We could define this
17 * to SG_MAX_SINGLE_ALLOC to pack correctly at the highest order. The
18 * minimum value is 32
19 */
20#define SCSI_MAX_SG_SEGMENTS 128
21
22/*
23 * Like SCSI_MAX_SG_SEGMENTS, but for archs that have sg chaining. This limit
24 * is totally arbitrary, a setting of 2048 will get you at least 8MB I/Os.
25 */
26#ifdef ARCH_HAS_SG_CHAIN
27#define SCSI_MAX_SG_CHAIN_SEGMENTS 2048
28#else
29#define SCSI_MAX_SG_CHAIN_SEGMENTS SCSI_MAX_SG_SEGMENTS
30#endif
31
32/*
14 * SCSI command lengths 33 * SCSI command lengths
15 */ 34 */
16 35
@@ -83,6 +102,7 @@ extern const unsigned char scsi_command_size[8];
83#define READ_TOC 0x43 102#define READ_TOC 0x43
84#define LOG_SELECT 0x4c 103#define LOG_SELECT 0x4c
85#define LOG_SENSE 0x4d 104#define LOG_SENSE 0x4d
105#define XDWRITEREAD_10 0x53
86#define MODE_SELECT_10 0x55 106#define MODE_SELECT_10 0x55
87#define RESERVE_10 0x56 107#define RESERVE_10 0x56
88#define RELEASE_10 0x57 108#define RELEASE_10 0x57
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index a457fca66f61..de28aab820b0 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -2,15 +2,20 @@
2#define _SCSI_SCSI_CMND_H 2#define _SCSI_SCSI_CMND_H
3 3
4#include <linux/dma-mapping.h> 4#include <linux/dma-mapping.h>
5#include <linux/blkdev.h>
5#include <linux/list.h> 6#include <linux/list.h>
6#include <linux/types.h> 7#include <linux/types.h>
7#include <linux/timer.h> 8#include <linux/timer.h>
8#include <linux/scatterlist.h> 9#include <linux/scatterlist.h>
9 10
10struct request;
11struct Scsi_Host; 11struct Scsi_Host;
12struct scsi_device; 12struct scsi_device;
13 13
14struct scsi_data_buffer {
15 struct sg_table table;
16 unsigned length;
17 int resid;
18};
14 19
15/* embedded in scsi_cmnd */ 20/* embedded in scsi_cmnd */
16struct scsi_pointer { 21struct scsi_pointer {
@@ -61,15 +66,11 @@ struct scsi_cmnd {
61 /* These elements define the operation we are about to perform */ 66 /* These elements define the operation we are about to perform */
62#define MAX_COMMAND_SIZE 16 67#define MAX_COMMAND_SIZE 16
63 unsigned char cmnd[MAX_COMMAND_SIZE]; 68 unsigned char cmnd[MAX_COMMAND_SIZE];
64 unsigned request_bufflen; /* Actual request size */
65 69
66 struct timer_list eh_timeout; /* Used to time out the command. */ 70 struct timer_list eh_timeout; /* Used to time out the command. */
67 void *request_buffer; /* Actual requested buffer */
68 71
69 /* These elements define the operation we ultimately want to perform */ 72 /* These elements define the operation we ultimately want to perform */
70 struct sg_table sg_table; 73 struct scsi_data_buffer sdb;
71 unsigned short use_sg; /* Number of pieces of scatter-gather */
72
73 unsigned underflow; /* Return error if less than 74 unsigned underflow; /* Return error if less than
74 this amount is transferred */ 75 this amount is transferred */
75 76
@@ -79,10 +80,6 @@ struct scsi_cmnd {
79 reconnects. Probably == sector 80 reconnects. Probably == sector
80 size */ 81 size */
81 82
82 int resid; /* Number of bytes requested to be
83 transferred less actual number
84 transferred (0 if not supported) */
85
86 struct request *request; /* The command we are 83 struct request *request; /* The command we are
87 working on */ 84 working on */
88 85
@@ -127,27 +124,55 @@ extern void *scsi_kmap_atomic_sg(struct scatterlist *sg, int sg_count,
127 size_t *offset, size_t *len); 124 size_t *offset, size_t *len);
128extern void scsi_kunmap_atomic_sg(void *virt); 125extern void scsi_kunmap_atomic_sg(void *virt);
129 126
130extern int scsi_alloc_sgtable(struct scsi_cmnd *, gfp_t); 127extern int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask);
131extern void scsi_free_sgtable(struct scsi_cmnd *); 128extern void scsi_release_buffers(struct scsi_cmnd *cmd);
132 129
133extern int scsi_dma_map(struct scsi_cmnd *cmd); 130extern int scsi_dma_map(struct scsi_cmnd *cmd);
134extern void scsi_dma_unmap(struct scsi_cmnd *cmd); 131extern void scsi_dma_unmap(struct scsi_cmnd *cmd);
135 132
136#define scsi_sg_count(cmd) ((cmd)->use_sg) 133static inline unsigned scsi_sg_count(struct scsi_cmnd *cmd)
137#define scsi_sglist(cmd) ((cmd)->sg_table.sgl) 134{
138#define scsi_bufflen(cmd) ((cmd)->request_bufflen) 135 return cmd->sdb.table.nents;
136}
137
138static inline struct scatterlist *scsi_sglist(struct scsi_cmnd *cmd)
139{
140 return cmd->sdb.table.sgl;
141}
142
143static inline unsigned scsi_bufflen(struct scsi_cmnd *cmd)
144{
145 return cmd->sdb.length;
146}
139 147
140static inline void scsi_set_resid(struct scsi_cmnd *cmd, int resid) 148static inline void scsi_set_resid(struct scsi_cmnd *cmd, int resid)
141{ 149{
142 cmd->resid = resid; 150 cmd->sdb.resid = resid;
143} 151}
144 152
145static inline int scsi_get_resid(struct scsi_cmnd *cmd) 153static inline int scsi_get_resid(struct scsi_cmnd *cmd)
146{ 154{
147 return cmd->resid; 155 return cmd->sdb.resid;
148} 156}
149 157
150#define scsi_for_each_sg(cmd, sg, nseg, __i) \ 158#define scsi_for_each_sg(cmd, sg, nseg, __i) \
151 for_each_sg(scsi_sglist(cmd), sg, nseg, __i) 159 for_each_sg(scsi_sglist(cmd), sg, nseg, __i)
152 160
161static inline int scsi_bidi_cmnd(struct scsi_cmnd *cmd)
162{
163 return blk_bidi_rq(cmd->request) &&
164 (cmd->request->next_rq->special != NULL);
165}
166
167static inline struct scsi_data_buffer *scsi_in(struct scsi_cmnd *cmd)
168{
169 return scsi_bidi_cmnd(cmd) ?
170 cmd->request->next_rq->special : &cmd->sdb;
171}
172
173static inline struct scsi_data_buffer *scsi_out(struct scsi_cmnd *cmd)
174{
175 return &cmd->sdb;
176}
177
153#endif /* _SCSI_SCSI_CMND_H */ 178#endif /* _SCSI_SCSI_CMND_H */
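
A sketch of a low-level driver using the new accessors instead of the removed request_buffer/use_sg/resid fields; example_map_segment() is hypothetical, and scsi_dma_map() is assumed to have been called beforehand:

static void example_queue_data(struct scsi_cmnd *cmd)
{
	struct scatterlist *sg;
	int i;

	scsi_for_each_sg(cmd, sg, scsi_sg_count(cmd), i)
		example_map_segment(sg_dma_address(sg), sg_dma_len(sg));

	scsi_set_resid(cmd, 0);		/* all scsi_bufflen(cmd) bytes moved */
}
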
diff --git a/include/scsi/scsi_eh.h b/include/scsi/scsi_eh.h
index d21b8913ceb3..25071d5d9bf8 100644
--- a/include/scsi/scsi_eh.h
+++ b/include/scsi/scsi_eh.h
@@ -68,16 +68,15 @@ extern int scsi_get_sense_info_fld(const u8 * sense_buffer, int sb_len,
68extern int scsi_reset_provider(struct scsi_device *, int); 68extern int scsi_reset_provider(struct scsi_device *, int);
69 69
70struct scsi_eh_save { 70struct scsi_eh_save {
71 /* saved state */
71 int result; 72 int result;
72 enum dma_data_direction data_direction; 73 enum dma_data_direction data_direction;
73 unsigned char cmd_len; 74 unsigned char cmd_len;
74 unsigned char cmnd[MAX_COMMAND_SIZE]; 75 unsigned char cmnd[MAX_COMMAND_SIZE];
76 struct scsi_data_buffer sdb;
77 struct request *next_rq;
75 78
76 void *buffer; 79 /* new command support */
77 unsigned bufflen;
78 unsigned short use_sg;
79 int resid;
80
81 struct scatterlist sense_sgl; 80 struct scatterlist sense_sgl;
82}; 81};
83 82
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 0fd4746ee39d..5c58d594126a 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -39,9 +39,6 @@ struct blk_queue_tags;
39#define DISABLE_CLUSTERING 0 39#define DISABLE_CLUSTERING 0
40#define ENABLE_CLUSTERING 1 40#define ENABLE_CLUSTERING 1
41 41
42#define DISABLE_SG_CHAINING 0
43#define ENABLE_SG_CHAINING 1
44
45enum scsi_eh_timer_return { 42enum scsi_eh_timer_return {
46 EH_NOT_HANDLED, 43 EH_NOT_HANDLED,
47 EH_HANDLED, 44 EH_HANDLED,
@@ -136,9 +133,9 @@ struct scsi_host_template {
136 * the done callback is invoked. 133 * the done callback is invoked.
137 * 134 *
138 * This is called to inform the LLD to transfer 135 * This is called to inform the LLD to transfer
139 * cmd->request_bufflen bytes. The cmd->use_sg specifies the 136 * scsi_bufflen(cmd) bytes. scsi_sg_count(cmd) specifies the
140 * number of scatterlist entries in the command and 137 * number of scatterlist entries in the command and
141 * cmd->request_buffer contains the scatterlist. 138 * scsi_sglist(cmd) returns the scatterlist.
142 * 139 *
143 * return values: see queuecommand 140 * return values: see queuecommand
144 * 141 *
@@ -446,15 +443,6 @@ struct scsi_host_template {
446 unsigned ordered_tag:1; 443 unsigned ordered_tag:1;
447 444
448 /* 445 /*
449 * true if the low-level driver can support sg chaining. this
450 * will be removed eventually when all the drivers are
451 * converted to support sg chaining.
452 *
453 * Status: OBSOLETE
454 */
455 unsigned use_sg_chaining:1;
456
457 /*
458 * Countdown for host blocking with no commands outstanding 446 * Countdown for host blocking with no commands outstanding
459 */ 447 */
460 unsigned int max_host_blocked; 448 unsigned int max_host_blocked;
@@ -598,7 +586,6 @@ struct Scsi_Host {
598 unsigned unchecked_isa_dma:1; 586 unsigned unchecked_isa_dma:1;
599 unsigned use_clustering:1; 587 unsigned use_clustering:1;
600 unsigned use_blk_tcq:1; 588 unsigned use_blk_tcq:1;
601 unsigned use_sg_chaining:1;
602 589
603 /* 590 /*
604 * Host has requested that no further requests come through for the 591 * Host has requested that no further requests come through for the
diff --git a/include/xen/page.h b/include/xen/page.h
index c0c8fcb27899..031ef22a971e 100644
--- a/include/xen/page.h
+++ b/include/xen/page.h
@@ -156,16 +156,16 @@ static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot)
156 156
157static inline unsigned long long pte_val_ma(pte_t x) 157static inline unsigned long long pte_val_ma(pte_t x)
158{ 158{
159 return ((unsigned long long)x.pte_high << 32) | x.pte_low; 159 return x.pte;
160} 160}
161#define pmd_val_ma(v) ((v).pmd) 161#define pmd_val_ma(v) ((v).pmd)
162#define pud_val_ma(v) ((v).pgd.pgd) 162#define pud_val_ma(v) ((v).pgd.pgd)
163#define __pte_ma(x) ((pte_t) { .pte_low = (x), .pte_high = (x)>>32 } ) 163#define __pte_ma(x) ((pte_t) { .pte = (x) })
164#define __pmd_ma(x) ((pmd_t) { (x) } ) 164#define __pmd_ma(x) ((pmd_t) { (x) } )
165#else /* !X86_PAE */ 165#else /* !X86_PAE */
166#define pte_mfn(_pte) ((_pte).pte_low >> PAGE_SHIFT) 166#define pte_mfn(_pte) ((_pte).pte_low >> PAGE_SHIFT)
167#define mfn_pte(pfn, prot) __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) 167#define mfn_pte(pfn, prot) __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
168#define pte_val_ma(x) ((x).pte_low) 168#define pte_val_ma(x) ((x).pte)
169#define pmd_val_ma(v) ((v).pud.pgd.pgd) 169#define pmd_val_ma(v) ((v).pud.pgd.pgd)
170#define __pte_ma(x) ((pte_t) { (x) } ) 170#define __pte_ma(x) ((pte_t) { (x) } )
171#endif /* CONFIG_X86_PAE */ 171#endif /* CONFIG_X86_PAE */
diff --git a/init/main.c b/init/main.c
index f287ca5862b9..cb81ed116f62 100644
--- a/init/main.c
+++ b/init/main.c
@@ -128,7 +128,7 @@ static char *ramdisk_execute_command;
128 128
129#ifdef CONFIG_SMP 129#ifdef CONFIG_SMP
130/* Setup configured maximum number of CPUs to activate */ 130/* Setup configured maximum number of CPUs to activate */
131static unsigned int __initdata max_cpus = NR_CPUS; 131unsigned int __initdata setup_max_cpus = NR_CPUS;
132 132
133/* 133/*
134 * Setup routine for controlling SMP activation 134 * Setup routine for controlling SMP activation
@@ -146,7 +146,7 @@ static inline void disable_ioapic_setup(void) {};
146 146
147static int __init nosmp(char *str) 147static int __init nosmp(char *str)
148{ 148{
149 max_cpus = 0; 149 setup_max_cpus = 0;
150 disable_ioapic_setup(); 150 disable_ioapic_setup();
151 return 0; 151 return 0;
152} 152}
@@ -155,8 +155,8 @@ early_param("nosmp", nosmp);
155 155
156static int __init maxcpus(char *str) 156static int __init maxcpus(char *str)
157{ 157{
158 get_option(&str, &max_cpus); 158 get_option(&str, &setup_max_cpus);
159 if (max_cpus == 0) 159 if (setup_max_cpus == 0)
160 disable_ioapic_setup(); 160 disable_ioapic_setup();
161 161
162 return 0; 162 return 0;
@@ -164,7 +164,7 @@ static int __init maxcpus(char *str)
164 164
165early_param("maxcpus", maxcpus); 165early_param("maxcpus", maxcpus);
166#else 166#else
167#define max_cpus NR_CPUS 167#define setup_max_cpus NR_CPUS
168#endif 168#endif
169 169
170/* 170/*
@@ -318,6 +318,10 @@ static int __init unknown_bootoption(char *param, char *val)
318 return 0; 318 return 0;
319} 319}
320 320
321#ifdef CONFIG_DEBUG_PAGEALLOC
322int __read_mostly debug_pagealloc_enabled = 0;
323#endif
324
321static int __init init_setup(char *str) 325static int __init init_setup(char *str)
322{ 326{
323 unsigned int i; 327 unsigned int i;
@@ -363,7 +367,7 @@ static inline void smp_prepare_cpus(unsigned int maxcpus) { }
363 367
364#else 368#else
365 369
366#ifdef __GENERIC_PER_CPU 370#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
367unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; 371unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
368 372
369EXPORT_SYMBOL(__per_cpu_offset); 373EXPORT_SYMBOL(__per_cpu_offset);
@@ -384,7 +388,7 @@ static void __init setup_per_cpu_areas(void)
384 ptr += size; 388 ptr += size;
385 } 389 }
386} 390}
387#endif /* !__GENERIC_PER_CPU */ 391#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
388 392
389/* Called by boot processor to activate the rest. */ 393/* Called by boot processor to activate the rest. */
390static void __init smp_init(void) 394static void __init smp_init(void)
@@ -393,7 +397,7 @@ static void __init smp_init(void)
393 397
394 /* FIXME: This should be done in userspace --RR */ 398 /* FIXME: This should be done in userspace --RR */
395 for_each_present_cpu(cpu) { 399 for_each_present_cpu(cpu) {
396 if (num_online_cpus() >= max_cpus) 400 if (num_online_cpus() >= setup_max_cpus)
397 break; 401 break;
398 if (!cpu_online(cpu)) 402 if (!cpu_online(cpu))
399 cpu_up(cpu); 403 cpu_up(cpu);
@@ -401,7 +405,7 @@ static void __init smp_init(void)
401 405
402 /* Any cleanup work */ 406 /* Any cleanup work */
403 printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus()); 407 printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus());
404 smp_cpus_done(max_cpus); 408 smp_cpus_done(setup_max_cpus);
405} 409}
406 410
407#endif 411#endif
@@ -552,6 +556,7 @@ asmlinkage void __init start_kernel(void)
552 preempt_disable(); 556 preempt_disable();
553 build_all_zonelists(); 557 build_all_zonelists();
554 page_alloc_init(); 558 page_alloc_init();
559 enable_debug_pagealloc();
555 printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line); 560 printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
556 parse_early_param(); 561 parse_early_param();
557 parse_args("Booting kernel", static_command_line, __start___param, 562 parse_args("Booting kernel", static_command_line, __start___param,
@@ -824,7 +829,7 @@ static int __init kernel_init(void * unused)
824 __set_special_pids(1, 1); 829 __set_special_pids(1, 1);
825 cad_pid = task_pid(current); 830 cad_pid = task_pid(current);
826 831
827 smp_prepare_cpus(max_cpus); 832 smp_prepare_cpus(setup_max_cpus);
828 833
829 do_pre_smp_initcalls(); 834 do_pre_smp_initcalls();
830 835
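The init/main.c hunks above add a global debug_pagealloc_enabled flag and call enable_debug_pagealloc() from start_kernel(). As a hedged sketch (not part of this patch), a boot parameter could flip that flag early enough for the allocator to see it; the parameter name and handler below are assumptions:

#ifdef CONFIG_DEBUG_PAGEALLOC
static int __init set_debug_pagealloc(char *str)
{
        debug_pagealloc_enabled = 1;    /* consulted later by the page allocator */
        return 0;
}
early_param("debug_pagealloc", set_debug_pagealloc);
#endif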
diff --git a/kernel/Makefile b/kernel/Makefile
index 390d42146267..8885627ea021 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_KALLSYMS) += kallsyms.o
36obj-$(CONFIG_PM) += power/ 36obj-$(CONFIG_PM) += power/
37obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 37obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
38obj-$(CONFIG_KEXEC) += kexec.o 38obj-$(CONFIG_KEXEC) += kexec.o
39obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
39obj-$(CONFIG_COMPAT) += compat.o 40obj-$(CONFIG_COMPAT) += compat.o
40obj-$(CONFIG_CGROUPS) += cgroup.o 41obj-$(CONFIG_CGROUPS) += cgroup.o
41obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o 42obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
@@ -43,6 +44,7 @@ obj-$(CONFIG_CPUSETS) += cpuset.o
43obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o 44obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
44obj-$(CONFIG_IKCONFIG) += configs.o 45obj-$(CONFIG_IKCONFIG) += configs.o
45obj-$(CONFIG_STOP_MACHINE) += stop_machine.o 46obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
47obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
46obj-$(CONFIG_AUDIT) += audit.o auditfilter.o 48obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
47obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 49obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
48obj-$(CONFIG_AUDIT_TREE) += audit_tree.o 50obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
new file mode 100644
index 000000000000..d1a7605c5b8f
--- /dev/null
+++ b/kernel/backtracetest.c
@@ -0,0 +1,48 @@
1/*
2 * Simple stack backtrace regression test module
3 *
4 * (C) Copyright 2008 Intel Corporation
5 * Author: Arjan van de Ven <arjan@linux.intel.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 */
12
13#include <linux/module.h>
14#include <linux/sched.h>
15#include <linux/delay.h>
16
17static struct timer_list backtrace_timer;
18
19static void backtrace_test_timer(unsigned long data)
20{
21 printk("Testing a backtrace from irq context.\n");
22 printk("The following trace is a kernel self test and not a bug!\n");
23 dump_stack();
24}
25static int backtrace_regression_test(void)
26{
27 printk("====[ backtrace testing ]===========\n");
28 printk("Testing a backtrace from process context.\n");
29 printk("The following trace is a kernel self test and not a bug!\n");
30 dump_stack();
31
32 init_timer(&backtrace_timer);
33 backtrace_timer.function = backtrace_test_timer;
34 mod_timer(&backtrace_timer, jiffies + 10);
35
36 msleep(10);
37 printk("====[ end of backtrace testing ]====\n");
38 return 0;
39}
40
41static void exitf(void)
42{
43}
44
45module_init(backtrace_regression_test);
46module_exit(exitf);
47MODULE_LICENSE("GPL");
48MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
diff --git a/kernel/fork.c b/kernel/fork.c
index 314f5101d2b0..05e0b6f4365b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -393,6 +393,7 @@ void fastcall __mmdrop(struct mm_struct *mm)
393 destroy_context(mm); 393 destroy_context(mm);
394 free_mm(mm); 394 free_mm(mm);
395} 395}
396EXPORT_SYMBOL_GPL(__mmdrop);
396 397
397/* 398/*
398 * Decrement the use count and release all resources for an mm. 399 * Decrement the use count and release all resources for an mm.
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1f314221d534..438a01464287 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -479,6 +479,9 @@ void free_irq(unsigned int irq, void *dev_id)
479 return; 479 return;
480 } 480 }
481 printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq); 481 printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq);
482#ifdef CONFIG_DEBUG_SHIRQ
483 dump_stack();
484#endif
482 spin_unlock_irqrestore(&desc->lock, flags); 485 spin_unlock_irqrestore(&desc->lock, flags);
483 return; 486 return;
484 } 487 }
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 50b81b98046a..c2f2ccb0549a 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -75,6 +75,18 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
75 75
76#endif 76#endif
77 77
78static int irq_spurious_read(char *page, char **start, off_t off,
79 int count, int *eof, void *data)
80{
81 struct irq_desc *d = &irq_desc[(long) data];
82 return sprintf(page, "count %u\n"
83 "unhandled %u\n"
84 "last_unhandled %u ms\n",
85 d->irq_count,
86 d->irqs_unhandled,
87 jiffies_to_msecs(d->last_unhandled));
88}
89
78#define MAX_NAMELEN 128 90#define MAX_NAMELEN 128
79 91
80static int name_unique(unsigned int irq, struct irqaction *new_action) 92static int name_unique(unsigned int irq, struct irqaction *new_action)
@@ -118,6 +130,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
118void register_irq_proc(unsigned int irq) 130void register_irq_proc(unsigned int irq)
119{ 131{
120 char name [MAX_NAMELEN]; 132 char name [MAX_NAMELEN];
133 struct proc_dir_entry *entry;
121 134
122 if (!root_irq_dir || 135 if (!root_irq_dir ||
123 (irq_desc[irq].chip == &no_irq_chip) || 136 (irq_desc[irq].chip == &no_irq_chip) ||
@@ -132,8 +145,6 @@ void register_irq_proc(unsigned int irq)
132 145
133#ifdef CONFIG_SMP 146#ifdef CONFIG_SMP
134 { 147 {
135 struct proc_dir_entry *entry;
136
137 /* create /proc/irq/<irq>/smp_affinity */ 148 /* create /proc/irq/<irq>/smp_affinity */
138 entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir); 149 entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir);
139 150
@@ -144,6 +155,12 @@ void register_irq_proc(unsigned int irq)
144 } 155 }
145 } 156 }
146#endif 157#endif
158
159 entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir);
160 if (entry) {
161 entry->data = (void *)(long)irq;
162 entry->read_proc = irq_spurious_read;
163 }
147} 164}
148 165
149#undef MAX_NAMELEN 166#undef MAX_NAMELEN
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 32b161972fad..a6b2bc831dd0 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -10,6 +10,7 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/kallsyms.h> 11#include <linux/kallsyms.h>
12#include <linux/interrupt.h> 12#include <linux/interrupt.h>
13#include <linux/moduleparam.h>
13 14
14static int irqfixup __read_mostly; 15static int irqfixup __read_mostly;
15 16
@@ -225,6 +226,8 @@ int noirqdebug_setup(char *str)
225} 226}
226 227
227__setup("noirqdebug", noirqdebug_setup); 228__setup("noirqdebug", noirqdebug_setup);
229module_param(noirqdebug, bool, 0644);
230MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
228 231
229static int __init irqfixup_setup(char *str) 232static int __init irqfixup_setup(char *str)
230{ 233{
@@ -236,6 +239,8 @@ static int __init irqfixup_setup(char *str)
236} 239}
237 240
238__setup("irqfixup", irqfixup_setup); 241__setup("irqfixup", irqfixup_setup);
242module_param(irqfixup, int, 0644);
243 MODULE_PARM_DESC(irqfixup, "0: No fixup, 1: irqfixup mode, 2: irqpoll mode");
239 244
240static int __init irqpoll_setup(char *str) 245static int __init irqpoll_setup(char *str)
241{ 246{
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e3a5d817ac9b..d0493eafea3e 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -824,6 +824,8 @@ static int __init init_kprobes(void)
824 if (!err) 824 if (!err)
825 err = register_die_notifier(&kprobe_exceptions_nb); 825 err = register_die_notifier(&kprobe_exceptions_nb);
826 826
827 if (!err)
828 init_test_probes();
827 return err; 829 return err;
828} 830}
829 831
diff --git a/kernel/module.c b/kernel/module.c
index f6a4e721fd49..bd60278ee703 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -430,6 +430,14 @@ static unsigned int find_pcpusec(Elf_Ehdr *hdr,
430 return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); 430 return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
431} 431}
432 432
433static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
434{
435 int cpu;
436
437 for_each_possible_cpu(cpu)
438 memcpy(pcpudest + per_cpu_offset(cpu), from, size);
439}
440
433static int percpu_modinit(void) 441static int percpu_modinit(void)
434{ 442{
435 pcpu_num_used = 2; 443 pcpu_num_used = 2;
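percpu_modcopy() above replicates a module's .data.percpu section image into every possible CPU's copy of the per-CPU area. A hedged sketch of the kind of call site it serves during module loading follows; the helper and variable names are assumptions, not code from this patch:

static void example_copy_percpu_section(void *percpu_dest,
                                        const Elf_Shdr *pcpusec)
{
        /* Replicate the ELF per-CPU section to every possible CPU. */
        percpu_modcopy(percpu_dest, (void *)pcpusec->sh_addr,
                       pcpusec->sh_size);
}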
diff --git a/kernel/panic.c b/kernel/panic.c
index da4d6bac270e..d9e90cfe3298 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -20,6 +20,7 @@
20#include <linux/kexec.h> 20#include <linux/kexec.h>
21#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
22#include <linux/random.h> 22#include <linux/random.h>
23#include <linux/kallsyms.h>
23 24
24int panic_on_oops; 25int panic_on_oops;
25int tainted; 26int tainted;
@@ -280,6 +281,13 @@ static int init_oops_id(void)
280} 281}
281late_initcall(init_oops_id); 282late_initcall(init_oops_id);
282 283
284static void print_oops_end_marker(void)
285{
286 init_oops_id();
287 printk(KERN_WARNING "---[ end trace %016llx ]---\n",
288 (unsigned long long)oops_id);
289}
290
283/* 291/*
284 * Called when the architecture exits its oops handler, after printing 292 * Called when the architecture exits its oops handler, after printing
285 * everything. 293 * everything.
@@ -287,11 +295,26 @@ late_initcall(init_oops_id);
287void oops_exit(void) 295void oops_exit(void)
288{ 296{
289 do_oops_enter_exit(); 297 do_oops_enter_exit();
290 init_oops_id(); 298 print_oops_end_marker();
291 printk(KERN_WARNING "---[ end trace %016llx ]---\n",
292 (unsigned long long)oops_id);
293} 299}
294 300
301#ifdef WANT_WARN_ON_SLOWPATH
302void warn_on_slowpath(const char *file, int line)
303{
304 char function[KSYM_SYMBOL_LEN];
305 unsigned long caller = (unsigned long) __builtin_return_address(0);
306 sprint_symbol(function, caller);
307
308 printk(KERN_WARNING "------------[ cut here ]------------\n");
309 printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
310 line, function);
311 print_modules();
312 dump_stack();
313 print_oops_end_marker();
314}
315EXPORT_SYMBOL(warn_on_slowpath);
316#endif
317
295#ifdef CONFIG_CC_STACKPROTECTOR 318#ifdef CONFIG_CC_STACKPROTECTOR
296/* 319/*
297 * Called when gcc's -fstack-protector feature is used, and 320 * Called when gcc's -fstack-protector feature is used, and
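warn_on_slowpath() added above is an out-of-line body for WARN_ON() on configurations that define WANT_WARN_ON_SLOWPATH. As a hedged sketch, a wrapper macro could hand the file and line to it roughly like this (the exact asm-generic/bug.h definition may differ):

#define EXAMPLE_WARN_ON(condition) ({                           \
        int __ret_warn_on = !!(condition);                      \
        if (unlikely(__ret_warn_on))                            \
                warn_on_slowpath(__FILE__, __LINE__);           \
        unlikely(__ret_warn_on);                                \
})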
diff --git a/kernel/printk.c b/kernel/printk.c
index 3b7c968d0ef9..58bbec684119 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -36,6 +36,13 @@
36 36
37#include <asm/uaccess.h> 37#include <asm/uaccess.h>
38 38
39/*
40 * Architectures can override it:
41 */
42void __attribute__((weak)) early_printk(const char *fmt, ...)
43{
44}
45
39#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) 46#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
40 47
41/* printk's without a loglevel use this.. */ 48/* printk's without a loglevel use this.. */
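The weak early_printk() stub added to kernel/printk.c lets code call early_printk() even on architectures that provide no early console. A hedged illustration of an architecture override follows; early_serial_putc() is a hypothetical output routine, not an API introduced by this patch:

void early_printk(const char *fmt, ...)
{
        char buf[512];
        va_list ap;
        int i, n;

        va_start(ap, fmt);
        n = vscnprintf(buf, sizeof(buf), fmt, ap);
        va_end(ap);

        for (i = 0; i < n; i++)
                early_serial_putc(buf[i]);      /* hypothetical serial helper */
}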
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index c719bb9d79ab..e6e9b8be4b05 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -366,12 +366,73 @@ static int ptrace_setsiginfo(struct task_struct *child, siginfo_t __user * data)
366 return error; 366 return error;
367} 367}
368 368
369
370#ifdef PTRACE_SINGLESTEP
371#define is_singlestep(request) ((request) == PTRACE_SINGLESTEP)
372#else
373#define is_singlestep(request) 0
374#endif
375
376#ifdef PTRACE_SINGLEBLOCK
377#define is_singleblock(request) ((request) == PTRACE_SINGLEBLOCK)
378#else
379#define is_singleblock(request) 0
380#endif
381
382#ifdef PTRACE_SYSEMU
383#define is_sysemu_singlestep(request) ((request) == PTRACE_SYSEMU_SINGLESTEP)
384#else
385#define is_sysemu_singlestep(request) 0
386#endif
387
388static int ptrace_resume(struct task_struct *child, long request, long data)
389{
390 if (!valid_signal(data))
391 return -EIO;
392
393 if (request == PTRACE_SYSCALL)
394 set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
395 else
396 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
397
398#ifdef TIF_SYSCALL_EMU
399 if (request == PTRACE_SYSEMU || request == PTRACE_SYSEMU_SINGLESTEP)
400 set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
401 else
402 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
403#endif
404
405 if (is_singleblock(request)) {
406 if (unlikely(!arch_has_block_step()))
407 return -EIO;
408 user_enable_block_step(child);
409 } else if (is_singlestep(request) || is_sysemu_singlestep(request)) {
410 if (unlikely(!arch_has_single_step()))
411 return -EIO;
412 user_enable_single_step(child);
413 }
414 else
415 user_disable_single_step(child);
416
417 child->exit_code = data;
418 wake_up_process(child);
419
420 return 0;
421}
422
369int ptrace_request(struct task_struct *child, long request, 423int ptrace_request(struct task_struct *child, long request,
370 long addr, long data) 424 long addr, long data)
371{ 425{
372 int ret = -EIO; 426 int ret = -EIO;
373 427
374 switch (request) { 428 switch (request) {
429 case PTRACE_PEEKTEXT:
430 case PTRACE_PEEKDATA:
431 return generic_ptrace_peekdata(child, addr, data);
432 case PTRACE_POKETEXT:
433 case PTRACE_POKEDATA:
434 return generic_ptrace_pokedata(child, addr, data);
435
375#ifdef PTRACE_OLDSETOPTIONS 436#ifdef PTRACE_OLDSETOPTIONS
376 case PTRACE_OLDSETOPTIONS: 437 case PTRACE_OLDSETOPTIONS:
377#endif 438#endif
@@ -390,6 +451,26 @@ int ptrace_request(struct task_struct *child, long request,
390 case PTRACE_DETACH: /* detach a process that was attached. */ 451 case PTRACE_DETACH: /* detach a process that was attached. */
391 ret = ptrace_detach(child, data); 452 ret = ptrace_detach(child, data);
392 break; 453 break;
454
455#ifdef PTRACE_SINGLESTEP
456 case PTRACE_SINGLESTEP:
457#endif
458#ifdef PTRACE_SINGLEBLOCK
459 case PTRACE_SINGLEBLOCK:
460#endif
461#ifdef PTRACE_SYSEMU
462 case PTRACE_SYSEMU:
463 case PTRACE_SYSEMU_SINGLESTEP:
464#endif
465 case PTRACE_SYSCALL:
466 case PTRACE_CONT:
467 return ptrace_resume(child, request, data);
468
469 case PTRACE_KILL:
470 if (child->exit_state) /* already dead */
471 return 0;
472 return ptrace_resume(child, request, SIGKILL);
473
393 default: 474 default:
394 break; 475 break;
395 } 476 }
@@ -526,3 +607,87 @@ int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data)
526 copied = access_process_vm(tsk, addr, &data, sizeof(data), 1); 607 copied = access_process_vm(tsk, addr, &data, sizeof(data), 1);
527 return (copied == sizeof(data)) ? 0 : -EIO; 608 return (copied == sizeof(data)) ? 0 : -EIO;
528} 609}
610
611#ifdef CONFIG_COMPAT
612#include <linux/compat.h>
613
614int compat_ptrace_request(struct task_struct *child, compat_long_t request,
615 compat_ulong_t addr, compat_ulong_t data)
616{
617 compat_ulong_t __user *datap = compat_ptr(data);
618 compat_ulong_t word;
619 int ret;
620
621 switch (request) {
622 case PTRACE_PEEKTEXT:
623 case PTRACE_PEEKDATA:
624 ret = access_process_vm(child, addr, &word, sizeof(word), 0);
625 if (ret != sizeof(word))
626 ret = -EIO;
627 else
628 ret = put_user(word, datap);
629 break;
630
631 case PTRACE_POKETEXT:
632 case PTRACE_POKEDATA:
633 ret = access_process_vm(child, addr, &data, sizeof(data), 1);
634 ret = (ret != sizeof(data) ? -EIO : 0);
635 break;
636
637 case PTRACE_GETEVENTMSG:
638 ret = put_user((compat_ulong_t) child->ptrace_message, datap);
639 break;
640
641 default:
642 ret = ptrace_request(child, request, addr, data);
643 }
644
645 return ret;
646}
647
648#ifdef __ARCH_WANT_COMPAT_SYS_PTRACE
649asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
650 compat_long_t addr, compat_long_t data)
651{
652 struct task_struct *child;
653 long ret;
654
655 /*
656 * This lock_kernel fixes a subtle race with suid exec
657 */
658 lock_kernel();
659 if (request == PTRACE_TRACEME) {
660 ret = ptrace_traceme();
661 goto out;
662 }
663
664 child = ptrace_get_task_struct(pid);
665 if (IS_ERR(child)) {
666 ret = PTR_ERR(child);
667 goto out;
668 }
669
670 if (request == PTRACE_ATTACH) {
671 ret = ptrace_attach(child);
672 /*
673 * Some architectures need to do book-keeping after
674 * a ptrace attach.
675 */
676 if (!ret)
677 arch_ptrace_attach(child);
678 goto out_put_task_struct;
679 }
680
681 ret = ptrace_check_attach(child, request == PTRACE_KILL);
682 if (!ret)
683 ret = compat_arch_ptrace(child, request, addr, data);
684
685 out_put_task_struct:
686 put_task_struct(child);
687 out:
688 unlock_kernel();
689 return ret;
690}
691#endif /* __ARCH_WANT_COMPAT_SYS_PTRACE */
692
693#endif /* CONFIG_COMPAT */
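The ptrace.c changes above move the PEEK/POKE and resume-style requests into the generic ptrace_request()/ptrace_resume() paths. A hedged userspace illustration of the request flow those paths now serve (ordinary ptrace(2) usage, not code from this patch):

#include <signal.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        pid_t child = fork();

        if (child == 0) {
                ptrace(PTRACE_TRACEME, 0, NULL, NULL);
                raise(SIGSTOP);                 /* stop so the parent can trace */
                _exit(0);
        }
        waitpid(child, NULL, 0);
        ptrace(PTRACE_SINGLESTEP, child, NULL, NULL);   /* handled by ptrace_resume() */
        waitpid(child, NULL, 0);
        ptrace(PTRACE_CONT, child, NULL, NULL);         /* handled by ptrace_resume() */
        waitpid(child, NULL, 0);
        return 0;
}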
diff --git a/kernel/sched.c b/kernel/sched.c
index 524285e46fa7..ba4c88088f62 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4945,19 +4945,15 @@ EXPORT_SYMBOL(_cond_resched);
4945 */ 4945 */
4946int cond_resched_lock(spinlock_t *lock) 4946int cond_resched_lock(spinlock_t *lock)
4947{ 4947{
4948 int resched = need_resched() && system_state == SYSTEM_RUNNING;
4948 int ret = 0; 4949 int ret = 0;
4949 4950
4950 if (need_lockbreak(lock)) { 4951 if (spin_needbreak(lock) || resched) {
4951 spin_unlock(lock); 4952 spin_unlock(lock);
4952 cpu_relax(); 4953 if (resched && need_resched())
4953 ret = 1; 4954 __cond_resched();
4954 spin_lock(lock); 4955 else
4955 } 4956 cpu_relax();
4956 if (need_resched() && system_state == SYSTEM_RUNNING) {
4957 spin_release(&lock->dep_map, 1, _THIS_IP_);
4958 _raw_spin_unlock(lock);
4959 preempt_enable_no_resched();
4960 __cond_resched();
4961 ret = 1; 4957 ret = 1;
4962 spin_lock(lock); 4958 spin_lock(lock);
4963 } 4959 }
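The rewritten cond_resched_lock() above keys off spin_needbreak() as well as need_resched(). A hedged sketch of the caller pattern it serves, a long scan under a spinlock that briefly yields when someone is waiting; struct foo and the loop are assumptions:

static void example_scan(struct foo *table, int nr, spinlock_t *lock)
{
        int i;

        spin_lock(lock);
        for (i = 0; i < nr; i++) {
                /* ... work on table[i] ... */
                cond_resched_lock(lock);        /* may drop and re-take the lock */
        }
        spin_unlock(lock);
}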
diff --git a/kernel/signal.c b/kernel/signal.c
index afa4f781f924..bf49ce6f016b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -733,13 +733,13 @@ static void print_fatal_signal(struct pt_regs *regs, int signr)
733 current->comm, task_pid_nr(current), signr); 733 current->comm, task_pid_nr(current), signr);
734 734
735#if defined(__i386__) && !defined(__arch_um__) 735#if defined(__i386__) && !defined(__arch_um__)
736 printk("code at %08lx: ", regs->eip); 736 printk("code at %08lx: ", regs->ip);
737 { 737 {
738 int i; 738 int i;
739 for (i = 0; i < 16; i++) { 739 for (i = 0; i < 16; i++) {
740 unsigned char insn; 740 unsigned char insn;
741 741
742 __get_user(insn, (unsigned char *)(regs->eip + i)); 742 __get_user(insn, (unsigned char *)(regs->ip + i));
743 printk("%02x ", insn); 743 printk("%02x ", insn);
744 } 744 }
745 } 745 }
diff --git a/kernel/softirq.c b/kernel/softirq.c
index bd89bc4eb0b9..d7837d45419e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -3,7 +3,9 @@
3 * 3 *
4 * Copyright (C) 1992 Linus Torvalds 4 * Copyright (C) 1992 Linus Torvalds
5 * 5 *
6 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) 6 * Distribute under GPLv2.
7 *
8 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
7 */ 9 */
8 10
9#include <linux/module.h> 11#include <linux/module.h>
@@ -278,9 +280,14 @@ asmlinkage void do_softirq(void)
278 */ 280 */
279void irq_enter(void) 281void irq_enter(void)
280{ 282{
283#ifdef CONFIG_NO_HZ
284 int cpu = smp_processor_id();
285 if (idle_cpu(cpu) && !in_interrupt())
286 tick_nohz_stop_idle(cpu);
287#endif
281 __irq_enter(); 288 __irq_enter();
282#ifdef CONFIG_NO_HZ 289#ifdef CONFIG_NO_HZ
283 if (idle_cpu(smp_processor_id())) 290 if (idle_cpu(cpu))
284 tick_nohz_update_jiffies(); 291 tick_nohz_update_jiffies();
285#endif 292#endif
286} 293}
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index cd72424c2662..ae28c8245123 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -65,8 +65,7 @@ EXPORT_SYMBOL(_write_trylock);
65 * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are 65 * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
66 * not re-enabled during lock-acquire (which the preempt-spin-ops do): 66 * not re-enabled during lock-acquire (which the preempt-spin-ops do):
67 */ 67 */
68#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) || \ 68#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
69 defined(CONFIG_DEBUG_LOCK_ALLOC)
70 69
71void __lockfunc _read_lock(rwlock_t *lock) 70void __lockfunc _read_lock(rwlock_t *lock)
72{ 71{
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4bc8e48434a7..357b68ba23ec 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -53,6 +53,7 @@
53#ifdef CONFIG_X86 53#ifdef CONFIG_X86
54#include <asm/nmi.h> 54#include <asm/nmi.h>
55#include <asm/stacktrace.h> 55#include <asm/stacktrace.h>
56#include <asm/io.h>
56#endif 57#endif
57 58
58static int deprecated_sysctl_warning(struct __sysctl_args *args); 59static int deprecated_sysctl_warning(struct __sysctl_args *args);
@@ -727,6 +728,14 @@ static struct ctl_table kern_table[] = {
727 .mode = 0644, 728 .mode = 0644,
728 .proc_handler = &proc_dointvec, 729 .proc_handler = &proc_dointvec,
729 }, 730 },
731 {
732 .ctl_name = CTL_UNNUMBERED,
733 .procname = "io_delay_type",
734 .data = &io_delay_type,
735 .maxlen = sizeof(int),
736 .mode = 0644,
737 .proc_handler = &proc_dointvec,
738 },
730#endif 739#endif
731#if defined(CONFIG_MMU) 740#if defined(CONFIG_MMU)
732 { 741 {
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
new file mode 100644
index 000000000000..88cdb109e13c
--- /dev/null
+++ b/kernel/test_kprobes.c
@@ -0,0 +1,216 @@
1/*
2 * test_kprobes.c - simple sanity test for *probes
3 *
4 * Copyright IBM Corp. 2008
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it would be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
14 * the GNU General Public License for more details.
15 */
16
17#include <linux/kernel.h>
18#include <linux/kprobes.h>
19#include <linux/random.h>
20
21#define div_factor 3
22
23static u32 rand1, preh_val, posth_val, jph_val;
24static int errors, handler_errors, num_tests;
25
26static noinline u32 kprobe_target(u32 value)
27{
28 /*
29 * gcc ignores noinline on some architectures unless we stuff
30 * sufficient lard into the function. The get_kprobe() here is
31 * just for that.
32 *
33 * NOTE: We aren't concerned about the correctness of get_kprobe()
34 * here; hence, this call is neither under !preempt nor with the
35 * kprobe_mutex held. This is fine(tm)
36 */
37 if (get_kprobe((void *)0xdeadbeef))
38 printk(KERN_INFO "Kprobe smoke test: probe on 0xdeadbeef!\n");
39
40 return (value / div_factor);
41}
42
43static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs)
44{
45 preh_val = (rand1 / div_factor);
46 return 0;
47}
48
49static void kp_post_handler(struct kprobe *p, struct pt_regs *regs,
50 unsigned long flags)
51{
52 if (preh_val != (rand1 / div_factor)) {
53 handler_errors++;
54 printk(KERN_ERR "Kprobe smoke test failed: "
55 "incorrect value in post_handler\n");
56 }
57 posth_val = preh_val + div_factor;
58}
59
60static struct kprobe kp = {
61 .symbol_name = "kprobe_target",
62 .pre_handler = kp_pre_handler,
63 .post_handler = kp_post_handler
64};
65
66static int test_kprobe(void)
67{
68 int ret;
69
70 ret = register_kprobe(&kp);
71 if (ret < 0) {
72 printk(KERN_ERR "Kprobe smoke test failed: "
73 "register_kprobe returned %d\n", ret);
74 return ret;
75 }
76
77 ret = kprobe_target(rand1);
78 unregister_kprobe(&kp);
79
80 if (preh_val == 0) {
81 printk(KERN_ERR "Kprobe smoke test failed: "
82 "kprobe pre_handler not called\n");
83 handler_errors++;
84 }
85
86 if (posth_val == 0) {
87 printk(KERN_ERR "Kprobe smoke test failed: "
88 "kprobe post_handler not called\n");
89 handler_errors++;
90 }
91
92 return 0;
93}
94
95static u32 j_kprobe_target(u32 value)
96{
97 if (value != rand1) {
98 handler_errors++;
99 printk(KERN_ERR "Kprobe smoke test failed: "
100 "incorrect value in jprobe handler\n");
101 }
102
103 jph_val = rand1;
104 jprobe_return();
105 return 0;
106}
107
108static struct jprobe jp = {
109 .entry = j_kprobe_target,
110 .kp.symbol_name = "kprobe_target"
111};
112
113static int test_jprobe(void)
114{
115 int ret;
116
117 ret = register_jprobe(&jp);
118 if (ret < 0) {
119 printk(KERN_ERR "Kprobe smoke test failed: "
120 "register_jprobe returned %d\n", ret);
121 return ret;
122 }
123
124 ret = kprobe_target(rand1);
125 unregister_jprobe(&jp);
126 if (jph_val == 0) {
127 printk(KERN_ERR "Kprobe smoke test failed: "
128 "jprobe handler not called\n");
129 handler_errors++;
130 }
131
132 return 0;
133}
134
135#ifdef CONFIG_KRETPROBES
136static u32 krph_val;
137
138static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
139{
140 unsigned long ret = regs_return_value(regs);
141
142 if (ret != (rand1 / div_factor)) {
143 handler_errors++;
144 printk(KERN_ERR "Kprobe smoke test failed: "
145 "incorrect value in kretprobe handler\n");
146 }
147
148 krph_val = (rand1 / div_factor);
149 return 0;
150}
151
152static struct kretprobe rp = {
153 .handler = return_handler,
154 .kp.symbol_name = "kprobe_target"
155};
156
157static int test_kretprobe(void)
158{
159 int ret;
160
161 ret = register_kretprobe(&rp);
162 if (ret < 0) {
163 printk(KERN_ERR "Kprobe smoke test failed: "
164 "register_kretprobe returned %d\n", ret);
165 return ret;
166 }
167
168 ret = kprobe_target(rand1);
169 unregister_kretprobe(&rp);
170 if (krph_val == 0) {
171 printk(KERN_ERR "Kprobe smoke test failed: "
172 "kretprobe handler not called\n");
173 handler_errors++;
174 }
175
176 return 0;
177}
178#endif /* CONFIG_KRETPROBES */
179
180int init_test_probes(void)
181{
182 int ret;
183
184 do {
185 rand1 = random32();
186 } while (rand1 <= div_factor);
187
188 printk(KERN_INFO "Kprobe smoke test started\n");
189 num_tests++;
190 ret = test_kprobe();
191 if (ret < 0)
192 errors++;
193
194 num_tests++;
195 ret = test_jprobe();
196 if (ret < 0)
197 errors++;
198
199#ifdef CONFIG_KRETPROBES
200 num_tests++;
201 ret = test_kretprobe();
202 if (ret < 0)
203 errors++;
204#endif /* CONFIG_KRETPROBES */
205
206 if (errors)
207 printk(KERN_ERR "BUG: Kprobe smoke test: %d out of "
208 "%d tests failed\n", errors, num_tests);
209 else if (handler_errors)
210 printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) "
211 "running handlers\n", handler_errors);
212 else
213 printk(KERN_INFO "Kprobe smoke test passed successfully\n");
214
215 return 0;
216}
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 5fb139fef9fa..3e59fce6dd43 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -41,6 +41,11 @@ unsigned long clockevent_delta2ns(unsigned long latch,
41{ 41{
42 u64 clc = ((u64) latch << evt->shift); 42 u64 clc = ((u64) latch << evt->shift);
43 43
44 if (unlikely(!evt->mult)) {
45 evt->mult = 1;
46 WARN_ON(1);
47 }
48
44 do_div(clc, evt->mult); 49 do_div(clc, evt->mult);
45 if (clc < 1000) 50 if (clc < 1000)
46 clc = 1000; 51 clc = 1000;
@@ -151,6 +156,14 @@ static void clockevents_notify_released(void)
151void clockevents_register_device(struct clock_event_device *dev) 156void clockevents_register_device(struct clock_event_device *dev)
152{ 157{
153 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 158 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
159 /*
160 * A nsec2cyc multiplicator of 0 is invalid and we'd crash
161 * on it, so fix it up and emit a warning:
162 */
163 if (unlikely(!dev->mult)) {
164 dev->mult = 1;
165 WARN_ON(1);
166 }
154 167
155 spin_lock(&clockevents_lock); 168 spin_lock(&clockevents_lock);
156 169
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 8d6125ad2cf0..6e9259a5d501 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -142,8 +142,13 @@ static void clocksource_watchdog(unsigned long data)
142 } 142 }
143 143
144 if (!list_empty(&watchdog_list)) { 144 if (!list_empty(&watchdog_list)) {
145 __mod_timer(&watchdog_timer, 145 /* Cycle through CPUs to check if the CPUs stay synchronized to
146 watchdog_timer.expires + WATCHDOG_INTERVAL); 146 * each other. */
147 int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map);
148 if (next_cpu >= NR_CPUS)
149 next_cpu = first_cpu(cpu_online_map);
150 watchdog_timer.expires += WATCHDOG_INTERVAL;
151 add_timer_on(&watchdog_timer, next_cpu);
147 } 152 }
148 spin_unlock(&watchdog_lock); 153 spin_unlock(&watchdog_lock);
149} 154}
@@ -165,7 +170,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
165 if (!started && watchdog) { 170 if (!started && watchdog) {
166 watchdog_last = watchdog->read(); 171 watchdog_last = watchdog->read();
167 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; 172 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
168 add_timer(&watchdog_timer); 173 add_timer_on(&watchdog_timer, first_cpu(cpu_online_map));
169 } 174 }
170 } else { 175 } else {
171 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) 176 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
@@ -175,7 +180,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
175 if (watchdog) 180 if (watchdog)
176 del_timer(&watchdog_timer); 181 del_timer(&watchdog_timer);
177 watchdog = cs; 182 watchdog = cs;
178 init_timer(&watchdog_timer); 183 init_timer_deferrable(&watchdog_timer);
179 watchdog_timer.function = clocksource_watchdog; 184 watchdog_timer.function = clocksource_watchdog;
180 185
181 /* Reset watchdog cycles */ 186 /* Reset watchdog cycles */
@@ -186,7 +191,8 @@ static void clocksource_check_watchdog(struct clocksource *cs)
186 watchdog_last = watchdog->read(); 191 watchdog_last = watchdog->read();
187 watchdog_timer.expires = 192 watchdog_timer.expires =
188 jiffies + WATCHDOG_INTERVAL; 193 jiffies + WATCHDOG_INTERVAL;
189 add_timer(&watchdog_timer); 194 add_timer_on(&watchdog_timer,
195 first_cpu(cpu_online_map));
190 } 196 }
191 } 197 }
192 } 198 }
@@ -331,6 +337,21 @@ void clocksource_change_rating(struct clocksource *cs, int rating)
331 spin_unlock_irqrestore(&clocksource_lock, flags); 337 spin_unlock_irqrestore(&clocksource_lock, flags);
332} 338}
333 339
340/**
341 * clocksource_unregister - remove a registered clocksource
342 */
343void clocksource_unregister(struct clocksource *cs)
344{
345 unsigned long flags;
346
347 spin_lock_irqsave(&clocksource_lock, flags);
348 list_del(&cs->list);
349 if (clocksource_override == cs)
350 clocksource_override = NULL;
351 next_clocksource = select_clocksource();
352 spin_unlock_irqrestore(&clocksource_lock, flags);
353}
354
334#ifdef CONFIG_SYSFS 355#ifdef CONFIG_SYSFS
335/** 356/**
336 * sysfs_show_current_clocksources - sysfs interface for current clocksource 357 * sysfs_show_current_clocksources - sysfs interface for current clocksource
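clocksource_unregister() added above gives drivers a way to back out a registered clocksource. A hedged sketch of the register/unregister pairing it enables; the clocksource below is entirely hypothetical and only fills the minimal fields:

static cycle_t example_read(void)
{
        return 0;       /* a real driver would read its hardware counter */
}

static struct clocksource example_cs = {
        .name   = "example",
        .rating = 100,
        .read   = example_read,
        .mask   = CLOCKSOURCE_MASK(32),
        .shift  = 20,
};

static int __init example_init(void)
{
        example_cs.mult = clocksource_hz2mult(1000000, example_cs.shift);
        return clocksource_register(&example_cs);
}

static void __exit example_exit(void)
{
        clocksource_unregister(&example_cs);
}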
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 5b86698faa0b..e1bd50cbbf5d 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -126,9 +126,9 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
126/* 126/*
127 * Broadcast the event to the cpus, which are set in the mask 127 * Broadcast the event to the cpus, which are set in the mask
128 */ 128 */
129int tick_do_broadcast(cpumask_t mask) 129static void tick_do_broadcast(cpumask_t mask)
130{ 130{
131 int ret = 0, cpu = smp_processor_id(); 131 int cpu = smp_processor_id();
132 struct tick_device *td; 132 struct tick_device *td;
133 133
134 /* 134 /*
@@ -138,7 +138,6 @@ int tick_do_broadcast(cpumask_t mask)
138 cpu_clear(cpu, mask); 138 cpu_clear(cpu, mask);
139 td = &per_cpu(tick_cpu_device, cpu); 139 td = &per_cpu(tick_cpu_device, cpu);
140 td->evtdev->event_handler(td->evtdev); 140 td->evtdev->event_handler(td->evtdev);
141 ret = 1;
142 } 141 }
143 142
144 if (!cpus_empty(mask)) { 143 if (!cpus_empty(mask)) {
@@ -151,9 +150,7 @@ int tick_do_broadcast(cpumask_t mask)
151 cpu = first_cpu(mask); 150 cpu = first_cpu(mask);
152 td = &per_cpu(tick_cpu_device, cpu); 151 td = &per_cpu(tick_cpu_device, cpu);
153 td->evtdev->broadcast(mask); 152 td->evtdev->broadcast(mask);
154 ret = 1;
155 } 153 }
156 return ret;
157} 154}
158 155
159/* 156/*
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index bb13f2724905..f13f2b7f4fd4 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -70,8 +70,6 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
70 * Broadcasting support 70 * Broadcasting support
71 */ 71 */
72#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST 72#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
73extern int tick_do_broadcast(cpumask_t mask);
74
75extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); 73extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
76extern int tick_check_broadcast_device(struct clock_event_device *dev); 74extern int tick_check_broadcast_device(struct clock_event_device *dev);
77extern int tick_is_broadcast_device(struct clock_event_device *dev); 75extern int tick_is_broadcast_device(struct clock_event_device *dev);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 1a21b6fdb674..63f24b550695 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -9,7 +9,7 @@
9 * 9 *
10 * Started by: Thomas Gleixner and Ingo Molnar 10 * Started by: Thomas Gleixner and Ingo Molnar
11 * 11 *
12 * For licencing details see kernel-base/COPYING 12 * Distribute under GPLv2.
13 */ 13 */
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15#include <linux/err.h> 15#include <linux/err.h>
@@ -143,6 +143,44 @@ void tick_nohz_update_jiffies(void)
143 local_irq_restore(flags); 143 local_irq_restore(flags);
144} 144}
145 145
146void tick_nohz_stop_idle(int cpu)
147{
148 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
149
150 if (ts->idle_active) {
151 ktime_t now, delta;
152 now = ktime_get();
153 delta = ktime_sub(now, ts->idle_entrytime);
154 ts->idle_lastupdate = now;
155 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
156 ts->idle_active = 0;
157 }
158}
159
160static ktime_t tick_nohz_start_idle(int cpu)
161{
162 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
163 ktime_t now, delta;
164
165 now = ktime_get();
166 if (ts->idle_active) {
167 delta = ktime_sub(now, ts->idle_entrytime);
168 ts->idle_lastupdate = now;
169 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
170 }
171 ts->idle_entrytime = now;
172 ts->idle_active = 1;
173 return now;
174}
175
176u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
177{
178 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
179
180 *last_update_time = ktime_to_us(ts->idle_lastupdate);
181 return ktime_to_us(ts->idle_sleeptime);
182}
183
146/** 184/**
147 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task 185 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
148 * 186 *
@@ -155,13 +193,14 @@ void tick_nohz_stop_sched_tick(void)
155 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; 193 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
156 unsigned long rt_jiffies; 194 unsigned long rt_jiffies;
157 struct tick_sched *ts; 195 struct tick_sched *ts;
158 ktime_t last_update, expires, now, delta; 196 ktime_t last_update, expires, now;
159 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 197 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
160 int cpu; 198 int cpu;
161 199
162 local_irq_save(flags); 200 local_irq_save(flags);
163 201
164 cpu = smp_processor_id(); 202 cpu = smp_processor_id();
203 now = tick_nohz_start_idle(cpu);
165 ts = &per_cpu(tick_cpu_sched, cpu); 204 ts = &per_cpu(tick_cpu_sched, cpu);
166 205
167 /* 206 /*
@@ -193,19 +232,7 @@ void tick_nohz_stop_sched_tick(void)
193 } 232 }
194 } 233 }
195 234
196 now = ktime_get();
197 /*
198 * When called from irq_exit we need to account the idle sleep time
199 * correctly.
200 */
201 if (ts->tick_stopped) {
202 delta = ktime_sub(now, ts->idle_entrytime);
203 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
204 }
205
206 ts->idle_entrytime = now;
207 ts->idle_calls++; 235 ts->idle_calls++;
208
209 /* Read jiffies and the time when jiffies were updated last */ 236 /* Read jiffies and the time when jiffies were updated last */
210 do { 237 do {
211 seq = read_seqbegin(&xtime_lock); 238 seq = read_seqbegin(&xtime_lock);
@@ -296,7 +323,7 @@ void tick_nohz_stop_sched_tick(void)
296 /* Check, if the timer was already in the past */ 323 /* Check, if the timer was already in the past */
297 if (hrtimer_active(&ts->sched_timer)) 324 if (hrtimer_active(&ts->sched_timer))
298 goto out; 325 goto out;
299 } else if(!tick_program_event(expires, 0)) 326 } else if (!tick_program_event(expires, 0))
300 goto out; 327 goto out;
301 /* 328 /*
302 * We are past the event already. So we crossed a 329 * We are past the event already. So we crossed a
@@ -337,23 +364,22 @@ void tick_nohz_restart_sched_tick(void)
337 int cpu = smp_processor_id(); 364 int cpu = smp_processor_id();
338 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 365 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
339 unsigned long ticks; 366 unsigned long ticks;
340 ktime_t now, delta; 367 ktime_t now;
341 368
342 if (!ts->tick_stopped) 369 local_irq_disable();
370 tick_nohz_stop_idle(cpu);
371
372 if (!ts->tick_stopped) {
373 local_irq_enable();
343 return; 374 return;
375 }
344 376
345 /* Update jiffies first */ 377 /* Update jiffies first */
346 now = ktime_get();
347
348 local_irq_disable();
349 select_nohz_load_balancer(0); 378 select_nohz_load_balancer(0);
379 now = ktime_get();
350 tick_do_update_jiffies64(now); 380 tick_do_update_jiffies64(now);
351 cpu_clear(cpu, nohz_cpu_mask); 381 cpu_clear(cpu, nohz_cpu_mask);
352 382
353 /* Account the idle time */
354 delta = ktime_sub(now, ts->idle_entrytime);
355 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
356
357 /* 383 /*
358 * We stopped the tick in idle. Update process times would miss the 384 * We stopped the tick in idle. Update process times would miss the
359 * time we slept as update_process_times does only a 1 tick 385 * time we slept as update_process_times does only a 1 tick
@@ -507,7 +533,7 @@ static inline void tick_nohz_switch_to_nohz(void) { }
507 */ 533 */
508#ifdef CONFIG_HIGH_RES_TIMERS 534#ifdef CONFIG_HIGH_RES_TIMERS
509/* 535/*
510 * We rearm the timer until we get disabled by the idle code 536 * We rearm the timer until we get disabled by the idle code.
511 * Called with interrupts disabled and timer->base->cpu_base->lock held. 537 * Called with interrupts disabled and timer->base->cpu_base->lock held.
512 */ 538 */
513static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) 539static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
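tick_nohz_start_idle()/tick_nohz_stop_idle() above account per-CPU idle sleep time, and get_cpu_idle_time_us() exports it in microseconds together with the time of the last update. A hedged sketch of a sampling consumer (a governor-style user); the bookkeeping names are assumptions:

static u64 example_idle_delta_us(int cpu, u64 *prev_idle)
{
        u64 stamp, idle, delta;

        idle = get_cpu_idle_time_us(cpu, &stamp);
        delta = idle - *prev_idle;      /* idle time accrued since last sample */
        *prev_idle = idle;
        return delta;
}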
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index ab46ae8c062b..092a2366b5a9 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -82,13 +82,12 @@ static inline s64 __get_nsec_offset(void)
82} 82}
83 83
84/** 84/**
85 * __get_realtime_clock_ts - Returns the time of day in a timespec 85 * getnstimeofday - Returns the time of day in a timespec
86 * @ts: pointer to the timespec to be set 86 * @ts: pointer to the timespec to be set
87 * 87 *
88 * Returns the time of day in a timespec. Used by 88 * Returns the time of day in a timespec.
89 * do_gettimeofday() and get_realtime_clock_ts().
90 */ 89 */
91static inline void __get_realtime_clock_ts(struct timespec *ts) 90void getnstimeofday(struct timespec *ts)
92{ 91{
93 unsigned long seq; 92 unsigned long seq;
94 s64 nsecs; 93 s64 nsecs;
@@ -104,30 +103,19 @@ static inline void __get_realtime_clock_ts(struct timespec *ts)
104 timespec_add_ns(ts, nsecs); 103 timespec_add_ns(ts, nsecs);
105} 104}
106 105
107/**
108 * getnstimeofday - Returns the time of day in a timespec
109 * @ts: pointer to the timespec to be set
110 *
111 * Returns the time of day in a timespec.
112 */
113void getnstimeofday(struct timespec *ts)
114{
115 __get_realtime_clock_ts(ts);
116}
117
118EXPORT_SYMBOL(getnstimeofday); 106EXPORT_SYMBOL(getnstimeofday);
119 107
120/** 108/**
121 * do_gettimeofday - Returns the time of day in a timeval 109 * do_gettimeofday - Returns the time of day in a timeval
122 * @tv: pointer to the timeval to be set 110 * @tv: pointer to the timeval to be set
123 * 111 *
124 * NOTE: Users should be converted to using get_realtime_clock_ts() 112 * NOTE: Users should be converted to using getnstimeofday()
125 */ 113 */
126void do_gettimeofday(struct timeval *tv) 114void do_gettimeofday(struct timeval *tv)
127{ 115{
128 struct timespec now; 116 struct timespec now;
129 117
130 __get_realtime_clock_ts(&now); 118 getnstimeofday(&now);
131 tv->tv_sec = now.tv_sec; 119 tv->tv_sec = now.tv_sec;
132 tv->tv_usec = now.tv_nsec/1000; 120 tv->tv_usec = now.tv_nsec/1000;
133} 121}
@@ -198,7 +186,8 @@ static void change_clocksource(void)
198 186
199 clock->error = 0; 187 clock->error = 0;
200 clock->xtime_nsec = 0; 188 clock->xtime_nsec = 0;
201 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); 189 clocksource_calculate_interval(clock,
190 (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT));
202 191
203 tick_clock_notify(); 192 tick_clock_notify();
204 193
@@ -255,7 +244,8 @@ void __init timekeeping_init(void)
255 ntp_clear(); 244 ntp_clear();
256 245
257 clock = clocksource_get_next(); 246 clock = clocksource_get_next();
258 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); 247 clocksource_calculate_interval(clock,
248 (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT));
259 clock->cycle_last = clocksource_read(clock); 249 clock->cycle_last = clocksource_read(clock);
260 250
261 xtime.tv_sec = sec; 251 xtime.tv_sec = sec;
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index c36bb7ed0301..417da8c5bc72 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -26,7 +26,7 @@
26 * the pid and cmdline from the owner process if applicable. 26 * the pid and cmdline from the owner process if applicable.
27 * 27 *
28 * Start/stop data collection: 28 * Start/stop data collection:
29 * # echo 1[0] >/proc/timer_stats 29 * # echo [1|0] >/proc/timer_stats
30 * 30 *
31 * Display the information collected so far: 31 * Display the information collected so far:
32 * # cat /proc/timer_stats 32 * # cat /proc/timer_stats
diff --git a/kernel/timer.c b/kernel/timer.c
index f739dfb539ce..23f7ead78fae 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -58,59 +58,57 @@ EXPORT_SYMBOL(jiffies_64);
58#define TVN_MASK (TVN_SIZE - 1) 58#define TVN_MASK (TVN_SIZE - 1)
59#define TVR_MASK (TVR_SIZE - 1) 59#define TVR_MASK (TVR_SIZE - 1)
60 60
61typedef struct tvec_s { 61struct tvec {
62 struct list_head vec[TVN_SIZE]; 62 struct list_head vec[TVN_SIZE];
63} tvec_t; 63};
64 64
65typedef struct tvec_root_s { 65struct tvec_root {
66 struct list_head vec[TVR_SIZE]; 66 struct list_head vec[TVR_SIZE];
67} tvec_root_t; 67};
68 68
69struct tvec_t_base_s { 69struct tvec_base {
70 spinlock_t lock; 70 spinlock_t lock;
71 struct timer_list *running_timer; 71 struct timer_list *running_timer;
72 unsigned long timer_jiffies; 72 unsigned long timer_jiffies;
73 tvec_root_t tv1; 73 struct tvec_root tv1;
74 tvec_t tv2; 74 struct tvec tv2;
75 tvec_t tv3; 75 struct tvec tv3;
76 tvec_t tv4; 76 struct tvec tv4;
77 tvec_t tv5; 77 struct tvec tv5;
78} ____cacheline_aligned; 78} ____cacheline_aligned;
79 79
80typedef struct tvec_t_base_s tvec_base_t; 80struct tvec_base boot_tvec_bases;
81
82tvec_base_t boot_tvec_bases;
83EXPORT_SYMBOL(boot_tvec_bases); 81EXPORT_SYMBOL(boot_tvec_bases);
84static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; 82static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
85 83
86/* 84/*
87 * Note that all tvec_bases is 2 byte aligned and lower bit of 85 * Note that all tvec_bases are 2 byte aligned and lower bit of
88 * base in timer_list is guaranteed to be zero. Use the LSB for 86 * base in timer_list is guaranteed to be zero. Use the LSB for
89 * the new flag to indicate whether the timer is deferrable 87 * the new flag to indicate whether the timer is deferrable
90 */ 88 */
91#define TBASE_DEFERRABLE_FLAG (0x1) 89#define TBASE_DEFERRABLE_FLAG (0x1)
92 90
93/* Functions below help us manage 'deferrable' flag */ 91/* Functions below help us manage 'deferrable' flag */
94static inline unsigned int tbase_get_deferrable(tvec_base_t *base) 92static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
95{ 93{
96 return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); 94 return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG);
97} 95}
98 96
99static inline tvec_base_t *tbase_get_base(tvec_base_t *base) 97static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
100{ 98{
101 return ((tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); 99 return ((struct tvec_base *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG));
102} 100}
103 101
104static inline void timer_set_deferrable(struct timer_list *timer) 102static inline void timer_set_deferrable(struct timer_list *timer)
105{ 103{
106 timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | 104 timer->base = ((struct tvec_base *)((unsigned long)(timer->base) |
107 TBASE_DEFERRABLE_FLAG)); 105 TBASE_DEFERRABLE_FLAG));
108} 106}
109 107
110static inline void 108static inline void
111timer_set_base(struct timer_list *timer, tvec_base_t *new_base) 109timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
112{ 110{
113 timer->base = (tvec_base_t *)((unsigned long)(new_base) | 111 timer->base = (struct tvec_base *)((unsigned long)(new_base) |
114 tbase_get_deferrable(timer->base)); 112 tbase_get_deferrable(timer->base));
115} 113}
116 114
@@ -246,7 +244,7 @@ unsigned long round_jiffies_relative(unsigned long j)
246EXPORT_SYMBOL_GPL(round_jiffies_relative); 244EXPORT_SYMBOL_GPL(round_jiffies_relative);
247 245
248 246
249static inline void set_running_timer(tvec_base_t *base, 247static inline void set_running_timer(struct tvec_base *base,
250 struct timer_list *timer) 248 struct timer_list *timer)
251{ 249{
252#ifdef CONFIG_SMP 250#ifdef CONFIG_SMP
@@ -254,7 +252,7 @@ static inline void set_running_timer(tvec_base_t *base,
254#endif 252#endif
255} 253}
256 254
257static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) 255static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
258{ 256{
259 unsigned long expires = timer->expires; 257 unsigned long expires = timer->expires;
260 unsigned long idx = expires - base->timer_jiffies; 258 unsigned long idx = expires - base->timer_jiffies;
@@ -371,14 +369,14 @@ static inline void detach_timer(struct timer_list *timer,
371 * possible to set timer->base = NULL and drop the lock: the timer remains 369 * possible to set timer->base = NULL and drop the lock: the timer remains
372 * locked. 370 * locked.
373 */ 371 */
374static tvec_base_t *lock_timer_base(struct timer_list *timer, 372static struct tvec_base *lock_timer_base(struct timer_list *timer,
375 unsigned long *flags) 373 unsigned long *flags)
376 __acquires(timer->base->lock) 374 __acquires(timer->base->lock)
377{ 375{
378 tvec_base_t *base; 376 struct tvec_base *base;
379 377
380 for (;;) { 378 for (;;) {
381 tvec_base_t *prelock_base = timer->base; 379 struct tvec_base *prelock_base = timer->base;
382 base = tbase_get_base(prelock_base); 380 base = tbase_get_base(prelock_base);
383 if (likely(base != NULL)) { 381 if (likely(base != NULL)) {
384 spin_lock_irqsave(&base->lock, *flags); 382 spin_lock_irqsave(&base->lock, *flags);
@@ -393,7 +391,7 @@ static tvec_base_t *lock_timer_base(struct timer_list *timer,
393 391
394int __mod_timer(struct timer_list *timer, unsigned long expires) 392int __mod_timer(struct timer_list *timer, unsigned long expires)
395{ 393{
396 tvec_base_t *base, *new_base; 394 struct tvec_base *base, *new_base;
397 unsigned long flags; 395 unsigned long flags;
398 int ret = 0; 396 int ret = 0;
399 397
@@ -445,7 +443,7 @@ EXPORT_SYMBOL(__mod_timer);
445 */ 443 */
446void add_timer_on(struct timer_list *timer, int cpu) 444void add_timer_on(struct timer_list *timer, int cpu)
447{ 445{
448 tvec_base_t *base = per_cpu(tvec_bases, cpu); 446 struct tvec_base *base = per_cpu(tvec_bases, cpu);
449 unsigned long flags; 447 unsigned long flags;
450 448
451 timer_stats_timer_set_start_info(timer); 449 timer_stats_timer_set_start_info(timer);
@@ -508,7 +506,7 @@ EXPORT_SYMBOL(mod_timer);
508 */ 506 */
509int del_timer(struct timer_list *timer) 507int del_timer(struct timer_list *timer)
510{ 508{
511 tvec_base_t *base; 509 struct tvec_base *base;
512 unsigned long flags; 510 unsigned long flags;
513 int ret = 0; 511 int ret = 0;
514 512
@@ -539,7 +537,7 @@ EXPORT_SYMBOL(del_timer);
539 */ 537 */
540int try_to_del_timer_sync(struct timer_list *timer) 538int try_to_del_timer_sync(struct timer_list *timer)
541{ 539{
542 tvec_base_t *base; 540 struct tvec_base *base;
543 unsigned long flags; 541 unsigned long flags;
544 int ret = -1; 542 int ret = -1;
545 543
@@ -591,7 +589,7 @@ int del_timer_sync(struct timer_list *timer)
591EXPORT_SYMBOL(del_timer_sync); 589EXPORT_SYMBOL(del_timer_sync);
592#endif 590#endif
593 591
594static int cascade(tvec_base_t *base, tvec_t *tv, int index) 592static int cascade(struct tvec_base *base, struct tvec *tv, int index)
595{ 593{
596 /* cascade all the timers from tv up one level */ 594 /* cascade all the timers from tv up one level */
597 struct timer_list *timer, *tmp; 595 struct timer_list *timer, *tmp;
@@ -620,7 +618,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
620 * This function cascades all vectors and executes all expired timer 618 * This function cascades all vectors and executes all expired timer
621 * vectors. 619 * vectors.
622 */ 620 */
623static inline void __run_timers(tvec_base_t *base) 621static inline void __run_timers(struct tvec_base *base)
624{ 622{
625 struct timer_list *timer; 623 struct timer_list *timer;
626 624
@@ -657,7 +655,7 @@ static inline void __run_timers(tvec_base_t *base)
657 int preempt_count = preempt_count(); 655 int preempt_count = preempt_count();
658 fn(data); 656 fn(data);
659 if (preempt_count != preempt_count()) { 657 if (preempt_count != preempt_count()) {
660 printk(KERN_WARNING "huh, entered %p " 658 printk(KERN_ERR "huh, entered %p "
661 "with preempt_count %08x, exited" 659 "with preempt_count %08x, exited"
662 " with %08x?\n", 660 " with %08x?\n",
663 fn, preempt_count, 661 fn, preempt_count,
@@ -678,13 +676,13 @@ static inline void __run_timers(tvec_base_t *base)
678 * is used on S/390 to stop all activity when a cpus is idle. 676 * is used on S/390 to stop all activity when a cpus is idle.
679 * This functions needs to be called disabled. 677 * This functions needs to be called disabled.
680 */ 678 */
681static unsigned long __next_timer_interrupt(tvec_base_t *base) 679static unsigned long __next_timer_interrupt(struct tvec_base *base)
682{ 680{
683 unsigned long timer_jiffies = base->timer_jiffies; 681 unsigned long timer_jiffies = base->timer_jiffies;
684 unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; 682 unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA;
685 int index, slot, array, found = 0; 683 int index, slot, array, found = 0;
686 struct timer_list *nte; 684 struct timer_list *nte;
687 tvec_t *varray[4]; 685 struct tvec *varray[4];
688 686
689 /* Look for timer events in tv1. */ 687 /* Look for timer events in tv1. */
690 index = slot = timer_jiffies & TVR_MASK; 688 index = slot = timer_jiffies & TVR_MASK;
@@ -716,7 +714,7 @@ cascade:
716 varray[3] = &base->tv5; 714 varray[3] = &base->tv5;
717 715
718 for (array = 0; array < 4; array++) { 716 for (array = 0; array < 4; array++) {
719 tvec_t *varp = varray[array]; 717 struct tvec *varp = varray[array];
720 718
721 index = slot = timer_jiffies & TVN_MASK; 719 index = slot = timer_jiffies & TVN_MASK;
722 do { 720 do {
@@ -795,7 +793,7 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now,
795 */ 793 */
796unsigned long get_next_timer_interrupt(unsigned long now) 794unsigned long get_next_timer_interrupt(unsigned long now)
797{ 795{
798 tvec_base_t *base = __get_cpu_var(tvec_bases); 796 struct tvec_base *base = __get_cpu_var(tvec_bases);
799 unsigned long expires; 797 unsigned long expires;
800 798
801 spin_lock(&base->lock); 799 spin_lock(&base->lock);
@@ -894,7 +892,7 @@ static inline void calc_load(unsigned long ticks)
894 */ 892 */
895static void run_timer_softirq(struct softirq_action *h) 893static void run_timer_softirq(struct softirq_action *h)
896{ 894{
897 tvec_base_t *base = __get_cpu_var(tvec_bases); 895 struct tvec_base *base = __get_cpu_var(tvec_bases);
898 896
899 hrtimer_run_pending(); 897 hrtimer_run_pending();
900 898
@@ -1223,7 +1221,7 @@ static struct lock_class_key base_lock_keys[NR_CPUS];
1223static int __cpuinit init_timers_cpu(int cpu) 1221static int __cpuinit init_timers_cpu(int cpu)
1224{ 1222{
1225 int j; 1223 int j;
1226 tvec_base_t *base; 1224 struct tvec_base *base;
1227 static char __cpuinitdata tvec_base_done[NR_CPUS]; 1225 static char __cpuinitdata tvec_base_done[NR_CPUS];
1228 1226
1229 if (!tvec_base_done[cpu]) { 1227 if (!tvec_base_done[cpu]) {
@@ -1278,7 +1276,7 @@ static int __cpuinit init_timers_cpu(int cpu)
1278} 1276}
1279 1277
1280#ifdef CONFIG_HOTPLUG_CPU 1278#ifdef CONFIG_HOTPLUG_CPU
1281static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head) 1279static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head)
1282{ 1280{
1283 struct timer_list *timer; 1281 struct timer_list *timer;
1284 1282
@@ -1292,8 +1290,8 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
1292 1290
1293static void __cpuinit migrate_timers(int cpu) 1291static void __cpuinit migrate_timers(int cpu)
1294{ 1292{
1295 tvec_base_t *old_base; 1293 struct tvec_base *old_base;
1296 tvec_base_t *new_base; 1294 struct tvec_base *new_base;
1297 int i; 1295 int i;
1298 1296
1299 BUG_ON(cpu_online(cpu)); 1297 BUG_ON(cpu_online(cpu));
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index c4ecb2994ba3..89f4035b526c 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -494,6 +494,30 @@ config RCU_TORTURE_TEST
494 Say M if you want the RCU torture tests to build as a module. 494 Say M if you want the RCU torture tests to build as a module.
495 Say N if you are unsure. 495 Say N if you are unsure.
496 496
497config KPROBES_SANITY_TEST
498 bool "Kprobes sanity tests"
499 depends on DEBUG_KERNEL
500 depends on KPROBES
501 default n
502 help
503 This option provides for testing basic kprobes functionality on
504 boot. A sample kprobe, jprobe and kretprobe are inserted and
505 verified for functionality.
506
507 Say N if you are unsure.
508
509config BACKTRACE_SELF_TEST
510 tristate "Self test for the backtrace code"
511 depends on DEBUG_KERNEL
512 default n
513 help
514 This option provides a kernel module that can be used to test
515 the kernel stack backtrace code. This option is not useful
516 for distributions or general kernels, but only for kernel
517 developers working on architecture code.
518
519 Say N if you are unsure.
520
497config LKDTM 521config LKDTM
498 tristate "Linux Kernel Dump Test Tool Module" 522 tristate "Linux Kernel Dump Test Tool Module"
499 depends on DEBUG_KERNEL 523 depends on DEBUG_KERNEL
@@ -562,5 +586,33 @@ config LATENCYTOP
562 Enable this option if you want to use the LatencyTOP tool 586 Enable this option if you want to use the LatencyTOP tool
563 to find out which userspace is blocking on what kernel operations. 587 to find out which userspace is blocking on what kernel operations.
564 588
589config PROVIDE_OHCI1394_DMA_INIT
590 bool "Provide code for enabling DMA over FireWire early on boot"
591 depends on PCI && X86
592 help
593 If you want to debug problems which hang or crash the kernel early
594 on boot and the crashing machine has a FireWire port, you can use
595 this feature to remotely access the memory of the crashed machine
596 over FireWire. This employs remote DMA as part of the OHCI1394
597 specification which is now the standard for FireWire controllers.
598
599 With remote DMA, you can monitor the printk buffer remotely using
600 firescope and access all memory below 4GB using fireproxy from gdb.
601 Even controlling a kernel debugger is possible using remote DMA.
602
603 Usage:
604
605 If ohci1394_dma=early is used as boot parameter, it will initialize
606 all OHCI1394 controllers which are found in the PCI config space.
607
608 As all changes to the FireWire bus such as enabling and disabling
609 devices cause a bus reset and thereby disable remote DMA for all
610 devices, be sure to have the cable plugged and FireWire enabled on
611 the debugging host before booting the debug target for debugging.
612
613 This code (~1k) is freed after boot. By then, the firewire stack
614 in charge of the OHCI-1394 controllers should be used instead.
615
616 See Documentation/debugging-via-ohci1394.txt for more information.
565 617
566source "samples/Kconfig" 618source "samples/Kconfig"
diff --git a/lib/rwsem.c b/lib/rwsem.c
index 7d02700a4b0e..3e3365e5665e 100644
--- a/lib/rwsem.c
+++ b/lib/rwsem.c
@@ -187,7 +187,7 @@ rwsem_down_failed_common(struct rw_semaphore *sem,
187/* 187/*
188 * wait for the read lock to be granted 188 * wait for the read lock to be granted
189 */ 189 */
190struct rw_semaphore fastcall __sched * 190asmregparm struct rw_semaphore __sched *
191rwsem_down_read_failed(struct rw_semaphore *sem) 191rwsem_down_read_failed(struct rw_semaphore *sem)
192{ 192{
193 struct rwsem_waiter waiter; 193 struct rwsem_waiter waiter;
@@ -201,7 +201,7 @@ rwsem_down_read_failed(struct rw_semaphore *sem)
201/* 201/*
202 * wait for the write lock to be granted 202 * wait for the write lock to be granted
203 */ 203 */
204struct rw_semaphore fastcall __sched * 204asmregparm struct rw_semaphore __sched *
205rwsem_down_write_failed(struct rw_semaphore *sem) 205rwsem_down_write_failed(struct rw_semaphore *sem)
206{ 206{
207 struct rwsem_waiter waiter; 207 struct rwsem_waiter waiter;
@@ -216,7 +216,7 @@ rwsem_down_write_failed(struct rw_semaphore *sem)
216 * handle waking up a waiter on the semaphore 216 * handle waking up a waiter on the semaphore
217 * - up_read/up_write has decremented the active part of count if we come here 217 * - up_read/up_write has decremented the active part of count if we come here
218 */ 218 */
219struct rw_semaphore fastcall *rwsem_wake(struct rw_semaphore *sem) 219asmregparm struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
220{ 220{
221 unsigned long flags; 221 unsigned long flags;
222 222
@@ -236,7 +236,7 @@ struct rw_semaphore fastcall *rwsem_wake(struct rw_semaphore *sem)
236 * - caller incremented waiting part of count and discovered it still negative 236 * - caller incremented waiting part of count and discovered it still negative
237 * - just wake up any readers at the front of the queue 237 * - just wake up any readers at the front of the queue
238 */ 238 */
239struct rw_semaphore fastcall *rwsem_downgrade_wake(struct rw_semaphore *sem) 239asmregparm struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
240{ 240{
241 unsigned long flags; 241 unsigned long flags;
242 242
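
The hunks above swap the fastcall annotation for asmregparm on the rwsem slow-path entry points. Assuming asmregparm amounts to a regparm(3) function attribute on 32-bit x86 builds (and to nothing on other targets), a minimal userspace sketch of what such an annotation does:

/* Hypothetical stand-in for the kernel's asmregparm: pass the first three
 * arguments in registers on 32-bit x86, expand to nothing elsewhere so the
 * file still builds. */
#include <stdio.h>

#ifdef __i386__
#define asmregparm __attribute__((regparm(3)))
#else
#define asmregparm
#endif

static asmregparm long add3(long a, long b, long c)
{
        return a + b + c;
}

int main(void)
{
        printf("%ld\n", add3(1, 2, 3));
        return 0;
}

The apparent motivation is that these slow paths are reached from assembly stubs that expect a fixed register-based calling convention, so the C prototypes carry an explicit annotation rather than the old fastcall marker.
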
diff --git a/mm/memory.c b/mm/memory.c
index 4b0144b24c12..d902d0e25edc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -513,8 +513,7 @@ again:
513 if (progress >= 32) { 513 if (progress >= 32) {
514 progress = 0; 514 progress = 0;
515 if (need_resched() || 515 if (need_resched() ||
516 need_lockbreak(src_ptl) || 516 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
517 need_lockbreak(dst_ptl))
518 break; 517 break;
519 } 518 }
520 if (pte_none(*src_pte)) { 519 if (pte_none(*src_pte)) {
@@ -853,7 +852,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
853 tlb_finish_mmu(*tlbp, tlb_start, start); 852 tlb_finish_mmu(*tlbp, tlb_start, start);
854 853
855 if (need_resched() || 854 if (need_resched() ||
856 (i_mmap_lock && need_lockbreak(i_mmap_lock))) { 855 (i_mmap_lock && spin_needbreak(i_mmap_lock))) {
857 if (i_mmap_lock) { 856 if (i_mmap_lock) {
858 *tlbp = NULL; 857 *tlbp = NULL;
859 goto out; 858 goto out;
@@ -1768,8 +1767,7 @@ again:
1768 1767
1769 restart_addr = zap_page_range(vma, start_addr, 1768 restart_addr = zap_page_range(vma, start_addr,
1770 end_addr - start_addr, details); 1769 end_addr - start_addr, details);
1771 need_break = need_resched() || 1770 need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
1772 need_lockbreak(details->i_mmap_lock);
1773 1771
1774 if (restart_addr >= end_addr) { 1772 if (restart_addr >= end_addr) {
1775 /* We have now completed this vma: mark it so */ 1773 /* We have now completed this vma: mark it so */
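
The hunks above replace need_lockbreak() with spin_needbreak() on the page-table and i_mmap locks. A minimal userspace sketch of the underlying lock-break pattern, using a pthread spinlock and a simple iteration-count predicate where the kernel would consult need_resched()/spin_needbreak():

/* Sketch of the periodic lock-break loop touched above: hold the lock,
 * do batches of work, and briefly drop/retake the lock whenever the
 * break predicate fires (every 32 items here, mirroring the batching in
 * copy_pte_range()). Build with -lpthread. */
#include <pthread.h>
#include <stdio.h>

static pthread_spinlock_t lock;

static int should_break(unsigned long progress)
{
        return progress >= 32;  /* stand-in for need_resched()/spin_needbreak() */
}

int main(void)
{
        unsigned long i, progress = 0, breaks = 0;

        pthread_spin_init(&lock, PTHREAD_PROCESS_PRIVATE);
        pthread_spin_lock(&lock);
        for (i = 0; i < 100; i++) {
                progress++;
                if (should_break(progress)) {
                        /* drop and retake the lock so other contenders
                         * could get in, in a real multi-threaded setting */
                        pthread_spin_unlock(&lock);
                        breaks++;
                        progress = 0;
                        pthread_spin_lock(&lock);
                }
        }
        pthread_spin_unlock(&lock);
        pthread_spin_destroy(&lock);
        printf("lock dropped %lu times\n", breaks);
        return 0;
}
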
@@ -2756,3 +2754,34 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
2756 2754
2757 return buf - old_buf; 2755 return buf - old_buf;
2758} 2756}
2757
2758/*
2759 * Print the name of a VMA.
2760 */
2761void print_vma_addr(char *prefix, unsigned long ip)
2762{
2763 struct mm_struct *mm = current->mm;
2764 struct vm_area_struct *vma;
2765
2766 down_read(&mm->mmap_sem);
2767 vma = find_vma(mm, ip);
2768 if (vma && vma->vm_file) {
2769 struct file *f = vma->vm_file;
2770 char *buf = (char *)__get_free_page(GFP_KERNEL);
2771 if (buf) {
2772 char *p, *s;
2773
2774 p = d_path(f->f_dentry, f->f_vfsmnt, buf, PAGE_SIZE);
2775 if (IS_ERR(p))
2776 p = "?";
2777 s = strrchr(p, '/');
2778 if (s)
2779 p = s+1;
2780 printk("%s%s[%lx+%lx]", prefix, p,
2781 vma->vm_start,
2782 vma->vm_end - vma->vm_start);
2783 free_page((unsigned long)buf);
2784 }
2785 }
2786 up_read(&current->mm->mmap_sem);
2787}
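
The new print_vma_addr() above looks up the VMA containing an address and prints the backing file's basename together with the mapping's start and length. A small userspace sketch of just the formatting step, with made-up paths and addresses:

/* Mimics the output format of print_vma_addr(): "prefix name[start+size]".
 * The path and addresses below are invented for illustration. */
#include <stdio.h>
#include <string.h>

static void print_vma_like(const char *prefix, const char *path,
                           unsigned long start, unsigned long end)
{
        const char *name = strrchr(path, '/');

        name = name ? name + 1 : path;
        printf("%s%s[%lx+%lx]\n", prefix, name, start, end - start);
}

int main(void)
{
        print_vma_like(" in ", "/lib/libc-2.7.so", 0xb7e10000UL, 0xb7f50000UL);
        return 0;
}
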
diff --git a/mm/mmap.c b/mm/mmap.c
index bfa389fc6ded..d2b6d44962b7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -251,7 +251,8 @@ asmlinkage unsigned long sys_brk(unsigned long brk)
251 * not page aligned -Ram Gupta 251 * not page aligned -Ram Gupta
252 */ 252 */
253 rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; 253 rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
254 if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim) 254 if (rlim < RLIM_INFINITY && (brk - mm->start_brk) +
255 (mm->end_data - mm->start_data) > rlim)
255 goto out; 256 goto out;
256 257
257 newbrk = PAGE_ALIGN(brk); 258 newbrk = PAGE_ALIGN(brk);
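
The corrected check above charges the data segment plus the heap against RLIMIT_DATA, rather than measuring from start_data to the requested brk. A tiny worked example with invented addresses and a 4 MiB limit:

/* Worked example of the new sys_brk() limit check; all numbers invented. */
#include <stdio.h>

int main(void)
{
        unsigned long start_data = 0x08049000UL, end_data = 0x0804c000UL;
        unsigned long start_brk  = 0x0804d000UL, brk      = 0x08150000UL;
        unsigned long rlim = 4UL << 20;                 /* 4 MiB RLIMIT_DATA */
        unsigned long used = (brk - start_brk) + (end_data - start_data);

        printf("heap+data = %lu bytes, limit = %lu -> %s\n",
               used, rlim, used > rlim ? "reject" : "allow");
        return 0;
}
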
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index d4dc4eb48d95..a2241060113b 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -348,6 +348,7 @@ static int cipso_v4_cache_check(const unsigned char *key,
348 atomic_inc(&entry->lsm_data->refcount); 348 atomic_inc(&entry->lsm_data->refcount);
349 secattr->cache = entry->lsm_data; 349 secattr->cache = entry->lsm_data;
350 secattr->flags |= NETLBL_SECATTR_CACHE; 350 secattr->flags |= NETLBL_SECATTR_CACHE;
351 secattr->type = NETLBL_NLTYPE_CIPSOV4;
351 if (prev_entry == NULL) { 352 if (prev_entry == NULL) {
352 spin_unlock_bh(&cipso_v4_cache[bkt].lock); 353 spin_unlock_bh(&cipso_v4_cache[bkt].lock);
353 return 0; 354 return 0;
@@ -865,7 +866,7 @@ static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def,
865 } 866 }
866 867
867 for (;;) { 868 for (;;) {
868 host_spot = netlbl_secattr_catmap_walk(secattr->mls_cat, 869 host_spot = netlbl_secattr_catmap_walk(secattr->attr.mls.cat,
869 host_spot + 1); 870 host_spot + 1);
870 if (host_spot < 0) 871 if (host_spot < 0)
871 break; 872 break;
@@ -948,7 +949,7 @@ static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def,
948 return -EPERM; 949 return -EPERM;
949 break; 950 break;
950 } 951 }
951 ret_val = netlbl_secattr_catmap_setbit(secattr->mls_cat, 952 ret_val = netlbl_secattr_catmap_setbit(secattr->attr.mls.cat,
952 host_spot, 953 host_spot,
953 GFP_ATOMIC); 954 GFP_ATOMIC);
954 if (ret_val != 0) 955 if (ret_val != 0)
@@ -1014,7 +1015,8 @@ static int cipso_v4_map_cat_enum_hton(const struct cipso_v4_doi *doi_def,
1014 u32 cat_iter = 0; 1015 u32 cat_iter = 0;
1015 1016
1016 for (;;) { 1017 for (;;) {
1017 cat = netlbl_secattr_catmap_walk(secattr->mls_cat, cat + 1); 1018 cat = netlbl_secattr_catmap_walk(secattr->attr.mls.cat,
1019 cat + 1);
1018 if (cat < 0) 1020 if (cat < 0)
1019 break; 1021 break;
1020 if ((cat_iter + 2) > net_cat_len) 1022 if ((cat_iter + 2) > net_cat_len)
@@ -1049,7 +1051,7 @@ static int cipso_v4_map_cat_enum_ntoh(const struct cipso_v4_doi *doi_def,
1049 u32 iter; 1051 u32 iter;
1050 1052
1051 for (iter = 0; iter < net_cat_len; iter += 2) { 1053 for (iter = 0; iter < net_cat_len; iter += 2) {
1052 ret_val = netlbl_secattr_catmap_setbit(secattr->mls_cat, 1054 ret_val = netlbl_secattr_catmap_setbit(secattr->attr.mls.cat,
1053 ntohs(get_unaligned((__be16 *)&net_cat[iter])), 1055 ntohs(get_unaligned((__be16 *)&net_cat[iter])),
1054 GFP_ATOMIC); 1056 GFP_ATOMIC);
1055 if (ret_val != 0) 1057 if (ret_val != 0)
@@ -1130,7 +1132,8 @@ static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def,
1130 return -ENOSPC; 1132 return -ENOSPC;
1131 1133
1132 for (;;) { 1134 for (;;) {
1133 iter = netlbl_secattr_catmap_walk(secattr->mls_cat, iter + 1); 1135 iter = netlbl_secattr_catmap_walk(secattr->attr.mls.cat,
1136 iter + 1);
1134 if (iter < 0) 1137 if (iter < 0)
1135 break; 1138 break;
1136 cat_size += (iter == 0 ? 0 : sizeof(u16)); 1139 cat_size += (iter == 0 ? 0 : sizeof(u16));
@@ -1138,7 +1141,8 @@ static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def,
1138 return -ENOSPC; 1141 return -ENOSPC;
1139 array[array_cnt++] = iter; 1142 array[array_cnt++] = iter;
1140 1143
1141 iter = netlbl_secattr_catmap_walk_rng(secattr->mls_cat, iter); 1144 iter = netlbl_secattr_catmap_walk_rng(secattr->attr.mls.cat,
1145 iter);
1142 if (iter < 0) 1146 if (iter < 0)
1143 return -EFAULT; 1147 return -EFAULT;
1144 cat_size += sizeof(u16); 1148 cat_size += sizeof(u16);
@@ -1191,7 +1195,7 @@ static int cipso_v4_map_cat_rng_ntoh(const struct cipso_v4_doi *doi_def,
1191 else 1195 else
1192 cat_low = 0; 1196 cat_low = 0;
1193 1197
1194 ret_val = netlbl_secattr_catmap_setrng(secattr->mls_cat, 1198 ret_val = netlbl_secattr_catmap_setrng(secattr->attr.mls.cat,
1195 cat_low, 1199 cat_low,
1196 cat_high, 1200 cat_high,
1197 GFP_ATOMIC); 1201 GFP_ATOMIC);
@@ -1251,7 +1255,9 @@ static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def,
1251 if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0) 1255 if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0)
1252 return -EPERM; 1256 return -EPERM;
1253 1257
1254 ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->mls_lvl, &level); 1258 ret_val = cipso_v4_map_lvl_hton(doi_def,
1259 secattr->attr.mls.lvl,
1260 &level);
1255 if (ret_val != 0) 1261 if (ret_val != 0)
1256 return ret_val; 1262 return ret_val;
1257 1263
@@ -1303,12 +1309,13 @@ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def,
1303 ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); 1309 ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level);
1304 if (ret_val != 0) 1310 if (ret_val != 0)
1305 return ret_val; 1311 return ret_val;
1306 secattr->mls_lvl = level; 1312 secattr->attr.mls.lvl = level;
1307 secattr->flags |= NETLBL_SECATTR_MLS_LVL; 1313 secattr->flags |= NETLBL_SECATTR_MLS_LVL;
1308 1314
1309 if (tag_len > 4) { 1315 if (tag_len > 4) {
1310 secattr->mls_cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC); 1316 secattr->attr.mls.cat =
1311 if (secattr->mls_cat == NULL) 1317 netlbl_secattr_catmap_alloc(GFP_ATOMIC);
1318 if (secattr->attr.mls.cat == NULL)
1312 return -ENOMEM; 1319 return -ENOMEM;
1313 1320
1314 ret_val = cipso_v4_map_cat_rbm_ntoh(doi_def, 1321 ret_val = cipso_v4_map_cat_rbm_ntoh(doi_def,
@@ -1316,7 +1323,7 @@ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def,
1316 tag_len - 4, 1323 tag_len - 4,
1317 secattr); 1324 secattr);
1318 if (ret_val != 0) { 1325 if (ret_val != 0) {
1319 netlbl_secattr_catmap_free(secattr->mls_cat); 1326 netlbl_secattr_catmap_free(secattr->attr.mls.cat);
1320 return ret_val; 1327 return ret_val;
1321 } 1328 }
1322 1329
@@ -1350,7 +1357,9 @@ static int cipso_v4_gentag_enum(const struct cipso_v4_doi *doi_def,
1350 if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL)) 1357 if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL))
1351 return -EPERM; 1358 return -EPERM;
1352 1359
1353 ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->mls_lvl, &level); 1360 ret_val = cipso_v4_map_lvl_hton(doi_def,
1361 secattr->attr.mls.lvl,
1362 &level);
1354 if (ret_val != 0) 1363 if (ret_val != 0)
1355 return ret_val; 1364 return ret_val;
1356 1365
@@ -1396,12 +1405,13 @@ static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def,
1396 ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); 1405 ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level);
1397 if (ret_val != 0) 1406 if (ret_val != 0)
1398 return ret_val; 1407 return ret_val;
1399 secattr->mls_lvl = level; 1408 secattr->attr.mls.lvl = level;
1400 secattr->flags |= NETLBL_SECATTR_MLS_LVL; 1409 secattr->flags |= NETLBL_SECATTR_MLS_LVL;
1401 1410
1402 if (tag_len > 4) { 1411 if (tag_len > 4) {
1403 secattr->mls_cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC); 1412 secattr->attr.mls.cat =
1404 if (secattr->mls_cat == NULL) 1413 netlbl_secattr_catmap_alloc(GFP_ATOMIC);
1414 if (secattr->attr.mls.cat == NULL)
1405 return -ENOMEM; 1415 return -ENOMEM;
1406 1416
1407 ret_val = cipso_v4_map_cat_enum_ntoh(doi_def, 1417 ret_val = cipso_v4_map_cat_enum_ntoh(doi_def,
@@ -1409,7 +1419,7 @@ static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def,
1409 tag_len - 4, 1419 tag_len - 4,
1410 secattr); 1420 secattr);
1411 if (ret_val != 0) { 1421 if (ret_val != 0) {
1412 netlbl_secattr_catmap_free(secattr->mls_cat); 1422 netlbl_secattr_catmap_free(secattr->attr.mls.cat);
1413 return ret_val; 1423 return ret_val;
1414 } 1424 }
1415 1425
@@ -1443,7 +1453,9 @@ static int cipso_v4_gentag_rng(const struct cipso_v4_doi *doi_def,
1443 if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL)) 1453 if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL))
1444 return -EPERM; 1454 return -EPERM;
1445 1455
1446 ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->mls_lvl, &level); 1456 ret_val = cipso_v4_map_lvl_hton(doi_def,
1457 secattr->attr.mls.lvl,
1458 &level);
1447 if (ret_val != 0) 1459 if (ret_val != 0)
1448 return ret_val; 1460 return ret_val;
1449 1461
@@ -1488,12 +1500,13 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def,
1488 ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); 1500 ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level);
1489 if (ret_val != 0) 1501 if (ret_val != 0)
1490 return ret_val; 1502 return ret_val;
1491 secattr->mls_lvl = level; 1503 secattr->attr.mls.lvl = level;
1492 secattr->flags |= NETLBL_SECATTR_MLS_LVL; 1504 secattr->flags |= NETLBL_SECATTR_MLS_LVL;
1493 1505
1494 if (tag_len > 4) { 1506 if (tag_len > 4) {
1495 secattr->mls_cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC); 1507 secattr->attr.mls.cat =
1496 if (secattr->mls_cat == NULL) 1508 netlbl_secattr_catmap_alloc(GFP_ATOMIC);
1509 if (secattr->attr.mls.cat == NULL)
1497 return -ENOMEM; 1510 return -ENOMEM;
1498 1511
1499 ret_val = cipso_v4_map_cat_rng_ntoh(doi_def, 1512 ret_val = cipso_v4_map_cat_rng_ntoh(doi_def,
@@ -1501,7 +1514,7 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def,
1501 tag_len - 4, 1514 tag_len - 4,
1502 secattr); 1515 secattr);
1503 if (ret_val != 0) { 1516 if (ret_val != 0) {
1504 netlbl_secattr_catmap_free(secattr->mls_cat); 1517 netlbl_secattr_catmap_free(secattr->attr.mls.cat);
1505 return ret_val; 1518 return ret_val;
1506 } 1519 }
1507 1520
@@ -1850,6 +1863,8 @@ static int cipso_v4_getattr(const unsigned char *cipso,
1850 ret_val = cipso_v4_parsetag_rng(doi_def, &cipso[6], secattr); 1863 ret_val = cipso_v4_parsetag_rng(doi_def, &cipso[6], secattr);
1851 break; 1864 break;
1852 } 1865 }
1866 if (ret_val == 0)
1867 secattr->type = NETLBL_NLTYPE_CIPSOV4;
1853 1868
1854getattr_return: 1869getattr_return:
1855 rcu_read_unlock(); 1870 rcu_read_unlock();
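
The cipso_ipv4.c hunks above move the MLS level and category fields from flat secattr->mls_* members to secattr->attr.mls.* and stamp the new secattr->type tag once parsing succeeds. A simplified sketch of that kind of layout; the field and type names here are assumptions for illustration, not the verbatim kernel header:

/* Sketch of a tagged-union secattr layout: "type" says which member of
 * "attr" is valid, so MLS data and a plain secid can share storage. */
#include <stdint.h>
#include <stdio.h>

struct catmap;                          /* stand-in for the category bitmap */

struct lsm_secattr_sketch {
        uint32_t flags;
        uint32_t type;                  /* e.g. a CIPSOv4 vs. unlabeled tag */
        union {
                struct {
                        uint32_t lvl;
                        struct catmap *cat;
                } mls;
                uint32_t secid;         /* used by unlabeled traffic */
        } attr;
};

int main(void)
{
        struct lsm_secattr_sketch sa = { 0 };

        sa.attr.mls.lvl = 3;            /* was sa.mls_lvl before the rework */
        printf("lvl=%u\n", (unsigned)sa.attr.mls.lvl);
        return 0;
}
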
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 4fc0b023cfd7..6cae5475737e 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -99,7 +99,7 @@ config IP6_NF_MATCH_HL
99config IP6_NF_MATCH_IPV6HEADER 99config IP6_NF_MATCH_IPV6HEADER
100 tristate '"ipv6header" IPv6 Extension Headers Match' 100 tristate '"ipv6header" IPv6 Extension Headers Match'
101 depends on IP6_NF_IPTABLES 101 depends on IP6_NF_IPTABLES
102 depends on NETFILTER_ADVANCED 102 default m if NETFILTER_ADVANCED=n
103 help 103 help
104 This module allows one to match packets based upon 104 This module allows one to match packets based upon
105 the ipv6 extension headers. 105 the ipv6 extension headers.
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c
index b11b3ecbb39d..7708e2084ce2 100644
--- a/net/netfilter/xt_SECMARK.c
+++ b/net/netfilter/xt_SECMARK.c
@@ -72,12 +72,13 @@ static bool checkentry_selinux(struct xt_secmark_target_info *info)
72 return false; 72 return false;
73 } 73 }
74 74
75 err = selinux_relabel_packet_permission(sel->selsid); 75 err = selinux_secmark_relabel_packet_permission(sel->selsid);
76 if (err) { 76 if (err) {
77 printk(KERN_INFO PFX "unable to obtain relabeling permission\n"); 77 printk(KERN_INFO PFX "unable to obtain relabeling permission\n");
78 return false; 78 return false;
79 } 79 }
80 80
81 selinux_secmark_refcount_inc();
81 return true; 82 return true;
82} 83}
83 84
@@ -110,11 +111,20 @@ secmark_tg_check(const char *tablename, const void *entry,
110 return true; 111 return true;
111} 112}
112 113
114void secmark_tg_destroy(const struct xt_target *target, void *targinfo)
115{
116 switch (mode) {
117 case SECMARK_MODE_SEL:
118 selinux_secmark_refcount_dec();
119 }
120}
121
113static struct xt_target secmark_tg_reg[] __read_mostly = { 122static struct xt_target secmark_tg_reg[] __read_mostly = {
114 { 123 {
115 .name = "SECMARK", 124 .name = "SECMARK",
116 .family = AF_INET, 125 .family = AF_INET,
117 .checkentry = secmark_tg_check, 126 .checkentry = secmark_tg_check,
127 .destroy = secmark_tg_destroy,
118 .target = secmark_tg, 128 .target = secmark_tg,
119 .targetsize = sizeof(struct xt_secmark_target_info), 129 .targetsize = sizeof(struct xt_secmark_target_info),
120 .table = "mangle", 130 .table = "mangle",
@@ -124,6 +134,7 @@ static struct xt_target secmark_tg_reg[] __read_mostly = {
124 .name = "SECMARK", 134 .name = "SECMARK",
125 .family = AF_INET6, 135 .family = AF_INET6,
126 .checkentry = secmark_tg_check, 136 .checkentry = secmark_tg_check,
137 .destroy = secmark_tg_destroy,
127 .target = secmark_tg, 138 .target = secmark_tg,
128 .targetsize = sizeof(struct xt_secmark_target_info), 139 .targetsize = sizeof(struct xt_secmark_target_info),
129 .table = "mangle", 140 .table = "mangle",
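
The xt_SECMARK change above pairs the reference taken in the checkentry hook with a new destroy hook that releases it when the rule is removed. A generic userspace sketch of that check/destroy pairing, with a plain counter standing in for the SELinux secmark refcount:

/* Whatever reference a rule's check hook takes must be put back from a
 * matching destroy hook; otherwise removing rules leaks references. */
#include <stdio.h>

static int secmark_refcount;

static int rule_check(void)             /* ~ secmark_tg_check() */
{
        secmark_refcount++;             /* ~ selinux_secmark_refcount_inc() */
        return 1;
}

static void rule_destroy(void)          /* ~ secmark_tg_destroy() */
{
        secmark_refcount--;             /* ~ selinux_secmark_refcount_dec() */
}

int main(void)
{
        rule_check();
        printf("after check:   refcount=%d\n", secmark_refcount);
        rule_destroy();
        printf("after destroy: refcount=%d\n", secmark_refcount);
        return 0;
}
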
diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c
index ba0ca8d3f77d..becf91a952ae 100644
--- a/net/netlabel/netlabel_cipso_v4.c
+++ b/net/netlabel/netlabel_cipso_v4.c
@@ -38,6 +38,7 @@
38#include <net/genetlink.h> 38#include <net/genetlink.h>
39#include <net/netlabel.h> 39#include <net/netlabel.h>
40#include <net/cipso_ipv4.h> 40#include <net/cipso_ipv4.h>
41#include <asm/atomic.h>
41 42
42#include "netlabel_user.h" 43#include "netlabel_user.h"
43#include "netlabel_cipso_v4.h" 44#include "netlabel_cipso_v4.h"
@@ -421,7 +422,7 @@ static int netlbl_cipsov4_add(struct sk_buff *skb, struct genl_info *info)
421 break; 422 break;
422 } 423 }
423 if (ret_val == 0) 424 if (ret_val == 0)
424 netlbl_mgmt_protocount_inc(); 425 atomic_inc(&netlabel_mgmt_protocount);
425 426
426 audit_buf = netlbl_audit_start_common(AUDIT_MAC_CIPSOV4_ADD, 427 audit_buf = netlbl_audit_start_common(AUDIT_MAC_CIPSOV4_ADD,
427 &audit_info); 428 &audit_info);
@@ -698,7 +699,7 @@ static int netlbl_cipsov4_remove(struct sk_buff *skb, struct genl_info *info)
698 &audit_info, 699 &audit_info,
699 netlbl_cipsov4_doi_free); 700 netlbl_cipsov4_doi_free);
700 if (ret_val == 0) 701 if (ret_val == 0)
701 netlbl_mgmt_protocount_dec(); 702 atomic_dec(&netlabel_mgmt_protocount);
702 703
703 audit_buf = netlbl_audit_start_common(AUDIT_MAC_CIPSOV4_DEL, 704 audit_buf = netlbl_audit_start_common(AUDIT_MAC_CIPSOV4_DEL,
704 &audit_info); 705 &audit_info);
diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c
index b3675bd7db33..9a8ea0195c4f 100644
--- a/net/netlabel/netlabel_domainhash.c
+++ b/net/netlabel/netlabel_domainhash.c
@@ -54,9 +54,6 @@ struct netlbl_domhsh_tbl {
54 * hash table should be okay */ 54 * hash table should be okay */
55static DEFINE_SPINLOCK(netlbl_domhsh_lock); 55static DEFINE_SPINLOCK(netlbl_domhsh_lock);
56static struct netlbl_domhsh_tbl *netlbl_domhsh = NULL; 56static struct netlbl_domhsh_tbl *netlbl_domhsh = NULL;
57
58/* Default domain mapping */
59static DEFINE_SPINLOCK(netlbl_domhsh_def_lock);
60static struct netlbl_dom_map *netlbl_domhsh_def = NULL; 57static struct netlbl_dom_map *netlbl_domhsh_def = NULL;
61 58
62/* 59/*
@@ -109,17 +106,14 @@ static u32 netlbl_domhsh_hash(const char *key)
109/** 106/**
110 * netlbl_domhsh_search - Search for a domain entry 107 * netlbl_domhsh_search - Search for a domain entry
111 * @domain: the domain 108 * @domain: the domain
112 * @def: return default if no match is found
113 * 109 *
114 * Description: 110 * Description:
115 * Searches the domain hash table and returns a pointer to the hash table 111 * Searches the domain hash table and returns a pointer to the hash table
116 * entry if found, otherwise NULL is returned. If @def is non-zero and a 112 * entry if found, otherwise NULL is returned. The caller is responsible for
117 * match is not found in the domain hash table the default mapping is returned 113 * the rcu hash table locks (i.e. the caller must call rcu_read_[un]lock()).
118 * if it exists. The caller is responsibile for the rcu hash table locks
119 * (i.e. the caller much call rcu_read_[un]lock()).
120 * 114 *
121 */ 115 */
122static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain, u32 def) 116static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain)
123{ 117{
124 u32 bkt; 118 u32 bkt;
125 struct netlbl_dom_map *iter; 119 struct netlbl_dom_map *iter;
@@ -133,10 +127,31 @@ static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain, u32 def)
133 return iter; 127 return iter;
134 } 128 }
135 129
136 if (def != 0) { 130 return NULL;
137 iter = rcu_dereference(netlbl_domhsh_def); 131}
138 if (iter != NULL && iter->valid) 132
139 return iter; 133/**
134 * netlbl_domhsh_search_def - Search for a domain entry
135 * @domain: the domain
136 * @def: return default if no match is found
137 *
138 * Description:
139 * Searches the domain hash table and returns a pointer to the hash table
140 * entry if an exact match is found, if an exact match is not present in the
141 * hash table then the default entry is returned if valid otherwise NULL is
142 * returned. The caller is responsible for the rcu hash table locks
143 * (i.e. the caller must call rcu_read_[un]lock()).
144 *
145 */
146static struct netlbl_dom_map *netlbl_domhsh_search_def(const char *domain)
147{
148 struct netlbl_dom_map *entry;
149
150 entry = netlbl_domhsh_search(domain);
151 if (entry == NULL) {
152 entry = rcu_dereference(netlbl_domhsh_def);
153 if (entry != NULL && entry->valid)
154 return entry;
140 } 155 }
141 156
142 return NULL; 157 return NULL;
@@ -221,24 +236,22 @@ int netlbl_domhsh_add(struct netlbl_dom_map *entry,
221 INIT_RCU_HEAD(&entry->rcu); 236 INIT_RCU_HEAD(&entry->rcu);
222 237
223 rcu_read_lock(); 238 rcu_read_lock();
239 spin_lock(&netlbl_domhsh_lock);
224 if (entry->domain != NULL) { 240 if (entry->domain != NULL) {
225 bkt = netlbl_domhsh_hash(entry->domain); 241 bkt = netlbl_domhsh_hash(entry->domain);
226 spin_lock(&netlbl_domhsh_lock); 242 if (netlbl_domhsh_search(entry->domain) == NULL)
227 if (netlbl_domhsh_search(entry->domain, 0) == NULL)
228 list_add_tail_rcu(&entry->list, 243 list_add_tail_rcu(&entry->list,
229 &rcu_dereference(netlbl_domhsh)->tbl[bkt]); 244 &rcu_dereference(netlbl_domhsh)->tbl[bkt]);
230 else 245 else
231 ret_val = -EEXIST; 246 ret_val = -EEXIST;
232 spin_unlock(&netlbl_domhsh_lock);
233 } else { 247 } else {
234 INIT_LIST_HEAD(&entry->list); 248 INIT_LIST_HEAD(&entry->list);
235 spin_lock(&netlbl_domhsh_def_lock);
236 if (rcu_dereference(netlbl_domhsh_def) == NULL) 249 if (rcu_dereference(netlbl_domhsh_def) == NULL)
237 rcu_assign_pointer(netlbl_domhsh_def, entry); 250 rcu_assign_pointer(netlbl_domhsh_def, entry);
238 else 251 else
239 ret_val = -EEXIST; 252 ret_val = -EEXIST;
240 spin_unlock(&netlbl_domhsh_def_lock);
241 } 253 }
254 spin_unlock(&netlbl_domhsh_lock);
242 audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_ADD, audit_info); 255 audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_ADD, audit_info);
243 if (audit_buf != NULL) { 256 if (audit_buf != NULL) {
244 audit_log_format(audit_buf, 257 audit_log_format(audit_buf,
@@ -307,7 +320,10 @@ int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info)
307 struct audit_buffer *audit_buf; 320 struct audit_buffer *audit_buf;
308 321
309 rcu_read_lock(); 322 rcu_read_lock();
310 entry = netlbl_domhsh_search(domain, (domain != NULL ? 0 : 1)); 323 if (domain)
324 entry = netlbl_domhsh_search(domain);
325 else
326 entry = netlbl_domhsh_search_def(domain);
311 if (entry == NULL) 327 if (entry == NULL)
312 goto remove_return; 328 goto remove_return;
313 switch (entry->type) { 329 switch (entry->type) {
@@ -316,23 +332,16 @@ int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info)
316 entry->domain); 332 entry->domain);
317 break; 333 break;
318 } 334 }
319 if (entry != rcu_dereference(netlbl_domhsh_def)) { 335 spin_lock(&netlbl_domhsh_lock);
320 spin_lock(&netlbl_domhsh_lock); 336 if (entry->valid) {
321 if (entry->valid) { 337 entry->valid = 0;
322 entry->valid = 0; 338 if (entry != rcu_dereference(netlbl_domhsh_def))
323 list_del_rcu(&entry->list); 339 list_del_rcu(&entry->list);
324 ret_val = 0; 340 else
325 }
326 spin_unlock(&netlbl_domhsh_lock);
327 } else {
328 spin_lock(&netlbl_domhsh_def_lock);
329 if (entry->valid) {
330 entry->valid = 0;
331 rcu_assign_pointer(netlbl_domhsh_def, NULL); 341 rcu_assign_pointer(netlbl_domhsh_def, NULL);
332 ret_val = 0; 342 ret_val = 0;
333 }
334 spin_unlock(&netlbl_domhsh_def_lock);
335 } 343 }
344 spin_unlock(&netlbl_domhsh_lock);
336 345
337 audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_DEL, audit_info); 346 audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_DEL, audit_info);
338 if (audit_buf != NULL) { 347 if (audit_buf != NULL) {
@@ -377,7 +386,7 @@ int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info)
377 */ 386 */
378struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain) 387struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain)
379{ 388{
380 return netlbl_domhsh_search(domain, 1); 389 return netlbl_domhsh_search_def(domain);
381} 390}
382 391
383/** 392/**
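
The domain hash rework above splits the old search(domain, def) helper into an exact-match netlbl_domhsh_search() and a netlbl_domhsh_search_def() wrapper that falls back to the default mapping. A toy, non-RCU sketch of the same split:

/* Exact lookup plus a wrapper that falls back to a single default entry.
 * The table is a plain array here rather than an RCU-protected hash. */
#include <stdio.h>
#include <string.h>

struct dom_map { const char *domain; int valid; };

static struct dom_map table[] = { { "mls_svc", 1 } };
static struct dom_map def_entry = { NULL, 1 };

static struct dom_map *search(const char *domain)
{
        size_t i;

        for (i = 0; i < sizeof(table) / sizeof(table[0]); i++)
                if (table[i].valid && strcmp(table[i].domain, domain) == 0)
                        return &table[i];
        return NULL;
}

static struct dom_map *search_def(const char *domain)
{
        struct dom_map *e = domain ? search(domain) : NULL;

        if (e == NULL && def_entry.valid)
                return &def_entry;
        return e;
}

int main(void)
{
        printf("exact lookup of \"unknown\":    %s\n",
               search("unknown") ? "hit" : "miss");
        printf("lookup with default fallback: %s\n",
               search_def("unknown") ? "hit" : "miss");
        return 0;
}
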
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index 4f50949722a9..c69e3e1f05c3 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -34,6 +34,7 @@
34#include <net/netlabel.h> 34#include <net/netlabel.h>
35#include <net/cipso_ipv4.h> 35#include <net/cipso_ipv4.h>
36#include <asm/bug.h> 36#include <asm/bug.h>
37#include <asm/atomic.h>
37 38
38#include "netlabel_domainhash.h" 39#include "netlabel_domainhash.h"
39#include "netlabel_unlabeled.h" 40#include "netlabel_unlabeled.h"
@@ -262,7 +263,7 @@ int netlbl_enabled(void)
262 /* At some point we probably want to expose this mechanism to the user 263 /* At some point we probably want to expose this mechanism to the user
263 * as well so that admins can toggle NetLabel regardless of the 264 * as well so that admins can toggle NetLabel regardless of the
264 * configuration */ 265 * configuration */
265 return (netlbl_mgmt_protocount_value() > 0 ? 1 : 0); 266 return (atomic_read(&netlabel_mgmt_protocount) > 0);
266} 267}
267 268
268/** 269/**
@@ -311,7 +312,7 @@ socket_setattr_return:
311 * @secattr: the security attributes 312 * @secattr: the security attributes
312 * 313 *
313 * Description: 314 * Description:
314 * Examines the given sock to see any NetLabel style labeling has been 315 * Examines the given sock to see if any NetLabel style labeling has been
315 * applied to the sock, if so it parses the socket label and returns the 316 * applied to the sock, if so it parses the socket label and returns the
316 * security attributes in @secattr. Returns zero on success, negative values 317 * security attributes in @secattr. Returns zero on success, negative values
317 * on failure. 318 * on failure.
@@ -319,18 +320,13 @@ socket_setattr_return:
319 */ 320 */
320int netlbl_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) 321int netlbl_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr)
321{ 322{
322 int ret_val; 323 return cipso_v4_sock_getattr(sk, secattr);
323
324 ret_val = cipso_v4_sock_getattr(sk, secattr);
325 if (ret_val == 0)
326 return 0;
327
328 return netlbl_unlabel_getattr(secattr);
329} 324}
330 325
331/** 326/**
332 * netlbl_skbuff_getattr - Determine the security attributes of a packet 327 * netlbl_skbuff_getattr - Determine the security attributes of a packet
333 * @skb: the packet 328 * @skb: the packet
329 * @family: protocol family
334 * @secattr: the security attributes 330 * @secattr: the security attributes
335 * 331 *
336 * Description: 332 * Description:
@@ -341,13 +337,14 @@ int netlbl_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr)
341 * 337 *
342 */ 338 */
343int netlbl_skbuff_getattr(const struct sk_buff *skb, 339int netlbl_skbuff_getattr(const struct sk_buff *skb,
340 u16 family,
344 struct netlbl_lsm_secattr *secattr) 341 struct netlbl_lsm_secattr *secattr)
345{ 342{
346 if (CIPSO_V4_OPTEXIST(skb) && 343 if (CIPSO_V4_OPTEXIST(skb) &&
347 cipso_v4_skbuff_getattr(skb, secattr) == 0) 344 cipso_v4_skbuff_getattr(skb, secattr) == 0)
348 return 0; 345 return 0;
349 346
350 return netlbl_unlabel_getattr(secattr); 347 return netlbl_unlabel_getattr(skb, family, secattr);
351} 348}
352 349
353/** 350/**
@@ -431,6 +428,10 @@ static int __init netlbl_init(void)
431 if (ret_val != 0) 428 if (ret_val != 0)
432 goto init_failure; 429 goto init_failure;
433 430
431 ret_val = netlbl_unlabel_init(NETLBL_UNLHSH_BITSIZE);
432 if (ret_val != 0)
433 goto init_failure;
434
434 ret_val = netlbl_netlink_init(); 435 ret_val = netlbl_netlink_init();
435 if (ret_val != 0) 436 if (ret_val != 0)
436 goto init_failure; 437 goto init_failure;
diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c
index 9c41464d58d1..e2258dc3c845 100644
--- a/net/netlabel/netlabel_mgmt.c
+++ b/net/netlabel/netlabel_mgmt.c
@@ -37,14 +37,14 @@
37#include <net/genetlink.h> 37#include <net/genetlink.h>
38#include <net/netlabel.h> 38#include <net/netlabel.h>
39#include <net/cipso_ipv4.h> 39#include <net/cipso_ipv4.h>
40#include <asm/atomic.h>
40 41
41#include "netlabel_domainhash.h" 42#include "netlabel_domainhash.h"
42#include "netlabel_user.h" 43#include "netlabel_user.h"
43#include "netlabel_mgmt.h" 44#include "netlabel_mgmt.h"
44 45
45/* NetLabel configured protocol count */ 46/* NetLabel configured protocol counter */
46static DEFINE_SPINLOCK(netlabel_mgmt_protocount_lock); 47atomic_t netlabel_mgmt_protocount = ATOMIC_INIT(0);
47static u32 netlabel_mgmt_protocount = 0;
48 48
49/* Argument struct for netlbl_domhsh_walk() */ 49/* Argument struct for netlbl_domhsh_walk() */
50struct netlbl_domhsh_walk_arg { 50struct netlbl_domhsh_walk_arg {
@@ -71,63 +71,6 @@ static const struct nla_policy netlbl_mgmt_genl_policy[NLBL_MGMT_A_MAX + 1] = {
71}; 71};
72 72
73/* 73/*
74 * NetLabel Misc Management Functions
75 */
76
77/**
78 * netlbl_mgmt_protocount_inc - Increment the configured labeled protocol count
79 *
80 * Description:
81 * Increment the number of labeled protocol configurations in the current
82 * NetLabel configuration. Keep track of this for use in determining if
83 * NetLabel label enforcement should be active/enabled or not in the LSM.
84 *
85 */
86void netlbl_mgmt_protocount_inc(void)
87{
88 spin_lock(&netlabel_mgmt_protocount_lock);
89 netlabel_mgmt_protocount++;
90 spin_unlock(&netlabel_mgmt_protocount_lock);
91}
92
93/**
94 * netlbl_mgmt_protocount_dec - Decrement the configured labeled protocol count
95 *
96 * Description:
97 * Decrement the number of labeled protocol configurations in the current
98 * NetLabel configuration. Keep track of this for use in determining if
99 * NetLabel label enforcement should be active/enabled or not in the LSM.
100 *
101 */
102void netlbl_mgmt_protocount_dec(void)
103{
104 spin_lock(&netlabel_mgmt_protocount_lock);
105 if (netlabel_mgmt_protocount > 0)
106 netlabel_mgmt_protocount--;
107 spin_unlock(&netlabel_mgmt_protocount_lock);
108}
109
110/**
111 * netlbl_mgmt_protocount_value - Return the number of configured protocols
112 *
113 * Description:
114 * Return the number of labeled protocols in the current NetLabel
115 * configuration. This value is useful in determining if NetLabel label
116 * enforcement should be active/enabled or not in the LSM.
117 *
118 */
119u32 netlbl_mgmt_protocount_value(void)
120{
121 u32 val;
122
123 rcu_read_lock();
124 val = netlabel_mgmt_protocount;
125 rcu_read_unlock();
126
127 return val;
128}
129
130/*
131 * NetLabel Command Handlers 74 * NetLabel Command Handlers
132 */ 75 */
133 76
diff --git a/net/netlabel/netlabel_mgmt.h b/net/netlabel/netlabel_mgmt.h
index ccb2b3923591..a43bff169d6b 100644
--- a/net/netlabel/netlabel_mgmt.h
+++ b/net/netlabel/netlabel_mgmt.h
@@ -32,6 +32,7 @@
32#define _NETLABEL_MGMT_H 32#define _NETLABEL_MGMT_H
33 33
34#include <net/netlabel.h> 34#include <net/netlabel.h>
35#include <asm/atomic.h>
35 36
36/* 37/*
37 * The following NetLabel payloads are supported by the management interface. 38 * The following NetLabel payloads are supported by the management interface.
@@ -168,9 +169,7 @@ enum {
168/* NetLabel protocol functions */ 169/* NetLabel protocol functions */
169int netlbl_mgmt_genl_init(void); 170int netlbl_mgmt_genl_init(void);
170 171
171/* NetLabel misc management functions */ 172/* NetLabel configured protocol reference counter */
172void netlbl_mgmt_protocount_inc(void); 173extern atomic_t netlabel_mgmt_protocount;
173void netlbl_mgmt_protocount_dec(void);
174u32 netlbl_mgmt_protocount_value(void);
175 174
176#endif 175#endif
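
The management interface above drops the spinlock-protected u32 protocol count in favour of an atomic counter exported as netlabel_mgmt_protocount, which netlbl_enabled() can read without locking. A userspace sketch of the same conversion, with C11 atomics standing in for the kernel's atomic_t helpers:

/* The lock + counter pair becomes one atomic counter: writers inc/dec,
 * readers just load and compare against zero. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int protocount;                   /* was: spinlock + u32 */

static void proto_added(void)     { atomic_fetch_add(&protocount, 1); }
static void proto_removed(void)   { atomic_fetch_sub(&protocount, 1); }
static int  labeling_enabled(void) { return atomic_load(&protocount) > 0; }

int main(void)
{
        proto_added();
        printf("enabled=%d\n", labeling_enabled());
        proto_removed();
        printf("enabled=%d\n", labeling_enabled());
        return 0;
}
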
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index 348292450deb..42e81fd8cc49 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -10,7 +10,7 @@
10 */ 10 */
11 11
12/* 12/*
13 * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 13 * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 - 2007
14 * 14 *
15 * This program is free software; you can redistribute it and/or modify 15 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by 16 * it under the terms of the GNU General Public License as published by
@@ -36,22 +36,92 @@
36#include <linux/string.h> 36#include <linux/string.h>
37#include <linux/skbuff.h> 37#include <linux/skbuff.h>
38#include <linux/audit.h> 38#include <linux/audit.h>
39#include <linux/in.h>
40#include <linux/in6.h>
41#include <linux/ip.h>
42#include <linux/ipv6.h>
43#include <linux/notifier.h>
44#include <linux/netdevice.h>
45#include <linux/security.h>
39#include <net/sock.h> 46#include <net/sock.h>
40#include <net/netlink.h> 47#include <net/netlink.h>
41#include <net/genetlink.h> 48#include <net/genetlink.h>
42 49#include <net/ip.h>
50#include <net/ipv6.h>
51#include <net/net_namespace.h>
43#include <net/netlabel.h> 52#include <net/netlabel.h>
44#include <asm/bug.h> 53#include <asm/bug.h>
54#include <asm/atomic.h>
45 55
46#include "netlabel_user.h" 56#include "netlabel_user.h"
47#include "netlabel_domainhash.h" 57#include "netlabel_domainhash.h"
48#include "netlabel_unlabeled.h" 58#include "netlabel_unlabeled.h"
59#include "netlabel_mgmt.h"
60
61/* NOTE: at present we always use init's network namespace since we don't
62 * presently support different namespaces even though the majority of
63 * the functions in this file are "namespace safe" */
64
65/* The unlabeled connection hash table which we use to map network interfaces
66 * and addresses of unlabeled packets to a user specified secid value for the
67 * LSM. The hash table is used to lookup the network interface entry
68 * (struct netlbl_unlhsh_iface) and then the interface entry is used to
69 * lookup an IP address match from an ordered list. If a network interface
70 * match can not be found in the hash table then the default entry
71 * (netlbl_unlhsh_def) is used. The IP address entry list
72 * (struct netlbl_unlhsh_addr) is ordered such that the entries with a
73 * larger netmask come first.
74 */
75struct netlbl_unlhsh_tbl {
76 struct list_head *tbl;
77 u32 size;
78};
79struct netlbl_unlhsh_addr4 {
80 __be32 addr;
81 __be32 mask;
82 u32 secid;
83
84 u32 valid;
85 struct list_head list;
86 struct rcu_head rcu;
87};
88struct netlbl_unlhsh_addr6 {
89 struct in6_addr addr;
90 struct in6_addr mask;
91 u32 secid;
92
93 u32 valid;
94 struct list_head list;
95 struct rcu_head rcu;
96};
97struct netlbl_unlhsh_iface {
98 int ifindex;
99 struct list_head addr4_list;
100 struct list_head addr6_list;
101
102 u32 valid;
103 struct list_head list;
104 struct rcu_head rcu;
105};
106
107/* Argument struct for netlbl_unlhsh_walk() */
108struct netlbl_unlhsh_walk_arg {
109 struct netlink_callback *nl_cb;
110 struct sk_buff *skb;
111 u32 seq;
112};
113
114/* Unlabeled connection hash table */
115/* updates should be so rare that having one spinlock for the entire
116 * hash table should be okay */
117static DEFINE_SPINLOCK(netlbl_unlhsh_lock);
118static struct netlbl_unlhsh_tbl *netlbl_unlhsh = NULL;
119static struct netlbl_unlhsh_iface *netlbl_unlhsh_def = NULL;
49 120
50/* Accept unlabeled packets flag */ 121/* Accept unlabeled packets flag */
51static DEFINE_SPINLOCK(netlabel_unlabel_acceptflg_lock);
52static u8 netlabel_unlabel_acceptflg = 0; 122static u8 netlabel_unlabel_acceptflg = 0;
53 123
54/* NetLabel Generic NETLINK CIPSOv4 family */ 124/* NetLabel Generic NETLINK unlabeled family */
55static struct genl_family netlbl_unlabel_gnl_family = { 125static struct genl_family netlbl_unlabel_gnl_family = {
56 .id = GENL_ID_GENERATE, 126 .id = GENL_ID_GENERATE,
57 .hdrsize = 0, 127 .hdrsize = 0,
@@ -63,11 +133,841 @@ static struct genl_family netlbl_unlabel_gnl_family = {
63/* NetLabel Netlink attribute policy */ 133/* NetLabel Netlink attribute policy */
64static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = { 134static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = {
65 [NLBL_UNLABEL_A_ACPTFLG] = { .type = NLA_U8 }, 135 [NLBL_UNLABEL_A_ACPTFLG] = { .type = NLA_U8 },
136 [NLBL_UNLABEL_A_IPV6ADDR] = { .type = NLA_BINARY,
137 .len = sizeof(struct in6_addr) },
138 [NLBL_UNLABEL_A_IPV6MASK] = { .type = NLA_BINARY,
139 .len = sizeof(struct in6_addr) },
140 [NLBL_UNLABEL_A_IPV4ADDR] = { .type = NLA_BINARY,
141 .len = sizeof(struct in_addr) },
142 [NLBL_UNLABEL_A_IPV4MASK] = { .type = NLA_BINARY,
143 .len = sizeof(struct in_addr) },
144 [NLBL_UNLABEL_A_IFACE] = { .type = NLA_NUL_STRING,
145 .len = IFNAMSIZ - 1 },
146 [NLBL_UNLABEL_A_SECCTX] = { .type = NLA_BINARY }
66}; 147};
67 148
68/* 149/*
69 * Helper Functions 150 * Audit Helper Functions
151 */
152
153/**
154 * netlbl_unlabel_audit_addr4 - Audit an IPv4 address
155 * @audit_buf: audit buffer
156 * @dev: network interface
157 * @addr: IP address
158 * @mask: IP address mask
159 *
160 * Description:
161 * Write the IPv4 address and address mask, if necessary, to @audit_buf.
162 *
163 */
164static void netlbl_unlabel_audit_addr4(struct audit_buffer *audit_buf,
165 const char *dev,
166 __be32 addr, __be32 mask)
167{
168 u32 mask_val = ntohl(mask);
169
170 if (dev != NULL)
171 audit_log_format(audit_buf, " netif=%s", dev);
172 audit_log_format(audit_buf, " src=" NIPQUAD_FMT, NIPQUAD(addr));
173 if (mask_val != 0xffffffff) {
174 u32 mask_len = 0;
175 while (mask_val > 0) {
176 mask_val <<= 1;
177 mask_len++;
178 }
179 audit_log_format(audit_buf, " src_prefixlen=%d", mask_len);
180 }
181}
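
Worked example of the mask-to-prefix-length loop used by netlbl_unlabel_audit_addr4() above, shown for a 255.255.255.0 mask (like the kernel loop, it assumes a contiguous netmask):

/* Shift the host-order mask left until it is zero, counting iterations. */
#include <stdint.h>
#include <stdio.h>

static unsigned int mask_to_prefixlen(uint32_t mask)
{
        unsigned int len = 0;

        while (mask > 0) {
                mask <<= 1;
                len++;
        }
        return len;
}

int main(void)
{
        printf("%u\n", mask_to_prefixlen(0xffffff00));  /* 255.255.255.0 -> 24 */
        return 0;
}
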
182
183/**
184 * netlbl_unlabel_audit_addr6 - Audit an IPv6 address
185 * @audit_buf: audit buffer
186 * @dev: network interface
187 * @addr: IP address
188 * @mask: IP address mask
189 *
190 * Description:
191 * Write the IPv6 address and address mask, if necessary, to @audit_buf.
192 *
193 */
194static void netlbl_unlabel_audit_addr6(struct audit_buffer *audit_buf,
195 const char *dev,
196 const struct in6_addr *addr,
197 const struct in6_addr *mask)
198{
199 if (dev != NULL)
200 audit_log_format(audit_buf, " netif=%s", dev);
201 audit_log_format(audit_buf, " src=" NIP6_FMT, NIP6(*addr));
202 if (ntohl(mask->s6_addr32[3]) != 0xffffffff) {
203 u32 mask_len = 0;
204 u32 mask_val;
205 int iter = -1;
206 while (ntohl(mask->s6_addr32[++iter]) == 0xffffffff)
207 mask_len += 32;
208 mask_val = ntohl(mask->s6_addr32[iter]);
209 while (mask_val > 0) {
210 mask_val <<= 1;
211 mask_len++;
212 }
213 audit_log_format(audit_buf, " src_prefixlen=%d", mask_len);
214 }
215}
216
217/*
218 * Unlabeled Connection Hash Table Functions
219 */
220
221/**
222 * netlbl_unlhsh_free_addr4 - Frees an IPv4 address entry from the hash table
223 * @entry: the entry's RCU field
224 *
225 * Description:
226 * This function is designed to be used as a callback to the call_rcu()
227 * function so that memory allocated to a hash table address entry can be
228 * released safely.
229 *
230 */
231static void netlbl_unlhsh_free_addr4(struct rcu_head *entry)
232{
233 struct netlbl_unlhsh_addr4 *ptr;
234
235 ptr = container_of(entry, struct netlbl_unlhsh_addr4, rcu);
236 kfree(ptr);
237}
238
239#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
240/**
241 * netlbl_unlhsh_free_addr6 - Frees an IPv6 address entry from the hash table
242 * @entry: the entry's RCU field
243 *
244 * Description:
245 * This function is designed to be used as a callback to the call_rcu()
246 * function so that memory allocated to a hash table address entry can be
247 * released safely.
248 *
249 */
250static void netlbl_unlhsh_free_addr6(struct rcu_head *entry)
251{
252 struct netlbl_unlhsh_addr6 *ptr;
253
254 ptr = container_of(entry, struct netlbl_unlhsh_addr6, rcu);
255 kfree(ptr);
256}
257#endif /* IPv6 */
258
259/**
260 * netlbl_unlhsh_free_iface - Frees an interface entry from the hash table
261 * @entry: the entry's RCU field
262 *
263 * Description:
264 * This function is designed to be used as a callback to the call_rcu()
265 * function so that memory allocated to a hash table interface entry can be
266 * released safely. It is important to note that this function does not free
267 * the IPv4 and IPv6 address lists contained as part of an interface entry. It
268 * is up to the rest of the code to make sure an interface entry is only freed
269 * once it's address lists are empty.
270 *
271 */
272static void netlbl_unlhsh_free_iface(struct rcu_head *entry)
273{
274 struct netlbl_unlhsh_iface *iface;
275 struct netlbl_unlhsh_addr4 *iter4;
276 struct netlbl_unlhsh_addr4 *tmp4;
277 struct netlbl_unlhsh_addr6 *iter6;
278 struct netlbl_unlhsh_addr6 *tmp6;
279
280 iface = container_of(entry, struct netlbl_unlhsh_iface, rcu);
281
282 /* no need for locks here since we are the only one with access to this
283 * structure */
284
285 list_for_each_entry_safe(iter4, tmp4, &iface->addr4_list, list)
286 if (iter4->valid) {
287 list_del_rcu(&iter4->list);
288 kfree(iter4);
289 }
290 list_for_each_entry_safe(iter6, tmp6, &iface->addr6_list, list)
291 if (iter6->valid) {
292 list_del_rcu(&iter6->list);
293 kfree(iter6);
294 }
295 kfree(iface);
296}
297
298/**
299 * netlbl_unlhsh_hash - Hashing function for the hash table
300 * @ifindex: the network interface/device to hash
301 *
302 * Description:
303 * This is the hashing function for the unlabeled hash table, it returns the
304 * bucket number for the given device/interface. The caller is responsible for
305 * calling the rcu_read_[un]lock() functions.
306 *
70 */ 307 */
308static u32 netlbl_unlhsh_hash(int ifindex)
309{
310 /* this is taken _almost_ directly from
311 * security/selinux/netif.c:sel_netif_hasfn() as they do pretty much
312 * the same thing */
313 return ifindex & (rcu_dereference(netlbl_unlhsh)->size - 1);
314}
315
316/**
317 * netlbl_unlhsh_search_addr4 - Search for a matching IPv4 address entry
318 * @addr: IPv4 address
319 * @iface: the network interface entry
320 *
321 * Description:
322 * Searches the IPv4 address list of the network interface specified by @iface.
323 * If a matching address entry is found it is returned, otherwise NULL is
324 * returned. The caller is responsible for calling the rcu_read_[un]lock()
325 * functions.
326 *
327 */
328static struct netlbl_unlhsh_addr4 *netlbl_unlhsh_search_addr4(
329 __be32 addr,
330 const struct netlbl_unlhsh_iface *iface)
331{
332 struct netlbl_unlhsh_addr4 *iter;
333
334 list_for_each_entry_rcu(iter, &iface->addr4_list, list)
335 if (iter->valid && (addr & iter->mask) == iter->addr)
336 return iter;
337
338 return NULL;
339}
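
The matching rule in netlbl_unlhsh_search_addr4() above is a masked compare: a packet address matches an entry when (addr & entry->mask) equals the entry's pre-masked address. A small standalone example using host-order addresses (the kernel keeps them in network byte order):

/* 192.168.1.0/24 entry: 192.168.1.55 matches, 192.168.2.55 does not. */
#include <stdint.h>
#include <stdio.h>

struct addr4_entry { uint32_t addr; uint32_t mask; };

static int entry_matches(const struct addr4_entry *e, uint32_t addr)
{
        return (addr & e->mask) == e->addr;
}

int main(void)
{
        struct addr4_entry e = { 0xc0a80100, 0xffffff00 };

        printf("%d\n", entry_matches(&e, 0xc0a80137));
        printf("%d\n", entry_matches(&e, 0xc0a80237));
        return 0;
}
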
340
341#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
342/**
343 * netlbl_unlhsh_search_addr6 - Search for a matching IPv6 address entry
344 * @addr: IPv6 address
345 * @iface: the network interface entry
346 *
347 * Description:
348 * Searches the IPv6 address list of the network interface specified by @iface.
349 * If a matching address entry is found it is returned, otherwise NULL is
350 * returned. The caller is responsible for calling the rcu_read_[un]lock()
351 * functions.
352 *
353 */
354static struct netlbl_unlhsh_addr6 *netlbl_unlhsh_search_addr6(
355 const struct in6_addr *addr,
356 const struct netlbl_unlhsh_iface *iface)
357{
358 struct netlbl_unlhsh_addr6 *iter;
359
360 list_for_each_entry_rcu(iter, &iface->addr6_list, list)
361 if (iter->valid &&
362 ipv6_masked_addr_cmp(&iter->addr, &iter->mask, addr) == 0)
363 return iter;
364
365 return NULL;
366}
367#endif /* IPv6 */
368
369/**
370 * netlbl_unlhsh_search_iface - Search for a matching interface entry
371 * @ifindex: the network interface
372 *
373 * Description:
374 * Searches the unlabeled connection hash table and returns a pointer to the
375 * interface entry which matches @ifindex, otherwise NULL is returned. The
376 * caller is responsible for calling the rcu_read_[un]lock() functions.
377 *
378 */
379static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface(int ifindex)
380{
381 u32 bkt;
382 struct netlbl_unlhsh_iface *iter;
383
384 bkt = netlbl_unlhsh_hash(ifindex);
385 list_for_each_entry_rcu(iter,
386 &rcu_dereference(netlbl_unlhsh)->tbl[bkt],
387 list)
388 if (iter->valid && iter->ifindex == ifindex)
389 return iter;
390
391 return NULL;
392}
393
394/**
395 * netlbl_unlhsh_search_iface_def - Search for a matching interface entry
396 * @ifindex: the network interface
397 *
398 * Description:
399 * Searches the unlabeled connection hash table and returns a pointer to the
400 * interface entry which matches @ifindex. If an exact match can not be found
401 * and there is a valid default entry, the default entry is returned, otherwise
402 * NULL is returned. The caller is responsible for calling the
403 * rcu_read_[un]lock() functions.
404 *
405 */
406static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface_def(int ifindex)
407{
408 struct netlbl_unlhsh_iface *entry;
409
410 entry = netlbl_unlhsh_search_iface(ifindex);
411 if (entry != NULL)
412 return entry;
413
414 entry = rcu_dereference(netlbl_unlhsh_def);
415 if (entry != NULL && entry->valid)
416 return entry;
417
418 return NULL;
419}
420
421/**
422 * netlbl_unlhsh_add_addr4 - Add a new IPv4 address entry to the hash table
423 * @iface: the associated interface entry
424 * @addr: IPv4 address in network byte order
425 * @mask: IPv4 address mask in network byte order
426 * @secid: LSM secid value for entry
427 *
428 * Description:
429 * Add a new address entry into the unlabeled connection hash table using the
430 * interface entry specified by @iface. On success zero is returned, otherwise
431 * a negative value is returned. The caller is responsible for calling the
432 * rcu_read_[un]lock() functions.
433 *
434 */
435static int netlbl_unlhsh_add_addr4(struct netlbl_unlhsh_iface *iface,
436 const struct in_addr *addr,
437 const struct in_addr *mask,
438 u32 secid)
439{
440 struct netlbl_unlhsh_addr4 *entry;
441 struct netlbl_unlhsh_addr4 *iter;
442
443 entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
444 if (entry == NULL)
445 return -ENOMEM;
446
447 entry->addr = addr->s_addr & mask->s_addr;
448 entry->mask = mask->s_addr;
449 entry->secid = secid;
450 entry->valid = 1;
451 INIT_RCU_HEAD(&entry->rcu);
452
453 spin_lock(&netlbl_unlhsh_lock);
454 iter = netlbl_unlhsh_search_addr4(entry->addr, iface);
455 if (iter != NULL &&
456 iter->addr == addr->s_addr && iter->mask == mask->s_addr) {
457 spin_unlock(&netlbl_unlhsh_lock);
458 kfree(entry);
459 return -EEXIST;
460 }
461 /* in order to speed up address searches through the list (the common
462 * case) we need to keep the list in order based on the size of the
463 * address mask such that the entry with the widest mask (smallest
464 * numerical value) appears first in the list */
465 list_for_each_entry_rcu(iter, &iface->addr4_list, list)
466 if (iter->valid &&
467 ntohl(entry->mask) > ntohl(iter->mask)) {
468 __list_add_rcu(&entry->list,
469 iter->list.prev,
470 &iter->list);
471 spin_unlock(&netlbl_unlhsh_lock);
472 return 0;
473 }
474 list_add_tail_rcu(&entry->list, &iface->addr4_list);
475 spin_unlock(&netlbl_unlhsh_lock);
476 return 0;
477}
478
479#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
480/**
481 * netlbl_unlhsh_add_addr6 - Add a new IPv6 address entry to the hash table
482 * @iface: the associated interface entry
483 * @addr: IPv6 address in network byte order
484 * @mask: IPv6 address mask in network byte order
485 * @secid: LSM secid value for entry
486 *
487 * Description:
488 * Add a new address entry into the unlabeled connection hash table using the
489 * interface entry specified by @iface. On success zero is returned, otherwise
490 * a negative value is returned. The caller is responsible for calling the
491 * rcu_read_[un]lock() functions.
492 *
493 */
494static int netlbl_unlhsh_add_addr6(struct netlbl_unlhsh_iface *iface,
495 const struct in6_addr *addr,
496 const struct in6_addr *mask,
497 u32 secid)
498{
499 struct netlbl_unlhsh_addr6 *entry;
500 struct netlbl_unlhsh_addr6 *iter;
501
502 entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
503 if (entry == NULL)
504 return -ENOMEM;
505
506 ipv6_addr_copy(&entry->addr, addr);
507 entry->addr.s6_addr32[0] &= mask->s6_addr32[0];
508 entry->addr.s6_addr32[1] &= mask->s6_addr32[1];
509 entry->addr.s6_addr32[2] &= mask->s6_addr32[2];
510 entry->addr.s6_addr32[3] &= mask->s6_addr32[3];
511 ipv6_addr_copy(&entry->mask, mask);
512 entry->secid = secid;
513 entry->valid = 1;
514 INIT_RCU_HEAD(&entry->rcu);
515
516 spin_lock(&netlbl_unlhsh_lock);
517 iter = netlbl_unlhsh_search_addr6(&entry->addr, iface);
518 if (iter != NULL &&
519 (ipv6_addr_equal(&iter->addr, addr) &&
520 ipv6_addr_equal(&iter->mask, mask))) {
521 spin_unlock(&netlbl_unlhsh_lock);
522 kfree(entry);
523 return -EEXIST;
524 }
525 /* in order to speed up address searches through the list (the common
526 * case) we need to keep the list in order based on the size of the
527 * address mask such that the entry with the widest mask (smallest
528 * numerical value) appears first in the list */
529 list_for_each_entry_rcu(iter, &iface->addr6_list, list)
530 if (iter->valid &&
531 ipv6_addr_cmp(&entry->mask, &iter->mask) > 0) {
532 __list_add_rcu(&entry->list,
533 iter->list.prev,
534 &iter->list);
535 spin_unlock(&netlbl_unlhsh_lock);
536 return 0;
537 }
538 list_add_tail_rcu(&entry->list, &iface->addr6_list);
539 spin_unlock(&netlbl_unlhsh_lock);
540 return 0;
541}
542#endif /* IPv6 */
543
544/**
545 * netlbl_unlhsh_add_iface - Adds a new interface entry to the hash table
546 * @ifindex: network interface
547 *
548 * Description:
549 * Add a new, empty, interface entry into the unlabeled connection hash table.
550 * On success a pointer to the new interface entry is returned, on failure NULL
551 * is returned. The caller is responsible for calling the rcu_read_[un]lock()
552 * functions.
553 *
554 */
555static struct netlbl_unlhsh_iface *netlbl_unlhsh_add_iface(int ifindex)
556{
557 u32 bkt;
558 struct netlbl_unlhsh_iface *iface;
559
560 iface = kzalloc(sizeof(*iface), GFP_ATOMIC);
561 if (iface == NULL)
562 return NULL;
563
564 iface->ifindex = ifindex;
565 INIT_LIST_HEAD(&iface->addr4_list);
566 INIT_LIST_HEAD(&iface->addr6_list);
567 iface->valid = 1;
568 INIT_RCU_HEAD(&iface->rcu);
569
570 spin_lock(&netlbl_unlhsh_lock);
571 if (ifindex > 0) {
572 bkt = netlbl_unlhsh_hash(ifindex);
573 if (netlbl_unlhsh_search_iface(ifindex) != NULL)
574 goto add_iface_failure;
575 list_add_tail_rcu(&iface->list,
576 &rcu_dereference(netlbl_unlhsh)->tbl[bkt]);
577 } else {
578 INIT_LIST_HEAD(&iface->list);
579 if (rcu_dereference(netlbl_unlhsh_def) != NULL)
580 goto add_iface_failure;
581 rcu_assign_pointer(netlbl_unlhsh_def, iface);
582 }
583 spin_unlock(&netlbl_unlhsh_lock);
584
585 return iface;
586
587add_iface_failure:
588 spin_unlock(&netlbl_unlhsh_lock);
589 kfree(iface);
590 return NULL;
591}
592
593/**
594 * netlbl_unlhsh_add - Adds a new entry to the unlabeled connection hash table
595 * @net: network namespace
596 * @dev_name: interface name
597 * @addr: IP address in network byte order
598 * @mask: address mask in network byte order
599 * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6)
600 * @secid: LSM secid value for the entry
601 * @audit_info: NetLabel audit information
602 *
603 * Description:
604 * Adds a new entry to the unlabeled connection hash table. Returns zero on
605 * success, negative values on failure.
606 *
607 */
608static int netlbl_unlhsh_add(struct net *net,
609 const char *dev_name,
610 const void *addr,
611 const void *mask,
612 u32 addr_len,
613 u32 secid,
614 struct netlbl_audit *audit_info)
615{
616 int ret_val;
617 int ifindex;
618 struct net_device *dev;
619 struct netlbl_unlhsh_iface *iface;
620 struct in_addr *addr4, *mask4;
621 struct in6_addr *addr6, *mask6;
622 struct audit_buffer *audit_buf = NULL;
623 char *secctx = NULL;
624 u32 secctx_len;
625
626 if (addr_len != sizeof(struct in_addr) &&
627 addr_len != sizeof(struct in6_addr))
628 return -EINVAL;
629
630 rcu_read_lock();
631 if (dev_name != NULL) {
632 dev = dev_get_by_name(net, dev_name);
633 if (dev == NULL) {
634 ret_val = -ENODEV;
635 goto unlhsh_add_return;
636 }
637 ifindex = dev->ifindex;
638 dev_put(dev);
639 iface = netlbl_unlhsh_search_iface(ifindex);
640 } else {
641 ifindex = 0;
642 iface = rcu_dereference(netlbl_unlhsh_def);
643 }
644 if (iface == NULL) {
645 iface = netlbl_unlhsh_add_iface(ifindex);
646 if (iface == NULL) {
647 ret_val = -ENOMEM;
648 goto unlhsh_add_return;
649 }
650 }
651 audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCADD,
652 audit_info);
653 switch (addr_len) {
654 case sizeof(struct in_addr):
655 addr4 = (struct in_addr *)addr;
656 mask4 = (struct in_addr *)mask;
657 ret_val = netlbl_unlhsh_add_addr4(iface, addr4, mask4, secid);
658 if (audit_buf != NULL)
659 netlbl_unlabel_audit_addr4(audit_buf,
660 dev_name,
661 addr4->s_addr,
662 mask4->s_addr);
663 break;
664#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
665 case sizeof(struct in6_addr):
666 addr6 = (struct in6_addr *)addr;
667 mask6 = (struct in6_addr *)mask;
668 ret_val = netlbl_unlhsh_add_addr6(iface, addr6, mask6, secid);
669 if (audit_buf != NULL)
670 netlbl_unlabel_audit_addr6(audit_buf,
671 dev_name,
672 addr6, mask6);
673 break;
674#endif /* IPv6 */
675 default:
676 ret_val = -EINVAL;
677 }
678 if (ret_val == 0)
679 atomic_inc(&netlabel_mgmt_protocount);
680
681unlhsh_add_return:
682 rcu_read_unlock();
683 if (audit_buf != NULL) {
684 if (security_secid_to_secctx(secid,
685 &secctx,
686 &secctx_len) == 0) {
687 audit_log_format(audit_buf, " sec_obj=%s", secctx);
688 security_release_secctx(secctx, secctx_len);
689 }
690 audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0);
691 audit_log_end(audit_buf);
692 }
693 return ret_val;
694}
695
696/**
697 * netlbl_unlhsh_remove_addr4 - Remove an IPv4 address entry
698 * @net: network namespace
699 * @iface: interface entry
700 * @addr: IP address
701 * @mask: IP address mask
702 * @audit_info: NetLabel audit information
703 *
704 * Description:
705 * Remove an IP address entry from the unlabeled connection hash table.
706 * Returns zero on success, negative values on failure. The caller is
707 * responsible for calling the rcu_read_[un]lock() functions.
708 *
709 */
710static int netlbl_unlhsh_remove_addr4(struct net *net,
711 struct netlbl_unlhsh_iface *iface,
712 const struct in_addr *addr,
713 const struct in_addr *mask,
714 struct netlbl_audit *audit_info)
715{
716 int ret_val = -ENOENT;
717 struct netlbl_unlhsh_addr4 *entry;
718 struct audit_buffer *audit_buf = NULL;
719 struct net_device *dev;
720 char *secctx = NULL;
721 u32 secctx_len;
722
723 spin_lock(&netlbl_unlhsh_lock);
724 entry = netlbl_unlhsh_search_addr4(addr->s_addr, iface);
725 if (entry != NULL &&
726 entry->addr == addr->s_addr && entry->mask == mask->s_addr) {
727 entry->valid = 0;
728 list_del_rcu(&entry->list);
729 ret_val = 0;
730 }
731 spin_unlock(&netlbl_unlhsh_lock);
732
733 audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL,
734 audit_info);
735 if (audit_buf != NULL) {
736 dev = dev_get_by_index(net, iface->ifindex);
737 netlbl_unlabel_audit_addr4(audit_buf,
738 (dev != NULL ? dev->name : NULL),
 739 addr->s_addr, mask->s_addr);
740 if (dev != NULL)
741 dev_put(dev);
 742 if (entry != NULL &&
 743 security_secid_to_secctx(entry->secid,
 744 &secctx, &secctx_len) == 0) {
745 audit_log_format(audit_buf, " sec_obj=%s", secctx);
746 security_release_secctx(secctx, secctx_len);
747 }
748 audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0);
749 audit_log_end(audit_buf);
750 }
751
752 if (ret_val == 0)
753 call_rcu(&entry->rcu, netlbl_unlhsh_free_addr4);
754 return ret_val;
755}
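
The removal above follows the usual RCU discipline for list entries: clear the valid flag and unlink with list_del_rcu() while holding the spinlock, let any readers inside rcu_read_lock()/rcu_read_unlock() sections finish, then free the memory from an RCU callback. A stripped-down, self-contained sketch of that lifecycle (hypothetical demo_* names, simplified entry type):

#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct demo_entry {
	u32 valid;
	struct list_head list;
	struct rcu_head rcu;
};

static DEFINE_SPINLOCK(demo_lock);

static void demo_entry_free(struct rcu_head *head)
{
	/* Runs after a grace period; no reader can still hold the pointer. */
	kfree(container_of(head, struct demo_entry, rcu));
}

static void demo_entry_remove(struct demo_entry *entry)
{
	spin_lock(&demo_lock);
	entry->valid = 0;            /* racing readers see the entry as dead */
	list_del_rcu(&entry->list);  /* unlink without waiting for readers */
	spin_unlock(&demo_lock);

	call_rcu(&entry->rcu, demo_entry_free);
}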
756
757#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
758/**
759 * netlbl_unlhsh_remove_addr6 - Remove an IPv6 address entry
760 * @net: network namespace
761 * @iface: interface entry
762 * @addr: IP address
763 * @mask: IP address mask
764 * @audit_info: NetLabel audit information
765 *
766 * Description:
767 * Remove an IP address entry from the unlabeled connection hash table.
768 * Returns zero on success, negative values on failure. The caller is
769 * responsible for calling the rcu_read_[un]lock() functions.
770 *
771 */
772static int netlbl_unlhsh_remove_addr6(struct net *net,
773 struct netlbl_unlhsh_iface *iface,
774 const struct in6_addr *addr,
775 const struct in6_addr *mask,
776 struct netlbl_audit *audit_info)
777{
778 int ret_val = -ENOENT;
779 struct netlbl_unlhsh_addr6 *entry;
780 struct audit_buffer *audit_buf = NULL;
781 struct net_device *dev;
782 char *secctx = NULL;
783 u32 secctx_len;
784
785 spin_lock(&netlbl_unlhsh_lock);
786 entry = netlbl_unlhsh_search_addr6(addr, iface);
787 if (entry != NULL &&
788 (ipv6_addr_equal(&entry->addr, addr) &&
789 ipv6_addr_equal(&entry->mask, mask))) {
790 entry->valid = 0;
791 list_del_rcu(&entry->list);
792 ret_val = 0;
793 }
794 spin_unlock(&netlbl_unlhsh_lock);
795
796 audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL,
797 audit_info);
798 if (audit_buf != NULL) {
799 dev = dev_get_by_index(net, iface->ifindex);
800 netlbl_unlabel_audit_addr6(audit_buf,
801 (dev != NULL ? dev->name : NULL),
802 addr, mask);
803 if (dev != NULL)
804 dev_put(dev);
 805 if (entry != NULL &&
 806 security_secid_to_secctx(entry->secid,
 807 &secctx, &secctx_len) == 0) {
808 audit_log_format(audit_buf, " sec_obj=%s", secctx);
809 security_release_secctx(secctx, secctx_len);
810 }
811 audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0);
812 audit_log_end(audit_buf);
813 }
814
815 if (ret_val == 0)
816 call_rcu(&entry->rcu, netlbl_unlhsh_free_addr6);
817 return ret_val;
818}
819#endif /* IPv6 */
820
821/**
822 * netlbl_unlhsh_condremove_iface - Remove an interface entry
823 * @iface: the interface entry
824 *
825 * Description:
826 * Remove an interface entry from the unlabeled connection hash table if it is
827 * empty. An interface entry is considered to be empty if there are no
828 * address entries assigned to it.
829 *
830 */
831static void netlbl_unlhsh_condremove_iface(struct netlbl_unlhsh_iface *iface)
832{
833 struct netlbl_unlhsh_addr4 *iter4;
834 struct netlbl_unlhsh_addr6 *iter6;
835
836 spin_lock(&netlbl_unlhsh_lock);
837 list_for_each_entry_rcu(iter4, &iface->addr4_list, list)
838 if (iter4->valid)
839 goto unlhsh_condremove_failure;
840 list_for_each_entry_rcu(iter6, &iface->addr6_list, list)
841 if (iter6->valid)
842 goto unlhsh_condremove_failure;
843 iface->valid = 0;
844 if (iface->ifindex > 0)
845 list_del_rcu(&iface->list);
846 else
847 rcu_assign_pointer(netlbl_unlhsh_def, NULL);
848 spin_unlock(&netlbl_unlhsh_lock);
849
850 call_rcu(&iface->rcu, netlbl_unlhsh_free_iface);
851 return;
852
853unlhsh_condremove_failure:
854 spin_unlock(&netlbl_unlhsh_lock);
855 return;
856}
857
858/**
859 * netlbl_unlhsh_remove - Remove an entry from the unlabeled hash table
860 * @net: network namespace
861 * @dev_name: interface name
862 * @addr: IP address in network byte order
863 * @mask: address mask in network byte order
864 * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6)
865 * @audit_info: NetLabel audit information
866 *
867 * Description:
 868 * Removes an existing entry from the unlabeled connection hash table.
869 * Returns zero on success, negative values on failure.
870 *
871 */
872static int netlbl_unlhsh_remove(struct net *net,
873 const char *dev_name,
874 const void *addr,
875 const void *mask,
876 u32 addr_len,
877 struct netlbl_audit *audit_info)
878{
879 int ret_val;
880 struct net_device *dev;
881 struct netlbl_unlhsh_iface *iface;
882
883 if (addr_len != sizeof(struct in_addr) &&
884 addr_len != sizeof(struct in6_addr))
885 return -EINVAL;
886
887 rcu_read_lock();
888 if (dev_name != NULL) {
889 dev = dev_get_by_name(net, dev_name);
890 if (dev == NULL) {
891 ret_val = -ENODEV;
892 goto unlhsh_remove_return;
893 }
894 iface = netlbl_unlhsh_search_iface(dev->ifindex);
895 dev_put(dev);
896 } else
897 iface = rcu_dereference(netlbl_unlhsh_def);
898 if (iface == NULL) {
899 ret_val = -ENOENT;
900 goto unlhsh_remove_return;
901 }
902 switch (addr_len) {
903 case sizeof(struct in_addr):
904 ret_val = netlbl_unlhsh_remove_addr4(net,
905 iface, addr, mask,
906 audit_info);
907 break;
908#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
909 case sizeof(struct in6_addr):
910 ret_val = netlbl_unlhsh_remove_addr6(net,
911 iface, addr, mask,
912 audit_info);
913 break;
914#endif /* IPv6 */
915 default:
916 ret_val = -EINVAL;
917 }
918 if (ret_val == 0) {
919 netlbl_unlhsh_condremove_iface(iface);
920 atomic_dec(&netlabel_mgmt_protocount);
921 }
922
923unlhsh_remove_return:
924 rcu_read_unlock();
925 return ret_val;
926}
927
928/*
929 * General Helper Functions
930 */
931
932/**
933 * netlbl_unlhsh_netdev_handler - Network device notification handler
934 * @this: notifier block
935 * @event: the event
936 * @ptr: the network device (cast to void)
937 *
938 * Description:
939 * Handle network device events, although at present all we care about is a
940 * network device going away. In the case of a device going away we clear any
941 * related entries from the unlabeled connection hash table.
942 *
943 */
944static int netlbl_unlhsh_netdev_handler(struct notifier_block *this,
945 unsigned long event,
946 void *ptr)
947{
948 struct net_device *dev = ptr;
949 struct netlbl_unlhsh_iface *iface = NULL;
950
951 if (dev->nd_net != &init_net)
952 return NOTIFY_DONE;
953
954 /* XXX - should this be a check for NETDEV_DOWN or _UNREGISTER? */
955 if (event == NETDEV_DOWN) {
956 spin_lock(&netlbl_unlhsh_lock);
957 iface = netlbl_unlhsh_search_iface(dev->ifindex);
958 if (iface != NULL && iface->valid) {
959 iface->valid = 0;
960 list_del_rcu(&iface->list);
961 } else
962 iface = NULL;
963 spin_unlock(&netlbl_unlhsh_lock);
964 }
965
966 if (iface != NULL)
967 call_rcu(&iface->rcu, netlbl_unlhsh_free_iface);
968
969 return NOTIFY_DONE;
970}
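
The handler above is only half of the notifier mechanism; the matching registration happens further down in this patch, in netlbl_unlabel_init(), and no unregister path is added because that init routine runs once during subsystem start-up. As a self-contained illustration of the pairing (hypothetical demo module, written for the same kernel era as this patch, where the notifier's ptr argument is the struct net_device itself):

#include <linux/init.h>
#include <linux/module.h>
#include <linux/netdevice.h>

static int demo_netdev_event(struct notifier_block *this,
			     unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	if (event == NETDEV_DOWN)
		printk(KERN_INFO "demo: %s is going down\n", dev->name);
	return NOTIFY_DONE;
}

static struct notifier_block demo_netdev_notifier = {
	.notifier_call = demo_netdev_event,
};

static int __init demo_init(void)
{
	return register_netdevice_notifier(&demo_netdev_notifier);
}

static void __exit demo_exit(void)
{
	unregister_netdevice_notifier(&demo_netdev_notifier);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");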
71 971
72/** 972/**
73 * netlbl_unlabel_acceptflg_set - Set the unlabeled accept flag 973 * netlbl_unlabel_acceptflg_set - Set the unlabeled accept flag
@@ -84,11 +984,8 @@ static void netlbl_unlabel_acceptflg_set(u8 value,
84 struct audit_buffer *audit_buf; 984 struct audit_buffer *audit_buf;
85 u8 old_val; 985 u8 old_val;
86 986
87 spin_lock(&netlabel_unlabel_acceptflg_lock);
88 old_val = netlabel_unlabel_acceptflg; 987 old_val = netlabel_unlabel_acceptflg;
89 netlabel_unlabel_acceptflg = value; 988 netlabel_unlabel_acceptflg = value;
90 spin_unlock(&netlabel_unlabel_acceptflg_lock);
91
92 audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_ALLOW, 989 audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_ALLOW,
93 audit_info); 990 audit_info);
94 if (audit_buf != NULL) { 991 if (audit_buf != NULL) {
@@ -98,6 +995,48 @@ static void netlbl_unlabel_acceptflg_set(u8 value,
98 } 995 }
99} 996}
100 997
998/**
999 * netlbl_unlabel_addrinfo_get - Get the IPv4/6 address information
1000 * @info: the Generic NETLINK info block
1001 * @addr: the IP address
1002 * @mask: the IP address mask
1003 * @len: the address length
1004 *
1005 * Description:
1006 * Examine the Generic NETLINK message and extract the IP address information.
1007 * Returns zero on success, negative values on failure.
1008 *
1009 */
1010static int netlbl_unlabel_addrinfo_get(struct genl_info *info,
1011 void **addr,
1012 void **mask,
1013 u32 *len)
1014{
1015 u32 addr_len;
1016
1017 if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR]) {
1018 addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]);
1019 if (addr_len != sizeof(struct in_addr) &&
1020 addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV4MASK]))
1021 return -EINVAL;
1022 *len = addr_len;
1023 *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]);
1024 *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4MASK]);
1025 return 0;
1026 } else if (info->attrs[NLBL_UNLABEL_A_IPV6ADDR]) {
1027 addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]);
1028 if (addr_len != sizeof(struct in6_addr) &&
1029 addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV6MASK]))
1030 return -EINVAL;
1031 *len = addr_len;
1032 *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]);
1033 *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6MASK]);
1034 return 0;
1035 }
1036
1037 return -EINVAL;
1038}
1039
101/* 1040/*
102 * NetLabel Command Handlers 1041 * NetLabel Command Handlers
103 */ 1042 */
@@ -155,11 +1094,9 @@ static int netlbl_unlabel_list(struct sk_buff *skb, struct genl_info *info)
155 goto list_failure; 1094 goto list_failure;
156 } 1095 }
157 1096
158 rcu_read_lock();
159 ret_val = nla_put_u8(ans_skb, 1097 ret_val = nla_put_u8(ans_skb,
160 NLBL_UNLABEL_A_ACPTFLG, 1098 NLBL_UNLABEL_A_ACPTFLG,
161 netlabel_unlabel_acceptflg); 1099 netlabel_unlabel_acceptflg);
162 rcu_read_unlock();
163 if (ret_val != 0) 1100 if (ret_val != 0)
164 goto list_failure; 1101 goto list_failure;
165 1102
@@ -175,11 +1112,489 @@ list_failure:
175 return ret_val; 1112 return ret_val;
176} 1113}
177 1114
1115/**
1116 * netlbl_unlabel_staticadd - Handle a STATICADD message
1117 * @skb: the NETLINK buffer
1118 * @info: the Generic NETLINK info block
1119 *
1120 * Description:
1121 * Process a user generated STATICADD message and add a new unlabeled
1122 * connection entry to the hash table. Returns zero on success, negative
1123 * values on failure.
1124 *
1125 */
1126static int netlbl_unlabel_staticadd(struct sk_buff *skb,
1127 struct genl_info *info)
1128{
1129 int ret_val;
1130 char *dev_name;
1131 void *addr;
1132 void *mask;
1133 u32 addr_len;
1134 u32 secid;
1135 struct netlbl_audit audit_info;
1136
1137 /* Don't allow users to add both IPv4 and IPv6 addresses for a
1138 * single entry. However, allow users to create two entries, one each
 1139 * for IPv4 and IPv6, with the same LSM security context, which should
1140 * achieve the same result. */
1141 if (!info->attrs[NLBL_UNLABEL_A_SECCTX] ||
1142 !info->attrs[NLBL_UNLABEL_A_IFACE] ||
1143 !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
1144 !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
1145 (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
1146 !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
1147 return -EINVAL;
1148
1149 netlbl_netlink_auditinfo(skb, &audit_info);
1150
1151 ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
1152 if (ret_val != 0)
1153 return ret_val;
1154 dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]);
1155 ret_val = security_secctx_to_secid(
1156 nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]),
1157 nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]),
1158 &secid);
1159 if (ret_val != 0)
1160 return ret_val;
1161
1162 return netlbl_unlhsh_add(&init_net,
1163 dev_name, addr, mask, addr_len, secid,
1164 &audit_info);
1165}
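
For reference, a userspace sketch of generating the STATICADD message this handler consumes, using libnl-3. The command and attribute numbers below simply mirror the order of the enums this patch adds to netlabel_unlabeled.h; the Generic NETLINK family name "NLBL_UNLBL", the genl version of 1 and the example SELinux context string are assumptions, and error handling is omitted. The request needs CAP_NET_ADMIN because the op is registered with GENL_ADMIN_PERM.

#include <arpa/inet.h>
#include <string.h>
#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>

enum { NLBL_UNLABEL_C_STATICADD = 3 };          /* see the patched header */
enum {
	NLBL_UNLABEL_A_IPV4ADDR = 4,
	NLBL_UNLABEL_A_IPV4MASK = 5,
	NLBL_UNLABEL_A_IFACE    = 6,
	NLBL_UNLABEL_A_SECCTX   = 7,
};

int main(void)
{
	struct nl_sock *sk = nl_socket_alloc();
	struct nl_msg *msg = nlmsg_alloc();
	struct in_addr addr, mask;
	const char *ctx = "system_u:object_r:netlabel_peer_t:s0"; /* example */
	int family;

	genl_connect(sk);
	family = genl_ctrl_resolve(sk, "NLBL_UNLBL");   /* assumed family name */

	inet_pton(AF_INET, "10.0.0.0", &addr);
	inet_pton(AF_INET, "255.0.0.0", &mask);

	genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
		    NLBL_UNLABEL_C_STATICADD, 1);
	nla_put_string(msg, NLBL_UNLABEL_A_IFACE, "eth0");
	nla_put(msg, NLBL_UNLABEL_A_IPV4ADDR, sizeof(addr), &addr);
	nla_put(msg, NLBL_UNLABEL_A_IPV4MASK, sizeof(mask), &mask);
	nla_put(msg, NLBL_UNLABEL_A_SECCTX, strlen(ctx), ctx);

	nl_send_auto(sk, msg);

	nlmsg_free(msg);
	nl_socket_free(sk);
	return 0;
}

STATICREMOVE, STATICADDDEF and STATICREMOVEDEF requests are built the same way, dropping NLBL_UNLABEL_A_SECCTX or NLBL_UNLABEL_A_IFACE as the per-command attribute lists in netlabel_unlabeled.h describe.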
1166
1167/**
1168 * netlbl_unlabel_staticadddef - Handle a STATICADDDEF message
1169 * @skb: the NETLINK buffer
1170 * @info: the Generic NETLINK info block
1171 *
1172 * Description:
1173 * Process a user generated STATICADDDEF message and add a new default
1174 * unlabeled connection entry. Returns zero on success, negative values on
1175 * failure.
1176 *
1177 */
1178static int netlbl_unlabel_staticadddef(struct sk_buff *skb,
1179 struct genl_info *info)
1180{
1181 int ret_val;
1182 void *addr;
1183 void *mask;
1184 u32 addr_len;
1185 u32 secid;
1186 struct netlbl_audit audit_info;
1187
1188 /* Don't allow users to add both IPv4 and IPv6 addresses for a
1189 * single entry. However, allow users to create two entries, one each
1190 * for IPv4 and IPv6, with the same LSM security context which should
1191 * achieve the same result. */
1192 if (!info->attrs[NLBL_UNLABEL_A_SECCTX] ||
1193 !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
1194 !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
1195 (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
1196 !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
1197 return -EINVAL;
1198
1199 netlbl_netlink_auditinfo(skb, &audit_info);
1200
1201 ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
1202 if (ret_val != 0)
1203 return ret_val;
1204 ret_val = security_secctx_to_secid(
1205 nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]),
1206 nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]),
1207 &secid);
1208 if (ret_val != 0)
1209 return ret_val;
1210
1211 return netlbl_unlhsh_add(&init_net,
1212 NULL, addr, mask, addr_len, secid,
1213 &audit_info);
1214}
1215
1216/**
1217 * netlbl_unlabel_staticremove - Handle a STATICREMOVE message
1218 * @skb: the NETLINK buffer
1219 * @info: the Generic NETLINK info block
1220 *
1221 * Description:
1222 * Process a user generated STATICREMOVE message and remove the specified
1223 * unlabeled connection entry. Returns zero on success, negative values on
1224 * failure.
1225 *
1226 */
1227static int netlbl_unlabel_staticremove(struct sk_buff *skb,
1228 struct genl_info *info)
1229{
1230 int ret_val;
1231 char *dev_name;
1232 void *addr;
1233 void *mask;
1234 u32 addr_len;
1235 struct netlbl_audit audit_info;
1236
1237 /* See the note in netlbl_unlabel_staticadd() about not allowing both
1238 * IPv4 and IPv6 in the same entry. */
1239 if (!info->attrs[NLBL_UNLABEL_A_IFACE] ||
1240 !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
1241 !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
1242 (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
1243 !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
1244 return -EINVAL;
1245
1246 netlbl_netlink_auditinfo(skb, &audit_info);
1247
1248 ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
1249 if (ret_val != 0)
1250 return ret_val;
1251 dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]);
1252
1253 return netlbl_unlhsh_remove(&init_net,
1254 dev_name, addr, mask, addr_len,
1255 &audit_info);
1256}
1257
1258/**
1259 * netlbl_unlabel_staticremovedef - Handle a STATICREMOVEDEF message
1260 * @skb: the NETLINK buffer
1261 * @info: the Generic NETLINK info block
1262 *
1263 * Description:
1264 * Process a user generated STATICREMOVEDEF message and remove the default
1265 * unlabeled connection entry. Returns zero on success, negative values on
1266 * failure.
1267 *
1268 */
1269static int netlbl_unlabel_staticremovedef(struct sk_buff *skb,
1270 struct genl_info *info)
1271{
1272 int ret_val;
1273 void *addr;
1274 void *mask;
1275 u32 addr_len;
1276 struct netlbl_audit audit_info;
1277
1278 /* See the note in netlbl_unlabel_staticadd() about not allowing both
1279 * IPv4 and IPv6 in the same entry. */
1280 if (!((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
1281 !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
1282 (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
1283 !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
1284 return -EINVAL;
1285
1286 netlbl_netlink_auditinfo(skb, &audit_info);
1287
1288 ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
1289 if (ret_val != 0)
1290 return ret_val;
1291
1292 return netlbl_unlhsh_remove(&init_net,
1293 NULL, addr, mask, addr_len,
1294 &audit_info);
1295}
1296
1297
1298/**
1299 * netlbl_unlabel_staticlist_gen - Generate messages for STATICLIST[DEF]
1300 * @cmd: command/message
1301 * @iface: the interface entry
1302 * @addr4: the IPv4 address entry
1303 * @addr6: the IPv6 address entry
1304 * @arg: the netlbl_unlhsh_walk_arg structure
1305 *
1306 * Description:
1307 * This function is designed to be used to generate a response for a
 1308 * STATICLIST or STATICLISTDEF message. Either @addr4 or @addr6 may be
 1309 * specified, but not both; the caller should set the unused pointer to
 1310 * NULL. Returns the size of the message on success, negative
1311 * values on failure.
1312 *
1313 */
1314static int netlbl_unlabel_staticlist_gen(u32 cmd,
1315 const struct netlbl_unlhsh_iface *iface,
1316 const struct netlbl_unlhsh_addr4 *addr4,
1317 const struct netlbl_unlhsh_addr6 *addr6,
1318 void *arg)
1319{
1320 int ret_val = -ENOMEM;
1321 struct netlbl_unlhsh_walk_arg *cb_arg = arg;
1322 struct net_device *dev;
1323 void *data;
1324 u32 secid;
1325 char *secctx;
1326 u32 secctx_len;
1327
1328 data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).pid,
1329 cb_arg->seq, &netlbl_unlabel_gnl_family,
1330 NLM_F_MULTI, cmd);
1331 if (data == NULL)
1332 goto list_cb_failure;
1333
1334 if (iface->ifindex > 0) {
1335 dev = dev_get_by_index(&init_net, iface->ifindex);
1336 ret_val = nla_put_string(cb_arg->skb,
1337 NLBL_UNLABEL_A_IFACE, dev->name);
1338 dev_put(dev);
1339 if (ret_val != 0)
1340 goto list_cb_failure;
1341 }
1342
1343 if (addr4) {
1344 struct in_addr addr_struct;
1345
1346 addr_struct.s_addr = addr4->addr;
1347 ret_val = nla_put(cb_arg->skb,
1348 NLBL_UNLABEL_A_IPV4ADDR,
1349 sizeof(struct in_addr),
1350 &addr_struct);
1351 if (ret_val != 0)
1352 goto list_cb_failure;
1353
1354 addr_struct.s_addr = addr4->mask;
1355 ret_val = nla_put(cb_arg->skb,
1356 NLBL_UNLABEL_A_IPV4MASK,
1357 sizeof(struct in_addr),
1358 &addr_struct);
1359 if (ret_val != 0)
1360 goto list_cb_failure;
1361
1362 secid = addr4->secid;
1363 } else {
1364 ret_val = nla_put(cb_arg->skb,
1365 NLBL_UNLABEL_A_IPV6ADDR,
1366 sizeof(struct in6_addr),
1367 &addr6->addr);
1368 if (ret_val != 0)
1369 goto list_cb_failure;
1370
1371 ret_val = nla_put(cb_arg->skb,
1372 NLBL_UNLABEL_A_IPV6MASK,
1373 sizeof(struct in6_addr),
1374 &addr6->mask);
1375 if (ret_val != 0)
1376 goto list_cb_failure;
1377
1378 secid = addr6->secid;
1379 }
1380
1381 ret_val = security_secid_to_secctx(secid, &secctx, &secctx_len);
1382 if (ret_val != 0)
1383 goto list_cb_failure;
1384 ret_val = nla_put(cb_arg->skb,
1385 NLBL_UNLABEL_A_SECCTX,
1386 secctx_len,
1387 secctx);
1388 security_release_secctx(secctx, secctx_len);
1389 if (ret_val != 0)
1390 goto list_cb_failure;
1391
1392 cb_arg->seq++;
1393 return genlmsg_end(cb_arg->skb, data);
1394
1395list_cb_failure:
1396 genlmsg_cancel(cb_arg->skb, data);
1397 return ret_val;
1398}
1399
1400/**
1401 * netlbl_unlabel_staticlist - Handle a STATICLIST message
1402 * @skb: the NETLINK buffer
1403 * @cb: the NETLINK callback
1404 *
1405 * Description:
1406 * Process a user generated STATICLIST message and dump the unlabeled
1407 * connection hash table in a form suitable for use in a kernel generated
1408 * STATICLIST message. Returns the length of @skb.
1409 *
1410 */
1411static int netlbl_unlabel_staticlist(struct sk_buff *skb,
1412 struct netlink_callback *cb)
1413{
1414 struct netlbl_unlhsh_walk_arg cb_arg;
1415 u32 skip_bkt = cb->args[0];
1416 u32 skip_chain = cb->args[1];
1417 u32 skip_addr4 = cb->args[2];
1418 u32 skip_addr6 = cb->args[3];
1419 u32 iter_bkt;
1420 u32 iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0;
1421 struct netlbl_unlhsh_iface *iface;
1422 struct netlbl_unlhsh_addr4 *addr4;
1423 struct netlbl_unlhsh_addr6 *addr6;
1424
1425 cb_arg.nl_cb = cb;
1426 cb_arg.skb = skb;
1427 cb_arg.seq = cb->nlh->nlmsg_seq;
1428
1429 rcu_read_lock();
1430 for (iter_bkt = skip_bkt;
1431 iter_bkt < rcu_dereference(netlbl_unlhsh)->size;
1432 iter_bkt++, iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0) {
1433 list_for_each_entry_rcu(iface,
1434 &rcu_dereference(netlbl_unlhsh)->tbl[iter_bkt],
1435 list) {
1436 if (!iface->valid ||
1437 iter_chain++ < skip_chain)
1438 continue;
1439 list_for_each_entry_rcu(addr4,
1440 &iface->addr4_list,
1441 list) {
1442 if (!addr4->valid || iter_addr4++ < skip_addr4)
1443 continue;
1444 if (netlbl_unlabel_staticlist_gen(
1445 NLBL_UNLABEL_C_STATICLIST,
1446 iface,
1447 addr4,
1448 NULL,
1449 &cb_arg) < 0) {
1450 iter_addr4--;
1451 iter_chain--;
1452 goto unlabel_staticlist_return;
1453 }
1454 }
1455 list_for_each_entry_rcu(addr6,
1456 &iface->addr6_list,
1457 list) {
1458 if (!addr6->valid || iter_addr6++ < skip_addr6)
1459 continue;
1460 if (netlbl_unlabel_staticlist_gen(
1461 NLBL_UNLABEL_C_STATICLIST,
1462 iface,
1463 NULL,
1464 addr6,
1465 &cb_arg) < 0) {
1466 iter_addr6--;
1467 iter_chain--;
1468 goto unlabel_staticlist_return;
1469 }
1470 }
1471 }
1472 }
1473
1474unlabel_staticlist_return:
1475 rcu_read_unlock();
1476 cb->args[0] = skip_bkt;
1477 cb->args[1] = skip_chain;
1478 cb->args[2] = skip_addr4;
1479 cb->args[3] = skip_addr6;
1480 return skb->len;
1481}
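
The skip_*/iter_* bookkeeping above is the standard netlink dump resume pattern: cb->args[] survives between invocations of the dump callback, so when the response skb fills up the walk records where it stopped and the next invocation fast-forwards past the entries already sent. A toy, purely userspace illustration of the same two-level resume logic (hypothetical sizes, printf() standing in for message generation):

#include <stdio.h>

#define BUCKETS    4
#define PER_BUCKET 3
#define BUDGET     5    /* pretend the skb holds five records per pass */

/* args[] plays the role of cb->args[]: it persists between calls. */
static int dump(long args[2])
{
	int emitted = 0;
	int bkt, idx, skip_idx = args[1];

	for (bkt = args[0]; bkt < BUCKETS; bkt++, skip_idx = 0) {
		for (idx = skip_idx; idx < PER_BUCKET; idx++) {
			if (emitted == BUDGET) {
				args[0] = bkt;   /* resume point */
				args[1] = idx;
				return emitted;
			}
			printf("record %d.%d\n", bkt, idx);
			emitted++;
		}
	}
	args[0] = BUCKETS;               /* walk complete */
	args[1] = 0;
	return emitted;
}

int main(void)
{
	long args[2] = { 0, 0 };

	while (dump(args) == BUDGET)
		printf("-- buffer full, dump callback would be re-entered --\n");
	return 0;
}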
1482
1483/**
1484 * netlbl_unlabel_staticlistdef - Handle a STATICLISTDEF message
1485 * @skb: the NETLINK buffer
1486 * @cb: the NETLINK callback
1487 *
1488 * Description:
1489 * Process a user generated STATICLISTDEF message and dump the default
1490 * unlabeled connection entry in a form suitable for use in a kernel generated
1491 * STATICLISTDEF message. Returns the length of @skb.
1492 *
1493 */
1494static int netlbl_unlabel_staticlistdef(struct sk_buff *skb,
1495 struct netlink_callback *cb)
1496{
1497 struct netlbl_unlhsh_walk_arg cb_arg;
1498 struct netlbl_unlhsh_iface *iface;
1499 u32 skip_addr4 = cb->args[0];
1500 u32 skip_addr6 = cb->args[1];
1501 u32 iter_addr4 = 0, iter_addr6 = 0;
1502 struct netlbl_unlhsh_addr4 *addr4;
1503 struct netlbl_unlhsh_addr6 *addr6;
1504
1505 cb_arg.nl_cb = cb;
1506 cb_arg.skb = skb;
1507 cb_arg.seq = cb->nlh->nlmsg_seq;
1508
1509 rcu_read_lock();
1510 iface = rcu_dereference(netlbl_unlhsh_def);
1511 if (iface == NULL || !iface->valid)
1512 goto unlabel_staticlistdef_return;
1513
1514 list_for_each_entry_rcu(addr4, &iface->addr4_list, list) {
1515 if (!addr4->valid || iter_addr4++ < skip_addr4)
1516 continue;
1517 if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF,
1518 iface,
1519 addr4,
1520 NULL,
1521 &cb_arg) < 0) {
1522 iter_addr4--;
1523 goto unlabel_staticlistdef_return;
1524 }
1525 }
1526 list_for_each_entry_rcu(addr6, &iface->addr6_list, list) {
 1527 if (!addr6->valid || iter_addr6++ < skip_addr6)
1528 continue;
1529 if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF,
1530 iface,
1531 NULL,
1532 addr6,
1533 &cb_arg) < 0) {
1534 iter_addr6--;
1535 goto unlabel_staticlistdef_return;
1536 }
1537 }
1538
1539unlabel_staticlistdef_return:
1540 rcu_read_unlock();
1541 cb->args[0] = skip_addr4;
1542 cb->args[1] = skip_addr6;
1543 return skb->len;
1544}
178 1545
179/* 1546/*
180 * NetLabel Generic NETLINK Command Definitions 1547 * NetLabel Generic NETLINK Command Definitions
181 */ 1548 */
182 1549
1550static struct genl_ops netlbl_unlabel_genl_c_staticadd = {
1551 .cmd = NLBL_UNLABEL_C_STATICADD,
1552 .flags = GENL_ADMIN_PERM,
1553 .policy = netlbl_unlabel_genl_policy,
1554 .doit = netlbl_unlabel_staticadd,
1555 .dumpit = NULL,
1556};
1557
1558static struct genl_ops netlbl_unlabel_genl_c_staticremove = {
1559 .cmd = NLBL_UNLABEL_C_STATICREMOVE,
1560 .flags = GENL_ADMIN_PERM,
1561 .policy = netlbl_unlabel_genl_policy,
1562 .doit = netlbl_unlabel_staticremove,
1563 .dumpit = NULL,
1564};
1565
1566static struct genl_ops netlbl_unlabel_genl_c_staticlist = {
1567 .cmd = NLBL_UNLABEL_C_STATICLIST,
1568 .flags = 0,
1569 .policy = netlbl_unlabel_genl_policy,
1570 .doit = NULL,
1571 .dumpit = netlbl_unlabel_staticlist,
1572};
1573
1574static struct genl_ops netlbl_unlabel_genl_c_staticadddef = {
1575 .cmd = NLBL_UNLABEL_C_STATICADDDEF,
1576 .flags = GENL_ADMIN_PERM,
1577 .policy = netlbl_unlabel_genl_policy,
1578 .doit = netlbl_unlabel_staticadddef,
1579 .dumpit = NULL,
1580};
1581
1582static struct genl_ops netlbl_unlabel_genl_c_staticremovedef = {
1583 .cmd = NLBL_UNLABEL_C_STATICREMOVEDEF,
1584 .flags = GENL_ADMIN_PERM,
1585 .policy = netlbl_unlabel_genl_policy,
1586 .doit = netlbl_unlabel_staticremovedef,
1587 .dumpit = NULL,
1588};
1589
1590static struct genl_ops netlbl_unlabel_genl_c_staticlistdef = {
1591 .cmd = NLBL_UNLABEL_C_STATICLISTDEF,
1592 .flags = 0,
1593 .policy = netlbl_unlabel_genl_policy,
1594 .doit = NULL,
1595 .dumpit = netlbl_unlabel_staticlistdef,
1596};
1597
183static struct genl_ops netlbl_unlabel_genl_c_accept = { 1598static struct genl_ops netlbl_unlabel_genl_c_accept = {
184 .cmd = NLBL_UNLABEL_C_ACCEPT, 1599 .cmd = NLBL_UNLABEL_C_ACCEPT,
185 .flags = GENL_ADMIN_PERM, 1600 .flags = GENL_ADMIN_PERM,
@@ -196,7 +1611,6 @@ static struct genl_ops netlbl_unlabel_genl_c_list = {
196 .dumpit = NULL, 1611 .dumpit = NULL,
197}; 1612};
198 1613
199
200/* 1614/*
201 * NetLabel Generic NETLINK Protocol Functions 1615 * NetLabel Generic NETLINK Protocol Functions
202 */ 1616 */
@@ -218,6 +1632,36 @@ int netlbl_unlabel_genl_init(void)
218 return ret_val; 1632 return ret_val;
219 1633
220 ret_val = genl_register_ops(&netlbl_unlabel_gnl_family, 1634 ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
1635 &netlbl_unlabel_genl_c_staticadd);
1636 if (ret_val != 0)
1637 return ret_val;
1638
1639 ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
1640 &netlbl_unlabel_genl_c_staticremove);
1641 if (ret_val != 0)
1642 return ret_val;
1643
1644 ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
1645 &netlbl_unlabel_genl_c_staticlist);
1646 if (ret_val != 0)
1647 return ret_val;
1648
1649 ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
1650 &netlbl_unlabel_genl_c_staticadddef);
1651 if (ret_val != 0)
1652 return ret_val;
1653
1654 ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
1655 &netlbl_unlabel_genl_c_staticremovedef);
1656 if (ret_val != 0)
1657 return ret_val;
1658
1659 ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
1660 &netlbl_unlabel_genl_c_staticlistdef);
1661 if (ret_val != 0)
1662 return ret_val;
1663
1664 ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
221 &netlbl_unlabel_genl_c_accept); 1665 &netlbl_unlabel_genl_c_accept);
222 if (ret_val != 0) 1666 if (ret_val != 0)
223 return ret_val; 1667 return ret_val;
@@ -234,8 +1678,58 @@ int netlbl_unlabel_genl_init(void)
234 * NetLabel KAPI Hooks 1678 * NetLabel KAPI Hooks
235 */ 1679 */
236 1680
1681static struct notifier_block netlbl_unlhsh_netdev_notifier = {
1682 .notifier_call = netlbl_unlhsh_netdev_handler,
1683};
1684
1685/**
1686 * netlbl_unlabel_init - Initialize the unlabeled connection hash table
1687 * @size: the number of bits to use for the hash buckets
1688 *
1689 * Description:
1690 * Initializes the unlabeled connection hash table and registers a network
1691 * device notification handler. This function should only be called by the
1692 * NetLabel subsystem itself during initialization. Returns zero on success,
1693 * non-zero values on error.
1694 *
1695 */
1696int netlbl_unlabel_init(u32 size)
1697{
1698 u32 iter;
1699 struct netlbl_unlhsh_tbl *hsh_tbl;
1700
1701 if (size == 0)
1702 return -EINVAL;
1703
1704 hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL);
1705 if (hsh_tbl == NULL)
1706 return -ENOMEM;
1707 hsh_tbl->size = 1 << size;
1708 hsh_tbl->tbl = kcalloc(hsh_tbl->size,
1709 sizeof(struct list_head),
1710 GFP_KERNEL);
1711 if (hsh_tbl->tbl == NULL) {
1712 kfree(hsh_tbl);
1713 return -ENOMEM;
1714 }
1715 for (iter = 0; iter < hsh_tbl->size; iter++)
1716 INIT_LIST_HEAD(&hsh_tbl->tbl[iter]);
1717
1718 rcu_read_lock();
1719 spin_lock(&netlbl_unlhsh_lock);
1720 rcu_assign_pointer(netlbl_unlhsh, hsh_tbl);
1721 spin_unlock(&netlbl_unlhsh_lock);
1722 rcu_read_unlock();
1723
1724 register_netdevice_notifier(&netlbl_unlhsh_netdev_notifier);
1725
1726 return 0;
1727}
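
With NETLBL_UNLHSH_BITSIZE defined as 7 later in this patch, netlbl_unlabel_init(NETLBL_UNLHSH_BITSIZE) allocates 1 << 7 = 128 bucket list heads. Assuming the bucket is then chosen by masking the interface index with size - 1 (the hashing helper sits earlier in netlabel_unlabeled.c and is not part of this hunk, so treat that as an assumption), the arithmetic looks like:

#include <stdio.h>

#define NETLBL_UNLHSH_BITSIZE 7          /* value added by this patch */

int main(void)
{
	unsigned int size = 1u << NETLBL_UNLHSH_BITSIZE;    /* 128 buckets */
	int ifindex;

	/* Assumed bucket selection: ifindex & (size - 1). */
	for (ifindex = 1; ifindex <= 4; ifindex++)
		printf("ifindex %d -> bucket %u of %u\n",
		       ifindex, ifindex & (size - 1), size);
	return 0;
}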
1728
237/** 1729/**
 238 * netlbl_unlabel_getattr - Get the security attributes for an unlabeled packet 1730 * netlbl_unlabel_getattr - Get the security attributes for an unlabeled packet
1731 * @skb: the packet
1732 * @family: protocol family
239 * @secattr: the security attributes 1733 * @secattr: the security attributes
240 * 1734 *
241 * Description: 1735 * Description:
@@ -243,19 +1737,52 @@ int netlbl_unlabel_genl_init(void)
243 * them in @secattr. Returns zero on success and negative values on failure. 1737 * them in @secattr. Returns zero on success and negative values on failure.
244 * 1738 *
245 */ 1739 */
246int netlbl_unlabel_getattr(struct netlbl_lsm_secattr *secattr) 1740int netlbl_unlabel_getattr(const struct sk_buff *skb,
1741 u16 family,
1742 struct netlbl_lsm_secattr *secattr)
247{ 1743{
248 int ret_val; 1744 struct iphdr *hdr4;
1745 struct ipv6hdr *hdr6;
1746 struct netlbl_unlhsh_addr4 *addr4;
1747 struct netlbl_unlhsh_addr6 *addr6;
1748 struct netlbl_unlhsh_iface *iface;
249 1749
250 rcu_read_lock(); 1750 rcu_read_lock();
251 if (netlabel_unlabel_acceptflg == 1) { 1751 iface = netlbl_unlhsh_search_iface_def(skb->iif);
252 netlbl_secattr_init(secattr); 1752 if (iface == NULL)
253 ret_val = 0; 1753 goto unlabel_getattr_nolabel;
254 } else 1754 switch (family) {
255 ret_val = -ENOMSG; 1755 case PF_INET:
1756 hdr4 = ip_hdr(skb);
1757 addr4 = netlbl_unlhsh_search_addr4(hdr4->saddr, iface);
1758 if (addr4 == NULL)
1759 goto unlabel_getattr_nolabel;
1760 secattr->attr.secid = addr4->secid;
1761 break;
1762#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1763 case PF_INET6:
1764 hdr6 = ipv6_hdr(skb);
1765 addr6 = netlbl_unlhsh_search_addr6(&hdr6->saddr, iface);
1766 if (addr6 == NULL)
1767 goto unlabel_getattr_nolabel;
1768 secattr->attr.secid = addr6->secid;
1769 break;
1770#endif /* IPv6 */
1771 default:
1772 goto unlabel_getattr_nolabel;
1773 }
256 rcu_read_unlock(); 1774 rcu_read_unlock();
257 1775
258 return ret_val; 1776 secattr->flags |= NETLBL_SECATTR_SECID;
1777 secattr->type = NETLBL_NLTYPE_UNLABELED;
1778 return 0;
1779
1780unlabel_getattr_nolabel:
1781 rcu_read_unlock();
1782 if (netlabel_unlabel_acceptflg == 0)
1783 return -ENOMSG;
1784 secattr->type = NETLBL_NLTYPE_UNLABELED;
1785 return 0;
259} 1786}
260 1787
261/** 1788/**
diff --git a/net/netlabel/netlabel_unlabeled.h b/net/netlabel/netlabel_unlabeled.h
index c2917fbb42cf..06b1301ac072 100644
--- a/net/netlabel/netlabel_unlabeled.h
+++ b/net/netlabel/netlabel_unlabeled.h
@@ -36,6 +36,116 @@
36/* 36/*
37 * The following NetLabel payloads are supported by the Unlabeled subsystem. 37 * The following NetLabel payloads are supported by the Unlabeled subsystem.
38 * 38 *
39 * o STATICADD
40 * This message is sent from an application to add a new static label for
41 * incoming unlabeled connections.
42 *
43 * Required attributes:
44 *
45 * NLBL_UNLABEL_A_IFACE
46 * NLBL_UNLABEL_A_SECCTX
47 *
48 * If IPv4 is specified the following attributes are required:
49 *
50 * NLBL_UNLABEL_A_IPV4ADDR
51 * NLBL_UNLABEL_A_IPV4MASK
52 *
53 * If IPv6 is specified the following attributes are required:
54 *
55 * NLBL_UNLABEL_A_IPV6ADDR
56 * NLBL_UNLABEL_A_IPV6MASK
57 *
58 * o STATICREMOVE
59 * This message is sent from an application to remove an existing static
60 * label for incoming unlabeled connections.
61 *
62 * Required attributes:
63 *
64 * NLBL_UNLABEL_A_IFACE
65 *
66 * If IPv4 is specified the following attributes are required:
67 *
68 * NLBL_UNLABEL_A_IPV4ADDR
69 * NLBL_UNLABEL_A_IPV4MASK
70 *
71 * If IPv6 is specified the following attributes are required:
72 *
73 * NLBL_UNLABEL_A_IPV6ADDR
74 * NLBL_UNLABEL_A_IPV6MASK
75 *
76 * o STATICLIST
77 * This message can be sent either from an application or by the kernel in
78 * response to an application generated STATICLIST message. When sent by an
79 * application there is no payload and the NLM_F_DUMP flag should be set.
 80 * The kernel should respond with a series of the following messages.
81 *
82 * Required attributes:
83 *
84 * NLBL_UNLABEL_A_IFACE
85 * NLBL_UNLABEL_A_SECCTX
86 *
87 * If IPv4 is specified the following attributes are required:
88 *
89 * NLBL_UNLABEL_A_IPV4ADDR
90 * NLBL_UNLABEL_A_IPV4MASK
91 *
92 * If IPv6 is specified the following attributes are required:
93 *
94 * NLBL_UNLABEL_A_IPV6ADDR
95 * NLBL_UNLABEL_A_IPV6MASK
96 *
97 * o STATICADDDEF
98 * This message is sent from an application to set the default static
99 * label for incoming unlabeled connections.
100 *
101 * Required attribute:
102 *
103 * NLBL_UNLABEL_A_SECCTX
104 *
105 * If IPv4 is specified the following attributes are required:
106 *
107 * NLBL_UNLABEL_A_IPV4ADDR
108 * NLBL_UNLABEL_A_IPV4MASK
109 *
110 * If IPv6 is specified the following attributes are required:
111 *
112 * NLBL_UNLABEL_A_IPV6ADDR
113 * NLBL_UNLABEL_A_IPV6MASK
114 *
115 * o STATICREMOVEDEF
116 * This message is sent from an application to remove the existing default
117 * static label for incoming unlabeled connections.
118 *
119 * If IPv4 is specified the following attributes are required:
120 *
121 * NLBL_UNLABEL_A_IPV4ADDR
122 * NLBL_UNLABEL_A_IPV4MASK
123 *
124 * If IPv6 is specified the following attributes are required:
125 *
126 * NLBL_UNLABEL_A_IPV6ADDR
127 * NLBL_UNLABEL_A_IPV6MASK
128 *
129 * o STATICLISTDEF
130 * This message can be sent either from an application or by the kernel in
131 * response to an application generated STATICLISTDEF message. When sent by
132 * an application there is no payload and the NLM_F_DUMP flag should be set.
 133 * The kernel should respond with the following message.
134 *
135 * Required attribute:
136 *
137 * NLBL_UNLABEL_A_SECCTX
138 *
139 * If IPv4 is specified the following attributes are required:
140 *
141 * NLBL_UNLABEL_A_IPV4ADDR
142 * NLBL_UNLABEL_A_IPV4MASK
143 *
144 * If IPv6 is specified the following attributes are required:
145 *
146 * NLBL_UNLABEL_A_IPV6ADDR
147 * NLBL_UNLABEL_A_IPV6MASK
148 *
39 * o ACCEPT 149 * o ACCEPT
40 * This message is sent from an application to specify if the kernel should 150 * This message is sent from an application to specify if the kernel should
 41 * allow unlabeled packets to pass if they do not match any of the static 151 * allow unlabeled packets to pass if they do not match any of the static
@@ -62,6 +172,12 @@ enum {
62 NLBL_UNLABEL_C_UNSPEC, 172 NLBL_UNLABEL_C_UNSPEC,
63 NLBL_UNLABEL_C_ACCEPT, 173 NLBL_UNLABEL_C_ACCEPT,
64 NLBL_UNLABEL_C_LIST, 174 NLBL_UNLABEL_C_LIST,
175 NLBL_UNLABEL_C_STATICADD,
176 NLBL_UNLABEL_C_STATICREMOVE,
177 NLBL_UNLABEL_C_STATICLIST,
178 NLBL_UNLABEL_C_STATICADDDEF,
179 NLBL_UNLABEL_C_STATICREMOVEDEF,
180 NLBL_UNLABEL_C_STATICLISTDEF,
65 __NLBL_UNLABEL_C_MAX, 181 __NLBL_UNLABEL_C_MAX,
66}; 182};
67#define NLBL_UNLABEL_C_MAX (__NLBL_UNLABEL_C_MAX - 1) 183#define NLBL_UNLABEL_C_MAX (__NLBL_UNLABEL_C_MAX - 1)
@@ -73,6 +189,24 @@ enum {
73 /* (NLA_U8) 189 /* (NLA_U8)
74 * if true then unlabeled packets are allowed to pass, else unlabeled 190 * if true then unlabeled packets are allowed to pass, else unlabeled
75 * packets are rejected */ 191 * packets are rejected */
192 NLBL_UNLABEL_A_IPV6ADDR,
193 /* (NLA_BINARY, struct in6_addr)
194 * an IPv6 address */
195 NLBL_UNLABEL_A_IPV6MASK,
196 /* (NLA_BINARY, struct in6_addr)
197 * an IPv6 address mask */
198 NLBL_UNLABEL_A_IPV4ADDR,
199 /* (NLA_BINARY, struct in_addr)
200 * an IPv4 address */
201 NLBL_UNLABEL_A_IPV4MASK,
202 /* (NLA_BINARY, struct in_addr)
 203 * an IPv4 address mask */
204 NLBL_UNLABEL_A_IFACE,
205 /* (NLA_NULL_STRING)
206 * network interface */
207 NLBL_UNLABEL_A_SECCTX,
208 /* (NLA_BINARY)
209 * a LSM specific security context */
76 __NLBL_UNLABEL_A_MAX, 210 __NLBL_UNLABEL_A_MAX,
77}; 211};
78#define NLBL_UNLABEL_A_MAX (__NLBL_UNLABEL_A_MAX - 1) 212#define NLBL_UNLABEL_A_MAX (__NLBL_UNLABEL_A_MAX - 1)
@@ -80,8 +214,17 @@ enum {
80/* NetLabel protocol functions */ 214/* NetLabel protocol functions */
81int netlbl_unlabel_genl_init(void); 215int netlbl_unlabel_genl_init(void);
82 216
217/* Unlabeled connection hash table size */
218/* XXX - currently this number is an uneducated guess */
219#define NETLBL_UNLHSH_BITSIZE 7
220
221/* General Unlabeled init function */
222int netlbl_unlabel_init(u32 size);
223
83/* Process Unlabeled incoming network packets */ 224/* Process Unlabeled incoming network packets */
84int netlbl_unlabel_getattr(struct netlbl_lsm_secattr *secattr); 225int netlbl_unlabel_getattr(const struct sk_buff *skb,
226 u16 family,
227 struct netlbl_lsm_secattr *secattr);
85 228
86/* Set the default configuration to allow Unlabeled packets */ 229/* Set the default configuration to allow Unlabeled packets */
87int netlbl_unlabel_defconf(void); 230int netlbl_unlabel_defconf(void);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index a4a6bf7deaa4..4ad5fbbb18b4 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -18,6 +18,7 @@
18#include <linux/mm.h> 18#include <linux/mm.h>
19#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/sched.h>
21 22
22#include <linux/sunrpc/types.h> 23#include <linux/sunrpc/types.h>
23#include <linux/sunrpc/xdr.h> 24#include <linux/sunrpc/xdr.h>
diff --git a/security/Kconfig b/security/Kconfig
index 8086e61058e3..389e151e3b68 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -76,6 +76,7 @@ config SECURITY_NETWORK_XFRM
76config SECURITY_CAPABILITIES 76config SECURITY_CAPABILITIES
77 bool "Default Linux Capabilities" 77 bool "Default Linux Capabilities"
78 depends on SECURITY 78 depends on SECURITY
79 default y
79 help 80 help
80 This enables the "default" Linux capabilities functionality. 81 This enables the "default" Linux capabilities functionality.
81 If you are unsure how to answer this question, answer Y. 82 If you are unsure how to answer this question, answer Y.
diff --git a/security/selinux/Kconfig b/security/selinux/Kconfig
index b32a459c0683..2b517d618672 100644
--- a/security/selinux/Kconfig
+++ b/security/selinux/Kconfig
@@ -145,7 +145,7 @@ config SECURITY_SELINUX_POLICYDB_VERSION_MAX
145config SECURITY_SELINUX_POLICYDB_VERSION_MAX_VALUE 145config SECURITY_SELINUX_POLICYDB_VERSION_MAX_VALUE
146 int "NSA SELinux maximum supported policy format version value" 146 int "NSA SELinux maximum supported policy format version value"
147 depends on SECURITY_SELINUX_POLICYDB_VERSION_MAX 147 depends on SECURITY_SELINUX_POLICYDB_VERSION_MAX
148 range 15 21 148 range 15 22
149 default 19 149 default 19
150 help 150 help
151 This option sets the value for the maximum policy format version 151 This option sets the value for the maximum policy format version
diff --git a/security/selinux/Makefile b/security/selinux/Makefile
index dc3502e30b19..00afd85f1edb 100644
--- a/security/selinux/Makefile
+++ b/security/selinux/Makefile
@@ -4,7 +4,14 @@
4 4
5obj-$(CONFIG_SECURITY_SELINUX) := selinux.o ss/ 5obj-$(CONFIG_SECURITY_SELINUX) := selinux.o ss/
6 6
7selinux-y := avc.o hooks.o selinuxfs.o netlink.o nlmsgtab.o netif.o exports.o 7selinux-y := avc.o \
8 hooks.o \
9 selinuxfs.o \
10 netlink.o \
11 nlmsgtab.o \
12 netif.o \
13 netnode.o \
14 exports.o
8 15
9selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o 16selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o
10 17
diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index 81b3dff3cbf0..e8529e2f51e5 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -661,9 +661,18 @@ void avc_audit(u32 ssid, u32 tsid,
661 "daddr", "dest"); 661 "daddr", "dest");
662 break; 662 break;
663 } 663 }
664 if (a->u.net.netif) 664 if (a->u.net.netif > 0) {
665 audit_log_format(ab, " netif=%s", 665 struct net_device *dev;
666 a->u.net.netif); 666
667 /* NOTE: we always use init's namespace */
668 dev = dev_get_by_index(&init_net,
669 a->u.net.netif);
670 if (dev) {
671 audit_log_format(ab, " netif=%s",
672 dev->name);
673 dev_put(dev);
674 }
675 }
667 break; 676 break;
668 } 677 }
669 } 678 }
diff --git a/security/selinux/exports.c b/security/selinux/exports.c
index b6f96943be1f..87d2bb3ea355 100644
--- a/security/selinux/exports.c
+++ b/security/selinux/exports.c
@@ -17,10 +17,14 @@
17#include <linux/selinux.h> 17#include <linux/selinux.h>
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/ipc.h> 19#include <linux/ipc.h>
20#include <asm/atomic.h>
20 21
21#include "security.h" 22#include "security.h"
22#include "objsec.h" 23#include "objsec.h"
23 24
25/* SECMARK reference count */
26extern atomic_t selinux_secmark_refcount;
27
24int selinux_sid_to_string(u32 sid, char **ctx, u32 *ctxlen) 28int selinux_sid_to_string(u32 sid, char **ctx, u32 *ctxlen)
25{ 29{
26 if (selinux_enabled) 30 if (selinux_enabled)
@@ -74,7 +78,7 @@ int selinux_string_to_sid(char *str, u32 *sid)
74} 78}
75EXPORT_SYMBOL_GPL(selinux_string_to_sid); 79EXPORT_SYMBOL_GPL(selinux_string_to_sid);
76 80
77int selinux_relabel_packet_permission(u32 sid) 81int selinux_secmark_relabel_packet_permission(u32 sid)
78{ 82{
79 if (selinux_enabled) { 83 if (selinux_enabled) {
80 struct task_security_struct *tsec = current->security; 84 struct task_security_struct *tsec = current->security;
@@ -84,4 +88,16 @@ int selinux_relabel_packet_permission(u32 sid)
84 } 88 }
85 return 0; 89 return 0;
86} 90}
87EXPORT_SYMBOL_GPL(selinux_relabel_packet_permission); 91EXPORT_SYMBOL_GPL(selinux_secmark_relabel_packet_permission);
92
93void selinux_secmark_refcount_inc(void)
94{
95 atomic_inc(&selinux_secmark_refcount);
96}
97EXPORT_SYMBOL_GPL(selinux_secmark_refcount_inc);
98
99void selinux_secmark_refcount_dec(void)
100{
101 atomic_dec(&selinux_secmark_refcount);
102}
103EXPORT_SYMBOL_GPL(selinux_secmark_refcount_dec);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 64d414efb404..be6de0b8734f 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -12,8 +12,8 @@
12 * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com> 12 * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
13 * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc. 13 * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc.
14 * <dgoeddel@trustedcs.com> 14 * <dgoeddel@trustedcs.com>
15 * Copyright (C) 2006 Hewlett-Packard Development Company, L.P. 15 * Copyright (C) 2006, 2007 Hewlett-Packard Development Company, L.P.
16 * Paul Moore, <paul.moore@hp.com> 16 * Paul Moore <paul.moore@hp.com>
17 * Copyright (C) 2007 Hitachi Software Engineering Co., Ltd. 17 * Copyright (C) 2007 Hitachi Software Engineering Co., Ltd.
18 * Yuichi Nakamura <ynakam@hitachisoft.jp> 18 * Yuichi Nakamura <ynakam@hitachisoft.jp>
19 * 19 *
@@ -50,8 +50,11 @@
50#include <net/icmp.h> 50#include <net/icmp.h>
51#include <net/ip.h> /* for local_port_range[] */ 51#include <net/ip.h> /* for local_port_range[] */
52#include <net/tcp.h> /* struct or_callable used in sock_rcv_skb */ 52#include <net/tcp.h> /* struct or_callable used in sock_rcv_skb */
53#include <net/net_namespace.h>
54#include <net/netlabel.h>
53#include <asm/uaccess.h> 55#include <asm/uaccess.h>
54#include <asm/ioctls.h> 56#include <asm/ioctls.h>
57#include <asm/atomic.h>
55#include <linux/bitops.h> 58#include <linux/bitops.h>
56#include <linux/interrupt.h> 59#include <linux/interrupt.h>
57#include <linux/netdevice.h> /* for network interface checks */ 60#include <linux/netdevice.h> /* for network interface checks */
@@ -76,6 +79,7 @@
76#include "avc.h" 79#include "avc.h"
77#include "objsec.h" 80#include "objsec.h"
78#include "netif.h" 81#include "netif.h"
82#include "netnode.h"
79#include "xfrm.h" 83#include "xfrm.h"
80#include "netlabel.h" 84#include "netlabel.h"
81 85
@@ -89,6 +93,9 @@ extern int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm);
89extern int selinux_compat_net; 93extern int selinux_compat_net;
90extern struct security_operations *security_ops; 94extern struct security_operations *security_ops;
91 95
96/* SECMARK reference count */
97atomic_t selinux_secmark_refcount = ATOMIC_INIT(0);
98
92#ifdef CONFIG_SECURITY_SELINUX_DEVELOP 99#ifdef CONFIG_SECURITY_SELINUX_DEVELOP
93int selinux_enforcing = 0; 100int selinux_enforcing = 0;
94 101
@@ -155,6 +162,21 @@ getsecurity_exit:
155 return len; 162 return len;
156} 163}
157 164
165/**
166 * selinux_secmark_enabled - Check to see if SECMARK is currently enabled
167 *
168 * Description:
169 * This function checks the SECMARK reference counter to see if any SECMARK
170 * targets are currently configured, if the reference counter is greater than
171 * zero SECMARK is considered to be enabled. Returns true (1) if SECMARK is
172 * enabled, false (0) if SECMARK is disabled.
173 *
174 */
175static int selinux_secmark_enabled(void)
176{
177 return (atomic_read(&selinux_secmark_refcount) > 0);
178}
179
158/* Allocate and free functions for each kind of security blob. */ 180/* Allocate and free functions for each kind of security blob. */
159 181
160static int task_alloc_security(struct task_struct *task) 182static int task_alloc_security(struct task_struct *task)
@@ -561,8 +583,8 @@ static int bad_option(struct superblock_security_struct *sbsec, char flag,
561 * Allow filesystems with binary mount data to explicitly set mount point 583 * Allow filesystems with binary mount data to explicitly set mount point
562 * labeling information. 584 * labeling information.
563 */ 585 */
564int selinux_set_mnt_opts(struct super_block *sb, char **mount_options, 586static int selinux_set_mnt_opts(struct super_block *sb, char **mount_options,
565 int *flags, int num_opts) 587 int *flags, int num_opts)
566{ 588{
567 int rc = 0, i; 589 int rc = 0, i;
568 struct task_security_struct *tsec = current->security; 590 struct task_security_struct *tsec = current->security;
@@ -3395,7 +3417,7 @@ out:
3395#endif /* IPV6 */ 3417#endif /* IPV6 */
3396 3418
3397static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad, 3419static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad,
3398 char **addrp, int *len, int src, u8 *proto) 3420 char **addrp, int src, u8 *proto)
3399{ 3421{
3400 int ret = 0; 3422 int ret = 0;
3401 3423
@@ -3404,7 +3426,6 @@ static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad,
3404 ret = selinux_parse_skb_ipv4(skb, ad, proto); 3426 ret = selinux_parse_skb_ipv4(skb, ad, proto);
3405 if (ret || !addrp) 3427 if (ret || !addrp)
3406 break; 3428 break;
3407 *len = 4;
3408 *addrp = (char *)(src ? &ad->u.net.v4info.saddr : 3429 *addrp = (char *)(src ? &ad->u.net.v4info.saddr :
3409 &ad->u.net.v4info.daddr); 3430 &ad->u.net.v4info.daddr);
3410 break; 3431 break;
@@ -3414,7 +3435,6 @@ static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad,
3414 ret = selinux_parse_skb_ipv6(skb, ad, proto); 3435 ret = selinux_parse_skb_ipv6(skb, ad, proto);
3415 if (ret || !addrp) 3436 if (ret || !addrp)
3416 break; 3437 break;
3417 *len = 16;
3418 *addrp = (char *)(src ? &ad->u.net.v6info.saddr : 3438 *addrp = (char *)(src ? &ad->u.net.v6info.saddr :
3419 &ad->u.net.v6info.daddr); 3439 &ad->u.net.v6info.daddr);
3420 break; 3440 break;
@@ -3423,36 +3443,48 @@ static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad,
3423 break; 3443 break;
3424 } 3444 }
3425 3445
3446 if (unlikely(ret))
3447 printk(KERN_WARNING
3448 "SELinux: failure in selinux_parse_skb(),"
3449 " unable to parse packet\n");
3450
3426 return ret; 3451 return ret;
3427} 3452}
3428 3453
3429/** 3454/**
3430 * selinux_skb_extlbl_sid - Determine the external label of a packet 3455 * selinux_skb_peerlbl_sid - Determine the peer label of a packet
3431 * @skb: the packet 3456 * @skb: the packet
3432 * @sid: the packet's SID 3457 * @family: protocol family
3458 * @sid: the packet's peer label SID
3433 * 3459 *
3434 * Description: 3460 * Description:
3435 * Check the various different forms of external packet labeling and determine 3461 * Check the various different forms of network peer labeling and determine
3436 * the external SID for the packet. If only one form of external labeling is 3462 * the peer label/SID for the packet; most of the magic actually occurs in
3437 * present then it is used, if both labeled IPsec and NetLabel labels are 3463 * the security server function security_net_peersid_cmp(). The function
3438 * present then the SELinux type information is taken from the labeled IPsec 3464 * returns zero if the value in @sid is valid (although it may be SECSID_NULL)
3439 * SA and the MLS sensitivity label information is taken from the NetLabel 3465 * or -EACCES if @sid is invalid due to inconsistencies with the different
3440 * security attributes. This bit of "magic" is done in the call to 3466 * peer labels.
3441 * selinux_netlbl_skbuff_getsid().
3442 * 3467 *
3443 */ 3468 */
3444static void selinux_skb_extlbl_sid(struct sk_buff *skb, u32 *sid) 3469static int selinux_skb_peerlbl_sid(struct sk_buff *skb, u16 family, u32 *sid)
3445{ 3470{
3471 int err;
3446 u32 xfrm_sid; 3472 u32 xfrm_sid;
3447 u32 nlbl_sid; 3473 u32 nlbl_sid;
3474 u32 nlbl_type;
3448 3475
3449 selinux_skb_xfrm_sid(skb, &xfrm_sid); 3476 selinux_skb_xfrm_sid(skb, &xfrm_sid);
3450 if (selinux_netlbl_skbuff_getsid(skb, 3477 selinux_netlbl_skbuff_getsid(skb, family, &nlbl_type, &nlbl_sid);
3451 (xfrm_sid == SECSID_NULL ? 3478
3452 SECINITSID_NETMSG : xfrm_sid), 3479 err = security_net_peersid_resolve(nlbl_sid, nlbl_type, xfrm_sid, sid);
3453 &nlbl_sid) != 0) 3480 if (unlikely(err)) {
3454 nlbl_sid = SECSID_NULL; 3481 printk(KERN_WARNING
3455 *sid = (nlbl_sid == SECSID_NULL ? xfrm_sid : nlbl_sid); 3482 "SELinux: failure in selinux_skb_peerlbl_sid(),"
3483 " unable to determine packet's peer label\n");
3484 return -EACCES;
3485 }
3486
3487 return 0;
3456} 3488}
3457 3489
3458/* socket security operations */ 3490/* socket security operations */
@@ -3518,6 +3550,7 @@ static int selinux_socket_post_create(struct socket *sock, int family,
3518 if (sock->sk) { 3550 if (sock->sk) {
3519 sksec = sock->sk->sk_security; 3551 sksec = sock->sk->sk_security;
3520 sksec->sid = isec->sid; 3552 sksec->sid = isec->sid;
3553 sksec->sclass = isec->sclass;
3521 err = selinux_netlbl_socket_post_create(sock); 3554 err = selinux_netlbl_socket_post_create(sock);
3522 } 3555 }
3523 3556
@@ -3610,7 +3643,7 @@ static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, in
3610 break; 3643 break;
3611 } 3644 }
3612 3645
3613 err = security_node_sid(family, addrp, addrlen, &sid); 3646 err = sel_netnode_sid(addrp, family, &sid);
3614 if (err) 3647 if (err)
3615 goto out; 3648 goto out;
3616 3649
@@ -3821,131 +3854,182 @@ static int selinux_socket_unix_may_send(struct socket *sock,
3821 return 0; 3854 return 0;
3822} 3855}
3823 3856
3824static int selinux_sock_rcv_skb_compat(struct sock *sk, struct sk_buff *skb, 3857static int selinux_inet_sys_rcv_skb(int ifindex, char *addrp, u16 family,
3825 struct avc_audit_data *ad, u16 family, char *addrp, int len) 3858 u32 peer_sid,
3859 struct avc_audit_data *ad)
3826{ 3860{
3827 int err = 0; 3861 int err;
3828 u32 netif_perm, node_perm, node_sid, if_sid, recv_perm = 0; 3862 u32 if_sid;
3829 struct socket *sock; 3863 u32 node_sid;
3830 u16 sock_class = 0;
3831 u32 sock_sid = 0;
3832
3833 read_lock_bh(&sk->sk_callback_lock);
3834 sock = sk->sk_socket;
3835 if (sock) {
3836 struct inode *inode;
3837 inode = SOCK_INODE(sock);
3838 if (inode) {
3839 struct inode_security_struct *isec;
3840 isec = inode->i_security;
3841 sock_sid = isec->sid;
3842 sock_class = isec->sclass;
3843 }
3844 }
3845 read_unlock_bh(&sk->sk_callback_lock);
3846 if (!sock_sid)
3847 goto out;
3848 3864
3849 if (!skb->dev) 3865 err = sel_netif_sid(ifindex, &if_sid);
3850 goto out; 3866 if (err)
3867 return err;
3868 err = avc_has_perm(peer_sid, if_sid,
3869 SECCLASS_NETIF, NETIF__INGRESS, ad);
3870 if (err)
3871 return err;
3851 3872
3852 err = sel_netif_sids(skb->dev, &if_sid, NULL); 3873 err = sel_netnode_sid(addrp, family, &node_sid);
3853 if (err) 3874 if (err)
3854 goto out; 3875 return err;
3876 return avc_has_perm(peer_sid, node_sid,
3877 SECCLASS_NODE, NODE__RECVFROM, ad);
3878}
3879
3880static int selinux_sock_rcv_skb_iptables_compat(struct sock *sk,
3881 struct sk_buff *skb,
3882 struct avc_audit_data *ad,
3883 u16 family,
3884 char *addrp)
3885{
3886 int err;
3887 struct sk_security_struct *sksec = sk->sk_security;
3888 u16 sk_class;
3889 u32 netif_perm, node_perm, recv_perm;
3890 u32 port_sid, node_sid, if_sid, sk_sid;
3855 3891
3856 switch (sock_class) { 3892 sk_sid = sksec->sid;
3893 sk_class = sksec->sclass;
3894
3895 switch (sk_class) {
3857 case SECCLASS_UDP_SOCKET: 3896 case SECCLASS_UDP_SOCKET:
3858 netif_perm = NETIF__UDP_RECV; 3897 netif_perm = NETIF__UDP_RECV;
3859 node_perm = NODE__UDP_RECV; 3898 node_perm = NODE__UDP_RECV;
3860 recv_perm = UDP_SOCKET__RECV_MSG; 3899 recv_perm = UDP_SOCKET__RECV_MSG;
3861 break; 3900 break;
3862
3863 case SECCLASS_TCP_SOCKET: 3901 case SECCLASS_TCP_SOCKET:
3864 netif_perm = NETIF__TCP_RECV; 3902 netif_perm = NETIF__TCP_RECV;
3865 node_perm = NODE__TCP_RECV; 3903 node_perm = NODE__TCP_RECV;
3866 recv_perm = TCP_SOCKET__RECV_MSG; 3904 recv_perm = TCP_SOCKET__RECV_MSG;
3867 break; 3905 break;
3868
3869 case SECCLASS_DCCP_SOCKET: 3906 case SECCLASS_DCCP_SOCKET:
3870 netif_perm = NETIF__DCCP_RECV; 3907 netif_perm = NETIF__DCCP_RECV;
3871 node_perm = NODE__DCCP_RECV; 3908 node_perm = NODE__DCCP_RECV;
3872 recv_perm = DCCP_SOCKET__RECV_MSG; 3909 recv_perm = DCCP_SOCKET__RECV_MSG;
3873 break; 3910 break;
3874
3875 default: 3911 default:
3876 netif_perm = NETIF__RAWIP_RECV; 3912 netif_perm = NETIF__RAWIP_RECV;
3877 node_perm = NODE__RAWIP_RECV; 3913 node_perm = NODE__RAWIP_RECV;
3914 recv_perm = 0;
3878 break; 3915 break;
3879 } 3916 }
3880 3917
3881 err = avc_has_perm(sock_sid, if_sid, SECCLASS_NETIF, netif_perm, ad); 3918 err = sel_netif_sid(skb->iif, &if_sid);
3882 if (err) 3919 if (err)
3883 goto out; 3920 return err;
3884 3921 err = avc_has_perm(sk_sid, if_sid, SECCLASS_NETIF, netif_perm, ad);
3885 err = security_node_sid(family, addrp, len, &node_sid);
3886 if (err) 3922 if (err)
3887 goto out; 3923 return err;
3888 3924
3889 err = avc_has_perm(sock_sid, node_sid, SECCLASS_NODE, node_perm, ad); 3925 err = sel_netnode_sid(addrp, family, &node_sid);
3890 if (err) 3926 if (err)
3891 goto out; 3927 return err;
3928 err = avc_has_perm(sk_sid, node_sid, SECCLASS_NODE, node_perm, ad);
3929 if (err)
3930 return err;
3892 3931
3893 if (recv_perm) { 3932 if (!recv_perm)
3894 u32 port_sid; 3933 return 0;
3934 err = security_port_sid(sk->sk_family, sk->sk_type,
3935 sk->sk_protocol, ntohs(ad->u.net.sport),
3936 &port_sid);
3937 if (unlikely(err)) {
3938 printk(KERN_WARNING
3939 "SELinux: failure in"
3940 " selinux_sock_rcv_skb_iptables_compat(),"
3941 " network port label not found\n");
3942 return err;
3943 }
3944 return avc_has_perm(sk_sid, port_sid, sk_class, recv_perm, ad);
3945}
3895 3946
3896 err = security_port_sid(sk->sk_family, sk->sk_type, 3947static int selinux_sock_rcv_skb_compat(struct sock *sk, struct sk_buff *skb,
3897 sk->sk_protocol, ntohs(ad->u.net.sport), 3948 struct avc_audit_data *ad,
3898 &port_sid); 3949 u16 family, char *addrp)
3899 if (err) 3950{
3900 goto out; 3951 int err;
3952 struct sk_security_struct *sksec = sk->sk_security;
3953 u32 peer_sid;
3954 u32 sk_sid = sksec->sid;
3901 3955
3902 err = avc_has_perm(sock_sid, port_sid, 3956 if (selinux_compat_net)
3903 sock_class, recv_perm, ad); 3957 err = selinux_sock_rcv_skb_iptables_compat(sk, skb, ad,
3958 family, addrp);
3959 else
3960 err = avc_has_perm(sk_sid, skb->secmark, SECCLASS_PACKET,
3961 PACKET__RECV, ad);
3962 if (err)
3963 return err;
3964
3965 if (selinux_policycap_netpeer) {
3966 err = selinux_skb_peerlbl_sid(skb, family, &peer_sid);
3967 if (err)
3968 return err;
3969 err = avc_has_perm(sk_sid, peer_sid,
3970 SECCLASS_PEER, PEER__RECV, ad);
3971 } else {
3972 err = selinux_netlbl_sock_rcv_skb(sksec, skb, family, ad);
3973 if (err)
3974 return err;
3975 err = selinux_xfrm_sock_rcv_skb(sksec->sid, skb, ad);
3904 } 3976 }
3905 3977
3906out:
3907 return err; 3978 return err;
3908} 3979}
3909 3980
3910static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) 3981static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
3911{ 3982{
3912 u16 family; 3983 int err;
3913 char *addrp;
3914 int len, err = 0;
3915 struct avc_audit_data ad;
3916 struct sk_security_struct *sksec = sk->sk_security; 3984 struct sk_security_struct *sksec = sk->sk_security;
3985 u16 family = sk->sk_family;
3986 u32 sk_sid = sksec->sid;
3987 struct avc_audit_data ad;
3988 char *addrp;
3917 3989
3918 family = sk->sk_family;
3919 if (family != PF_INET && family != PF_INET6) 3990 if (family != PF_INET && family != PF_INET6)
3920 goto out; 3991 return 0;
3921 3992
3922 /* Handle mapped IPv4 packets arriving via IPv6 sockets */ 3993 /* Handle mapped IPv4 packets arriving via IPv6 sockets */
3923 if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP)) 3994 if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP))
3924 family = PF_INET; 3995 family = PF_INET;
3925 3996
3926 AVC_AUDIT_DATA_INIT(&ad, NET); 3997 AVC_AUDIT_DATA_INIT(&ad, NET);
3927 ad.u.net.netif = skb->dev ? skb->dev->name : "[unknown]"; 3998 ad.u.net.netif = skb->iif;
3928 ad.u.net.family = family; 3999 ad.u.net.family = family;
3929 4000 err = selinux_parse_skb(skb, &ad, &addrp, 1, NULL);
3930 err = selinux_parse_skb(skb, &ad, &addrp, &len, 1, NULL);
3931 if (err) 4001 if (err)
3932 goto out; 4002 return err;
3933 4003
3934 	if (selinux_compat_net) 			 4004 	/* If any sort of compatibility mode is enabled then hand off processing
3935 err = selinux_sock_rcv_skb_compat(sk, skb, &ad, family, 4005 * to the selinux_sock_rcv_skb_compat() function to deal with the
3936 addrp, len); 4006 * special handling. We do this in an attempt to keep this function
3937 else 4007 * as fast and as clean as possible. */
3938 err = avc_has_perm(sksec->sid, skb->secmark, SECCLASS_PACKET, 4008 if (selinux_compat_net || !selinux_policycap_netpeer)
3939 PACKET__RECV, &ad); 4009 return selinux_sock_rcv_skb_compat(sk, skb, &ad,
3940 if (err) 4010 family, addrp);
3941 goto out;
3942 4011
3943 err = selinux_netlbl_sock_rcv_skb(sksec, skb, &ad); 4012 if (netlbl_enabled() || selinux_xfrm_enabled()) {
3944 if (err) 4013 u32 peer_sid;
3945 goto out; 4014
4015 err = selinux_skb_peerlbl_sid(skb, family, &peer_sid);
4016 if (err)
4017 return err;
4018 err = selinux_inet_sys_rcv_skb(skb->iif, addrp, family,
4019 peer_sid, &ad);
4020 if (err)
4021 return err;
4022 err = avc_has_perm(sk_sid, peer_sid, SECCLASS_PEER,
4023 PEER__RECV, &ad);
4024 }
4025
4026 if (selinux_secmark_enabled()) {
4027 err = avc_has_perm(sk_sid, skb->secmark, SECCLASS_PACKET,
4028 PACKET__RECV, &ad);
4029 if (err)
4030 return err;
4031 }
3946 4032
3947 err = selinux_xfrm_sock_rcv_skb(sksec->sid, skb, &ad);
3948out:
3949 return err; 4033 return err;
3950} 4034}
3951 4035
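The side-by-side hunk above interleaves the old and new receive paths, which makes the control flow hard to follow. Recomposed from the added lines only (a readability aid, not a replacement for the hunk), the new selinux_socket_sock_rcv_skb() reads:

static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;
	struct sk_security_struct *sksec = sk->sk_security;
	u16 family = sk->sk_family;
	u32 sk_sid = sksec->sid;
	struct avc_audit_data ad;
	char *addrp;

	if (family != PF_INET && family != PF_INET6)
		return 0;
	/* Handle mapped IPv4 packets arriving via IPv6 sockets */
	if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP))
		family = PF_INET;

	AVC_AUDIT_DATA_INIT(&ad, NET);
	ad.u.net.netif = skb->iif;
	ad.u.net.family = family;
	err = selinux_parse_skb(skb, &ad, &addrp, 1, NULL);
	if (err)
		return err;

	/* compatibility mode, or a policy without the netpeer capability:
	 * hand off to the legacy per-protocol checks */
	if (selinux_compat_net || !selinux_policycap_netpeer)
		return selinux_sock_rcv_skb_compat(sk, skb, &ad,
						   family, addrp);

	/* unified peer label check against the new "peer" class */
	if (netlbl_enabled() || selinux_xfrm_enabled()) {
		u32 peer_sid;

		err = selinux_skb_peerlbl_sid(skb, family, &peer_sid);
		if (err)
			return err;
		err = selinux_inet_sys_rcv_skb(skb->iif, addrp, family,
					       peer_sid, &ad);
		if (err)
			return err;
		err = avc_has_perm(sk_sid, peer_sid, SECCLASS_PEER,
				   PEER__RECV, &ad);
	}

	/* SECMARK check, skipped entirely when SECMARK is not in use */
	if (selinux_secmark_enabled()) {
		err = avc_has_perm(sk_sid, skb->secmark, SECCLASS_PACKET,
				   PACKET__RECV, &ad);
		if (err)
			return err;
	}

	return err;
}

The legacy compat_net and pre-netpeer behaviour is pushed into selinux_sock_rcv_skb_compat() so the common case stays short.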
@@ -3996,18 +4080,25 @@ out:
3996static int selinux_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid) 4080static int selinux_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid)
3997{ 4081{
3998 u32 peer_secid = SECSID_NULL; 4082 u32 peer_secid = SECSID_NULL;
3999 int err = 0; 4083 u16 family;
4084
4085 if (sock)
4086 family = sock->sk->sk_family;
4087 else if (skb && skb->sk)
4088 family = skb->sk->sk_family;
4089 else
4090 goto out;
4000 4091
4001 if (sock && sock->sk->sk_family == PF_UNIX) 4092 if (sock && family == PF_UNIX)
4002 selinux_get_inode_sid(SOCK_INODE(sock), &peer_secid); 4093 selinux_get_inode_sid(SOCK_INODE(sock), &peer_secid);
4003 else if (skb) 4094 else if (skb)
4004 selinux_skb_extlbl_sid(skb, &peer_secid); 4095 selinux_skb_peerlbl_sid(skb, family, &peer_secid);
4005 4096
4006 if (peer_secid == SECSID_NULL) 4097out:
4007 err = -EINVAL;
4008 *secid = peer_secid; 4098 *secid = peer_secid;
4009 4099 if (peer_secid == SECSID_NULL)
4010 return err; 4100 return -EINVAL;
4101 return 0;
4011} 4102}
4012 4103
4013static int selinux_sk_alloc_security(struct sock *sk, int family, gfp_t priority) 4104static int selinux_sk_alloc_security(struct sock *sk, int family, gfp_t priority)
@@ -4027,6 +4118,7 @@ static void selinux_sk_clone_security(const struct sock *sk, struct sock *newsk)
4027 4118
4028 newssec->sid = ssec->sid; 4119 newssec->sid = ssec->sid;
4029 newssec->peer_sid = ssec->peer_sid; 4120 newssec->peer_sid = ssec->peer_sid;
4121 newssec->sclass = ssec->sclass;
4030 4122
4031 selinux_netlbl_sk_security_clone(ssec, newssec); 4123 selinux_netlbl_sk_security_clone(ssec, newssec);
4032} 4124}
@@ -4050,6 +4142,7 @@ static void selinux_sock_graft(struct sock* sk, struct socket *parent)
4050 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6 || 4142 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6 ||
4051 sk->sk_family == PF_UNIX) 4143 sk->sk_family == PF_UNIX)
4052 isec->sid = sksec->sid; 4144 isec->sid = sksec->sid;
4145 sksec->sclass = isec->sclass;
4053 4146
4054 selinux_netlbl_sock_graft(sk, parent); 4147 selinux_netlbl_sock_graft(sk, parent);
4055} 4148}
@@ -4062,7 +4155,9 @@ static int selinux_inet_conn_request(struct sock *sk, struct sk_buff *skb,
4062 u32 newsid; 4155 u32 newsid;
4063 u32 peersid; 4156 u32 peersid;
4064 4157
4065 selinux_skb_extlbl_sid(skb, &peersid); 4158 err = selinux_skb_peerlbl_sid(skb, sk->sk_family, &peersid);
4159 if (err)
4160 return err;
4066 if (peersid == SECSID_NULL) { 4161 if (peersid == SECSID_NULL) {
4067 req->secid = sksec->sid; 4162 req->secid = sksec->sid;
4068 req->peer_secid = SECSID_NULL; 4163 req->peer_secid = SECSID_NULL;
@@ -4100,7 +4195,7 @@ static void selinux_inet_conn_established(struct sock *sk,
4100{ 4195{
4101 struct sk_security_struct *sksec = sk->sk_security; 4196 struct sk_security_struct *sksec = sk->sk_security;
4102 4197
4103 selinux_skb_extlbl_sid(skb, &sksec->peer_sid); 4198 selinux_skb_peerlbl_sid(skb, sk->sk_family, &sksec->peer_sid);
4104} 4199}
4105 4200
4106static void selinux_req_classify_flow(const struct request_sock *req, 4201static void selinux_req_classify_flow(const struct request_sock *req,
@@ -4147,149 +4242,260 @@ out:
4147 4242
4148#ifdef CONFIG_NETFILTER 4243#ifdef CONFIG_NETFILTER
4149 4244
4150static int selinux_ip_postroute_last_compat(struct sock *sk, struct net_device *dev, 4245static unsigned int selinux_ip_forward(struct sk_buff *skb, int ifindex,
4151 struct avc_audit_data *ad, 4246 u16 family)
4152 u16 family, char *addrp, int len)
4153{ 4247{
4154 int err = 0; 4248 char *addrp;
4155 u32 netif_perm, node_perm, node_sid, if_sid, send_perm = 0; 4249 u32 peer_sid;
4156 struct socket *sock; 4250 struct avc_audit_data ad;
4157 struct inode *inode; 4251 u8 secmark_active;
4158 struct inode_security_struct *isec; 4252 u8 peerlbl_active;
4159 4253
4160 sock = sk->sk_socket; 4254 if (!selinux_policycap_netpeer)
4161 if (!sock) 4255 return NF_ACCEPT;
4162 goto out;
4163 4256
4164 inode = SOCK_INODE(sock); 4257 secmark_active = selinux_secmark_enabled();
4165 if (!inode) 4258 peerlbl_active = netlbl_enabled() || selinux_xfrm_enabled();
4166 goto out; 4259 if (!secmark_active && !peerlbl_active)
4260 return NF_ACCEPT;
4167 4261
4168 isec = inode->i_security; 4262 AVC_AUDIT_DATA_INIT(&ad, NET);
4169 4263 ad.u.net.netif = ifindex;
4170 err = sel_netif_sids(dev, &if_sid, NULL); 4264 ad.u.net.family = family;
4171 if (err) 4265 if (selinux_parse_skb(skb, &ad, &addrp, 1, NULL) != 0)
4172 goto out; 4266 return NF_DROP;
4267
4268 if (selinux_skb_peerlbl_sid(skb, family, &peer_sid) != 0)
4269 return NF_DROP;
4270
4271 if (peerlbl_active)
4272 if (selinux_inet_sys_rcv_skb(ifindex, addrp, family,
4273 peer_sid, &ad) != 0)
4274 return NF_DROP;
4275
4276 if (secmark_active)
4277 if (avc_has_perm(peer_sid, skb->secmark,
4278 SECCLASS_PACKET, PACKET__FORWARD_IN, &ad))
4279 return NF_DROP;
4280
4281 return NF_ACCEPT;
4282}
4283
4284static unsigned int selinux_ipv4_forward(unsigned int hooknum,
4285 struct sk_buff *skb,
4286 const struct net_device *in,
4287 const struct net_device *out,
4288 int (*okfn)(struct sk_buff *))
4289{
4290 return selinux_ip_forward(skb, in->ifindex, PF_INET);
4291}
4173 4292
4174 switch (isec->sclass) { 4293#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
4294static unsigned int selinux_ipv6_forward(unsigned int hooknum,
4295 struct sk_buff *skb,
4296 const struct net_device *in,
4297 const struct net_device *out,
4298 int (*okfn)(struct sk_buff *))
4299{
4300 return selinux_ip_forward(skb, in->ifindex, PF_INET6);
4301}
4302#endif /* IPV6 */
4303
4304static int selinux_ip_postroute_iptables_compat(struct sock *sk,
4305 int ifindex,
4306 struct avc_audit_data *ad,
4307 u16 family, char *addrp)
4308{
4309 int err;
4310 struct sk_security_struct *sksec = sk->sk_security;
4311 u16 sk_class;
4312 u32 netif_perm, node_perm, send_perm;
4313 u32 port_sid, node_sid, if_sid, sk_sid;
4314
4315 sk_sid = sksec->sid;
4316 sk_class = sksec->sclass;
4317
4318 switch (sk_class) {
4175 case SECCLASS_UDP_SOCKET: 4319 case SECCLASS_UDP_SOCKET:
4176 netif_perm = NETIF__UDP_SEND; 4320 netif_perm = NETIF__UDP_SEND;
4177 node_perm = NODE__UDP_SEND; 4321 node_perm = NODE__UDP_SEND;
4178 send_perm = UDP_SOCKET__SEND_MSG; 4322 send_perm = UDP_SOCKET__SEND_MSG;
4179 break; 4323 break;
4180
4181 case SECCLASS_TCP_SOCKET: 4324 case SECCLASS_TCP_SOCKET:
4182 netif_perm = NETIF__TCP_SEND; 4325 netif_perm = NETIF__TCP_SEND;
4183 node_perm = NODE__TCP_SEND; 4326 node_perm = NODE__TCP_SEND;
4184 send_perm = TCP_SOCKET__SEND_MSG; 4327 send_perm = TCP_SOCKET__SEND_MSG;
4185 break; 4328 break;
4186
4187 case SECCLASS_DCCP_SOCKET: 4329 case SECCLASS_DCCP_SOCKET:
4188 netif_perm = NETIF__DCCP_SEND; 4330 netif_perm = NETIF__DCCP_SEND;
4189 node_perm = NODE__DCCP_SEND; 4331 node_perm = NODE__DCCP_SEND;
4190 send_perm = DCCP_SOCKET__SEND_MSG; 4332 send_perm = DCCP_SOCKET__SEND_MSG;
4191 break; 4333 break;
4192
4193 default: 4334 default:
4194 netif_perm = NETIF__RAWIP_SEND; 4335 netif_perm = NETIF__RAWIP_SEND;
4195 node_perm = NODE__RAWIP_SEND; 4336 node_perm = NODE__RAWIP_SEND;
4337 send_perm = 0;
4196 break; 4338 break;
4197 } 4339 }
4198 4340
4199 err = avc_has_perm(isec->sid, if_sid, SECCLASS_NETIF, netif_perm, ad); 4341 err = sel_netif_sid(ifindex, &if_sid);
4200 if (err) 4342 if (err)
4201 goto out; 4343 return err;
4344 err = avc_has_perm(sk_sid, if_sid, SECCLASS_NETIF, netif_perm, ad);
4345 return err;
4202 4346
4203 err = security_node_sid(family, addrp, len, &node_sid); 4347 err = sel_netnode_sid(addrp, family, &node_sid);
4204 if (err) 4348 if (err)
4205 goto out; 4349 return err;
4206 4350 err = avc_has_perm(sk_sid, node_sid, SECCLASS_NODE, node_perm, ad);
4207 err = avc_has_perm(isec->sid, node_sid, SECCLASS_NODE, node_perm, ad);
4208 if (err) 4351 if (err)
4209 goto out; 4352 return err;
4210 4353
4211 if (send_perm) { 4354 if (send_perm != 0)
4212 u32 port_sid; 4355 return 0;
4213
4214 err = security_port_sid(sk->sk_family,
4215 sk->sk_type,
4216 sk->sk_protocol,
4217 ntohs(ad->u.net.dport),
4218 &port_sid);
4219 if (err)
4220 goto out;
4221 4356
4222 err = avc_has_perm(isec->sid, port_sid, isec->sclass, 4357 err = security_port_sid(sk->sk_family, sk->sk_type,
4223 send_perm, ad); 4358 sk->sk_protocol, ntohs(ad->u.net.dport),
4359 &port_sid);
4360 if (unlikely(err)) {
4361 printk(KERN_WARNING
4362 "SELinux: failure in"
4363 " selinux_ip_postroute_iptables_compat(),"
4364 " network port label not found\n");
4365 return err;
4224 } 4366 }
4225out: 4367 return avc_has_perm(sk_sid, port_sid, sk_class, send_perm, ad);
4226 return err;
4227} 4368}
4228 4369
4229static unsigned int selinux_ip_postroute_last(unsigned int hooknum, 4370static unsigned int selinux_ip_postroute_compat(struct sk_buff *skb,
4230 struct sk_buff *skb, 4371 int ifindex,
4231 const struct net_device *in, 4372 struct avc_audit_data *ad,
4232 const struct net_device *out, 4373 u16 family,
4233 int (*okfn)(struct sk_buff *), 4374 char *addrp,
4234 u16 family) 4375 u8 proto)
4235{ 4376{
4236 char *addrp; 4377 struct sock *sk = skb->sk;
4237 int len, err = 0;
4238 struct sock *sk;
4239 struct avc_audit_data ad;
4240 struct net_device *dev = (struct net_device *)out;
4241 struct sk_security_struct *sksec; 4378 struct sk_security_struct *sksec;
4242 u8 proto;
4243
4244 sk = skb->sk;
4245 if (!sk)
4246 goto out;
4247 4379
4380 if (sk == NULL)
4381 return NF_ACCEPT;
4248 sksec = sk->sk_security; 4382 sksec = sk->sk_security;
4249 4383
4250 AVC_AUDIT_DATA_INIT(&ad, NET); 4384 if (selinux_compat_net) {
4251 ad.u.net.netif = dev->name; 4385 if (selinux_ip_postroute_iptables_compat(skb->sk, ifindex,
4252 ad.u.net.family = family; 4386 ad, family, addrp))
4387 return NF_DROP;
4388 } else {
4389 if (avc_has_perm(sksec->sid, skb->secmark,
4390 SECCLASS_PACKET, PACKET__SEND, ad))
4391 return NF_DROP;
4392 }
4253 4393
4254 err = selinux_parse_skb(skb, &ad, &addrp, &len, 0, &proto); 4394 if (selinux_policycap_netpeer)
4255 if (err) 4395 if (selinux_xfrm_postroute_last(sksec->sid, skb, ad, proto))
4256 goto out; 4396 return NF_DROP;
4257 4397
4258 if (selinux_compat_net) 4398 return NF_ACCEPT;
4259 err = selinux_ip_postroute_last_compat(sk, dev, &ad, 4399}
4260 family, addrp, len);
4261 else
4262 err = avc_has_perm(sksec->sid, skb->secmark, SECCLASS_PACKET,
4263 PACKET__SEND, &ad);
4264 4400
4265 if (err) 4401static unsigned int selinux_ip_postroute(struct sk_buff *skb, int ifindex,
4266 goto out; 4402 u16 family)
4403{
4404 u32 secmark_perm;
4405 u32 peer_sid;
4406 struct sock *sk;
4407 struct avc_audit_data ad;
4408 char *addrp;
4409 u8 proto;
4410 u8 secmark_active;
4411 u8 peerlbl_active;
4267 4412
4268 err = selinux_xfrm_postroute_last(sksec->sid, skb, &ad, proto); 4413 AVC_AUDIT_DATA_INIT(&ad, NET);
4269out: 4414 ad.u.net.netif = ifindex;
4270 return err ? NF_DROP : NF_ACCEPT; 4415 ad.u.net.family = family;
4416 if (selinux_parse_skb(skb, &ad, &addrp, 0, &proto))
4417 return NF_DROP;
4418
 4419 	/* If any sort of compatibility mode is enabled then hand off processing
4420 * to the selinux_ip_postroute_compat() function to deal with the
4421 * special handling. We do this in an attempt to keep this function
4422 * as fast and as clean as possible. */
4423 if (selinux_compat_net || !selinux_policycap_netpeer)
4424 return selinux_ip_postroute_compat(skb, ifindex, &ad,
4425 family, addrp, proto);
4426
4427 /* If skb->dst->xfrm is non-NULL then the packet is undergoing an IPsec
4428 * packet transformation so allow the packet to pass without any checks
4429 * since we'll have another chance to perform access control checks
 4430 	 * when the packet is on its final way out.
4431 * NOTE: there appear to be some IPv6 multicast cases where skb->dst
 4432 	 * is NULL; in this case go ahead and apply access control. */
4433 if (skb->dst != NULL && skb->dst->xfrm != NULL)
4434 return NF_ACCEPT;
4435
4436 secmark_active = selinux_secmark_enabled();
4437 peerlbl_active = netlbl_enabled() || selinux_xfrm_enabled();
4438 if (!secmark_active && !peerlbl_active)
4439 return NF_ACCEPT;
4440
4441 /* if the packet is locally generated (skb->sk != NULL) then use the
4442 * socket's label as the peer label, otherwise the packet is being
4443 * forwarded through this system and we need to fetch the peer label
4444 * directly from the packet */
4445 sk = skb->sk;
4446 if (sk) {
4447 struct sk_security_struct *sksec = sk->sk_security;
4448 peer_sid = sksec->sid;
4449 secmark_perm = PACKET__SEND;
4450 } else {
4451 if (selinux_skb_peerlbl_sid(skb, family, &peer_sid))
4452 return NF_DROP;
4453 secmark_perm = PACKET__FORWARD_OUT;
4454 }
4455
4456 if (secmark_active)
4457 if (avc_has_perm(peer_sid, skb->secmark,
4458 SECCLASS_PACKET, secmark_perm, &ad))
4459 return NF_DROP;
4460
4461 if (peerlbl_active) {
4462 u32 if_sid;
4463 u32 node_sid;
4464
4465 if (sel_netif_sid(ifindex, &if_sid))
4466 return NF_DROP;
4467 if (avc_has_perm(peer_sid, if_sid,
4468 SECCLASS_NETIF, NETIF__EGRESS, &ad))
4469 return NF_DROP;
4470
4471 if (sel_netnode_sid(addrp, family, &node_sid))
4472 return NF_DROP;
4473 if (avc_has_perm(peer_sid, node_sid,
4474 SECCLASS_NODE, NODE__SENDTO, &ad))
4475 return NF_DROP;
4476 }
4477
4478 return NF_ACCEPT;
4271} 4479}
4272 4480
4273static unsigned int selinux_ipv4_postroute_last(unsigned int hooknum, 4481static unsigned int selinux_ipv4_postroute(unsigned int hooknum,
4274 struct sk_buff *skb, 4482 struct sk_buff *skb,
4275 const struct net_device *in, 4483 const struct net_device *in,
4276 const struct net_device *out, 4484 const struct net_device *out,
4277 int (*okfn)(struct sk_buff *)) 4485 int (*okfn)(struct sk_buff *))
4278{ 4486{
4279 return selinux_ip_postroute_last(hooknum, skb, in, out, okfn, PF_INET); 4487 return selinux_ip_postroute(skb, out->ifindex, PF_INET);
4280} 4488}
4281 4489
4282#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 4490#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
4283 4491static unsigned int selinux_ipv6_postroute(unsigned int hooknum,
4284static unsigned int selinux_ipv6_postroute_last(unsigned int hooknum, 4492 struct sk_buff *skb,
4285 struct sk_buff *skb, 4493 const struct net_device *in,
4286 const struct net_device *in, 4494 const struct net_device *out,
4287 const struct net_device *out, 4495 int (*okfn)(struct sk_buff *))
4288 int (*okfn)(struct sk_buff *))
4289{ 4496{
4290 return selinux_ip_postroute_last(hooknum, skb, in, out, okfn, PF_INET6); 4497 return selinux_ip_postroute(skb, out->ifindex, PF_INET6);
4291} 4498}
4292
4293#endif /* IPV6 */ 4499#endif /* IPV6 */
4294 4500
4295#endif /* CONFIG_NETFILTER */ 4501#endif /* CONFIG_NETFILTER */
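Similarly, the brand-new FORWARD-hook helper is interleaved with the removed postroute compat code in the hunk above. Recomposed from the added lines only, selinux_ip_forward() applies the new forwarded-traffic controls as follows:

static unsigned int selinux_ip_forward(struct sk_buff *skb, int ifindex,
				       u16 family)
{
	char *addrp;
	u32 peer_sid;
	struct avc_audit_data ad;
	u8 secmark_active;
	u8 peerlbl_active;

	/* old policies without the netpeer capability see no new checks */
	if (!selinux_policycap_netpeer)
		return NF_ACCEPT;

	secmark_active = selinux_secmark_enabled();
	peerlbl_active = netlbl_enabled() || selinux_xfrm_enabled();
	if (!secmark_active && !peerlbl_active)
		return NF_ACCEPT;

	AVC_AUDIT_DATA_INIT(&ad, NET);
	ad.u.net.netif = ifindex;
	ad.u.net.family = family;
	if (selinux_parse_skb(skb, &ad, &addrp, 1, NULL) != 0)
		return NF_DROP;

	if (selinux_skb_peerlbl_sid(skb, family, &peer_sid) != 0)
		return NF_DROP;

	/* netif ingress + node recvfrom checks */
	if (peerlbl_active)
		if (selinux_inet_sys_rcv_skb(ifindex, addrp, family,
					     peer_sid, &ad) != 0)
			return NF_DROP;

	/* SECMARK check for forwarded traffic */
	if (secmark_active)
		if (avc_has_perm(peer_sid, skb->secmark,
				 SECCLASS_PACKET, PACKET__FORWARD_IN, &ad))
			return NF_DROP;

	return NF_ACCEPT;
}

Both checks use the packet's peer SID as the subject, so forwarded traffic is controlled by the label on the wire rather than by any local socket.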
@@ -5277,22 +5483,40 @@ security_initcall(selinux_init);
5277 5483
5278#if defined(CONFIG_NETFILTER) 5484#if defined(CONFIG_NETFILTER)
5279 5485
5280static struct nf_hook_ops selinux_ipv4_op = { 5486static struct nf_hook_ops selinux_ipv4_ops[] = {
5281 .hook = selinux_ipv4_postroute_last, 5487 {
5282 .owner = THIS_MODULE, 5488 .hook = selinux_ipv4_postroute,
5283 .pf = PF_INET, 5489 .owner = THIS_MODULE,
5284 .hooknum = NF_INET_POST_ROUTING, 5490 .pf = PF_INET,
5285 .priority = NF_IP_PRI_SELINUX_LAST, 5491 .hooknum = NF_INET_POST_ROUTING,
5492 .priority = NF_IP_PRI_SELINUX_LAST,
5493 },
5494 {
5495 .hook = selinux_ipv4_forward,
5496 .owner = THIS_MODULE,
5497 .pf = PF_INET,
5498 .hooknum = NF_INET_FORWARD,
5499 .priority = NF_IP_PRI_SELINUX_FIRST,
5500 }
5286}; 5501};
5287 5502
5288#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 5503#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
5289 5504
5290static struct nf_hook_ops selinux_ipv6_op = { 5505static struct nf_hook_ops selinux_ipv6_ops[] = {
5291 .hook = selinux_ipv6_postroute_last, 5506 {
5292 .owner = THIS_MODULE, 5507 .hook = selinux_ipv6_postroute,
5293 .pf = PF_INET6, 5508 .owner = THIS_MODULE,
5294 .hooknum = NF_INET_POST_ROUTING, 5509 .pf = PF_INET6,
5295 .priority = NF_IP6_PRI_SELINUX_LAST, 5510 .hooknum = NF_INET_POST_ROUTING,
5511 .priority = NF_IP6_PRI_SELINUX_LAST,
5512 },
5513 {
5514 .hook = selinux_ipv6_forward,
5515 .owner = THIS_MODULE,
5516 .pf = PF_INET6,
5517 .hooknum = NF_INET_FORWARD,
5518 .priority = NF_IP6_PRI_SELINUX_FIRST,
5519 }
5296}; 5520};
5297 5521
5298#endif /* IPV6 */ 5522#endif /* IPV6 */
@@ -5300,22 +5524,27 @@ static struct nf_hook_ops selinux_ipv6_op = {
5300static int __init selinux_nf_ip_init(void) 5524static int __init selinux_nf_ip_init(void)
5301{ 5525{
5302 int err = 0; 5526 int err = 0;
5527 u32 iter;
5303 5528
5304 if (!selinux_enabled) 5529 if (!selinux_enabled)
5305 goto out; 5530 goto out;
5306 5531
5307 printk(KERN_DEBUG "SELinux: Registering netfilter hooks\n"); 5532 printk(KERN_DEBUG "SELinux: Registering netfilter hooks\n");
5308 5533
5309 err = nf_register_hook(&selinux_ipv4_op); 5534 for (iter = 0; iter < ARRAY_SIZE(selinux_ipv4_ops); iter++) {
5310 if (err) 5535 err = nf_register_hook(&selinux_ipv4_ops[iter]);
5311 panic("SELinux: nf_register_hook for IPv4: error %d\n", err); 5536 if (err)
5537 panic("SELinux: nf_register_hook for IPv4: error %d\n",
5538 err);
5539 }
5312 5540
5313#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 5541#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
5314 5542 for (iter = 0; iter < ARRAY_SIZE(selinux_ipv6_ops); iter++) {
5315 err = nf_register_hook(&selinux_ipv6_op); 5543 err = nf_register_hook(&selinux_ipv6_ops[iter]);
5316 if (err) 5544 if (err)
5317 panic("SELinux: nf_register_hook for IPv6: error %d\n", err); 5545 panic("SELinux: nf_register_hook for IPv6: error %d\n",
5318 5546 err);
5547 }
5319#endif /* IPV6 */ 5548#endif /* IPV6 */
5320 5549
5321out: 5550out:
@@ -5327,11 +5556,15 @@ __initcall(selinux_nf_ip_init);
5327#ifdef CONFIG_SECURITY_SELINUX_DISABLE 5556#ifdef CONFIG_SECURITY_SELINUX_DISABLE
5328static void selinux_nf_ip_exit(void) 5557static void selinux_nf_ip_exit(void)
5329{ 5558{
5559 u32 iter;
5560
5330 printk(KERN_DEBUG "SELinux: Unregistering netfilter hooks\n"); 5561 printk(KERN_DEBUG "SELinux: Unregistering netfilter hooks\n");
5331 5562
5332 nf_unregister_hook(&selinux_ipv4_op); 5563 for (iter = 0; iter < ARRAY_SIZE(selinux_ipv4_ops); iter++)
5564 nf_unregister_hook(&selinux_ipv4_ops[iter]);
5333#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 5565#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
5334 nf_unregister_hook(&selinux_ipv6_op); 5566 for (iter = 0; iter < ARRAY_SIZE(selinux_ipv6_ops); iter++)
5567 nf_unregister_hook(&selinux_ipv6_ops[iter]);
5335#endif /* IPV6 */ 5568#endif /* IPV6 */
5336} 5569}
5337#endif 5570#endif
diff --git a/security/selinux/include/av_perm_to_string.h b/security/selinux/include/av_perm_to_string.h
index 049bf69429b6..399f868c5c8f 100644
--- a/security/selinux/include/av_perm_to_string.h
+++ b/security/selinux/include/av_perm_to_string.h
@@ -37,6 +37,8 @@
37 S_(SECCLASS_NODE, NODE__ENFORCE_DEST, "enforce_dest") 37 S_(SECCLASS_NODE, NODE__ENFORCE_DEST, "enforce_dest")
38 S_(SECCLASS_NODE, NODE__DCCP_RECV, "dccp_recv") 38 S_(SECCLASS_NODE, NODE__DCCP_RECV, "dccp_recv")
39 S_(SECCLASS_NODE, NODE__DCCP_SEND, "dccp_send") 39 S_(SECCLASS_NODE, NODE__DCCP_SEND, "dccp_send")
40 S_(SECCLASS_NODE, NODE__RECVFROM, "recvfrom")
41 S_(SECCLASS_NODE, NODE__SENDTO, "sendto")
40 S_(SECCLASS_NETIF, NETIF__TCP_RECV, "tcp_recv") 42 S_(SECCLASS_NETIF, NETIF__TCP_RECV, "tcp_recv")
41 S_(SECCLASS_NETIF, NETIF__TCP_SEND, "tcp_send") 43 S_(SECCLASS_NETIF, NETIF__TCP_SEND, "tcp_send")
42 S_(SECCLASS_NETIF, NETIF__UDP_RECV, "udp_recv") 44 S_(SECCLASS_NETIF, NETIF__UDP_RECV, "udp_recv")
@@ -45,6 +47,8 @@
45 S_(SECCLASS_NETIF, NETIF__RAWIP_SEND, "rawip_send") 47 S_(SECCLASS_NETIF, NETIF__RAWIP_SEND, "rawip_send")
46 S_(SECCLASS_NETIF, NETIF__DCCP_RECV, "dccp_recv") 48 S_(SECCLASS_NETIF, NETIF__DCCP_RECV, "dccp_recv")
47 S_(SECCLASS_NETIF, NETIF__DCCP_SEND, "dccp_send") 49 S_(SECCLASS_NETIF, NETIF__DCCP_SEND, "dccp_send")
50 S_(SECCLASS_NETIF, NETIF__INGRESS, "ingress")
51 S_(SECCLASS_NETIF, NETIF__EGRESS, "egress")
48 S_(SECCLASS_UNIX_STREAM_SOCKET, UNIX_STREAM_SOCKET__CONNECTTO, "connectto") 52 S_(SECCLASS_UNIX_STREAM_SOCKET, UNIX_STREAM_SOCKET__CONNECTTO, "connectto")
49 S_(SECCLASS_UNIX_STREAM_SOCKET, UNIX_STREAM_SOCKET__NEWCONN, "newconn") 53 S_(SECCLASS_UNIX_STREAM_SOCKET, UNIX_STREAM_SOCKET__NEWCONN, "newconn")
50 S_(SECCLASS_UNIX_STREAM_SOCKET, UNIX_STREAM_SOCKET__ACCEPTFROM, "acceptfrom") 54 S_(SECCLASS_UNIX_STREAM_SOCKET, UNIX_STREAM_SOCKET__ACCEPTFROM, "acceptfrom")
@@ -149,6 +153,10 @@
149 S_(SECCLASS_PACKET, PACKET__SEND, "send") 153 S_(SECCLASS_PACKET, PACKET__SEND, "send")
150 S_(SECCLASS_PACKET, PACKET__RECV, "recv") 154 S_(SECCLASS_PACKET, PACKET__RECV, "recv")
151 S_(SECCLASS_PACKET, PACKET__RELABELTO, "relabelto") 155 S_(SECCLASS_PACKET, PACKET__RELABELTO, "relabelto")
156 S_(SECCLASS_PACKET, PACKET__FLOW_IN, "flow_in")
157 S_(SECCLASS_PACKET, PACKET__FLOW_OUT, "flow_out")
158 S_(SECCLASS_PACKET, PACKET__FORWARD_IN, "forward_in")
159 S_(SECCLASS_PACKET, PACKET__FORWARD_OUT, "forward_out")
152 S_(SECCLASS_KEY, KEY__VIEW, "view") 160 S_(SECCLASS_KEY, KEY__VIEW, "view")
153 S_(SECCLASS_KEY, KEY__READ, "read") 161 S_(SECCLASS_KEY, KEY__READ, "read")
154 S_(SECCLASS_KEY, KEY__WRITE, "write") 162 S_(SECCLASS_KEY, KEY__WRITE, "write")
@@ -159,3 +167,4 @@
159 S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NODE_BIND, "node_bind") 167 S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NODE_BIND, "node_bind")
160 S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NAME_CONNECT, "name_connect") 168 S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NAME_CONNECT, "name_connect")
161 S_(SECCLASS_MEMPROTECT, MEMPROTECT__MMAP_ZERO, "mmap_zero") 169 S_(SECCLASS_MEMPROTECT, MEMPROTECT__MMAP_ZERO, "mmap_zero")
170 S_(SECCLASS_PEER, PEER__RECV, "recv")
diff --git a/security/selinux/include/av_permissions.h b/security/selinux/include/av_permissions.h
index eda89a2ec635..84c9abc80978 100644
--- a/security/selinux/include/av_permissions.h
+++ b/security/selinux/include/av_permissions.h
@@ -292,6 +292,8 @@
292#define NODE__ENFORCE_DEST 0x00000040UL 292#define NODE__ENFORCE_DEST 0x00000040UL
293#define NODE__DCCP_RECV 0x00000080UL 293#define NODE__DCCP_RECV 0x00000080UL
294#define NODE__DCCP_SEND 0x00000100UL 294#define NODE__DCCP_SEND 0x00000100UL
295#define NODE__RECVFROM 0x00000200UL
296#define NODE__SENDTO 0x00000400UL
295#define NETIF__TCP_RECV 0x00000001UL 297#define NETIF__TCP_RECV 0x00000001UL
296#define NETIF__TCP_SEND 0x00000002UL 298#define NETIF__TCP_SEND 0x00000002UL
297#define NETIF__UDP_RECV 0x00000004UL 299#define NETIF__UDP_RECV 0x00000004UL
@@ -300,6 +302,8 @@
300#define NETIF__RAWIP_SEND 0x00000020UL 302#define NETIF__RAWIP_SEND 0x00000020UL
301#define NETIF__DCCP_RECV 0x00000040UL 303#define NETIF__DCCP_RECV 0x00000040UL
302#define NETIF__DCCP_SEND 0x00000080UL 304#define NETIF__DCCP_SEND 0x00000080UL
305#define NETIF__INGRESS 0x00000100UL
306#define NETIF__EGRESS 0x00000200UL
303#define NETLINK_SOCKET__IOCTL 0x00000001UL 307#define NETLINK_SOCKET__IOCTL 0x00000001UL
304#define NETLINK_SOCKET__READ 0x00000002UL 308#define NETLINK_SOCKET__READ 0x00000002UL
305#define NETLINK_SOCKET__WRITE 0x00000004UL 309#define NETLINK_SOCKET__WRITE 0x00000004UL
@@ -792,6 +796,10 @@
792#define PACKET__SEND 0x00000001UL 796#define PACKET__SEND 0x00000001UL
793#define PACKET__RECV 0x00000002UL 797#define PACKET__RECV 0x00000002UL
794#define PACKET__RELABELTO 0x00000004UL 798#define PACKET__RELABELTO 0x00000004UL
799#define PACKET__FLOW_IN 0x00000008UL
800#define PACKET__FLOW_OUT 0x00000010UL
801#define PACKET__FORWARD_IN 0x00000020UL
802#define PACKET__FORWARD_OUT 0x00000040UL
795#define KEY__VIEW 0x00000001UL 803#define KEY__VIEW 0x00000001UL
796#define KEY__READ 0x00000002UL 804#define KEY__READ 0x00000002UL
797#define KEY__WRITE 0x00000004UL 805#define KEY__WRITE 0x00000004UL
@@ -824,3 +832,4 @@
824#define DCCP_SOCKET__NODE_BIND 0x00400000UL 832#define DCCP_SOCKET__NODE_BIND 0x00400000UL
825#define DCCP_SOCKET__NAME_CONNECT 0x00800000UL 833#define DCCP_SOCKET__NAME_CONNECT 0x00800000UL
826#define MEMPROTECT__MMAP_ZERO 0x00000001UL 834#define MEMPROTECT__MMAP_ZERO 0x00000001UL
835#define PEER__RECV 0x00000001UL
diff --git a/security/selinux/include/avc.h b/security/selinux/include/avc.h
index 553607a19e92..80c28fa6621c 100644
--- a/security/selinux/include/avc.h
+++ b/security/selinux/include/avc.h
@@ -51,7 +51,7 @@ struct avc_audit_data {
51 struct inode *inode; 51 struct inode *inode;
52 } fs; 52 } fs;
53 struct { 53 struct {
54 char *netif; 54 int netif;
55 struct sock *sk; 55 struct sock *sk;
56 u16 family; 56 u16 family;
57 __be16 dport; 57 __be16 dport;
diff --git a/security/selinux/include/class_to_string.h b/security/selinux/include/class_to_string.h
index e77de0e62ea0..b1b0d1d8f950 100644
--- a/security/selinux/include/class_to_string.h
+++ b/security/selinux/include/class_to_string.h
@@ -64,3 +64,10 @@
64 S_(NULL) 64 S_(NULL)
65 S_("dccp_socket") 65 S_("dccp_socket")
66 S_("memprotect") 66 S_("memprotect")
67 S_(NULL)
68 S_(NULL)
69 S_(NULL)
70 S_(NULL)
71 S_(NULL)
72 S_(NULL)
73 S_("peer")
diff --git a/security/selinux/include/flask.h b/security/selinux/include/flask.h
index a9c2b20f14b5..09e9dd23ee1a 100644
--- a/security/selinux/include/flask.h
+++ b/security/selinux/include/flask.h
@@ -50,6 +50,7 @@
50#define SECCLASS_KEY 58 50#define SECCLASS_KEY 58
51#define SECCLASS_DCCP_SOCKET 60 51#define SECCLASS_DCCP_SOCKET 60
52#define SECCLASS_MEMPROTECT 61 52#define SECCLASS_MEMPROTECT 61
53#define SECCLASS_PEER 68
53 54
54/* 55/*
55 * Security identifier indices for initial entities 56 * Security identifier indices for initial entities
diff --git a/security/selinux/include/netif.h b/security/selinux/include/netif.h
index 8bd6f9992d2b..ce23edd128b3 100644
--- a/security/selinux/include/netif.h
+++ b/security/selinux/include/netif.h
@@ -7,6 +7,8 @@
7 * Author: James Morris <jmorris@redhat.com> 7 * Author: James Morris <jmorris@redhat.com>
8 * 8 *
9 * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com> 9 * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
10 * Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
11 * Paul Moore, <paul.moore@hp.com>
10 * 12 *
11 * This program is free software; you can redistribute it and/or modify 13 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License version 2, 14 * it under the terms of the GNU General Public License version 2,
@@ -15,7 +17,7 @@
15#ifndef _SELINUX_NETIF_H_ 17#ifndef _SELINUX_NETIF_H_
16#define _SELINUX_NETIF_H_ 18#define _SELINUX_NETIF_H_
17 19
18int sel_netif_sids(struct net_device *dev, u32 *if_sid, u32 *msg_sid); 20int sel_netif_sid(int ifindex, u32 *sid);
19 21
20#endif /* _SELINUX_NETIF_H_ */ 22#endif /* _SELINUX_NETIF_H_ */
21 23
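The old name/SID-pair lookup is replaced by a single ifindex-keyed lookup. A minimal caller sketch, mirroring the egress check added to hooks.c above; example_check_egress() and its parameters are illustrative only and are not part of the patch:

/* Hypothetical illustration only */
static int example_check_egress(u32 peer_sid, int ifindex,
				struct avc_audit_data *ad)
{
	u32 if_sid;
	int err;

	/* cached ifindex -> SID lookup; falls back to the policy on a miss */
	err = sel_netif_sid(ifindex, &if_sid);
	if (err)
		return err;
	return avc_has_perm(peer_sid, if_sid, SECCLASS_NETIF,
			    NETIF__EGRESS, ad);
}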
diff --git a/security/selinux/include/netlabel.h b/security/selinux/include/netlabel.h
index 218e3f77c350..00a2809c8506 100644
--- a/security/selinux/include/netlabel.h
+++ b/security/selinux/include/netlabel.h
@@ -46,13 +46,17 @@ void selinux_netlbl_sk_security_init(struct sk_security_struct *ssec,
46void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec, 46void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec,
47 struct sk_security_struct *newssec); 47 struct sk_security_struct *newssec);
48 48
49int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid); 49int selinux_netlbl_skbuff_getsid(struct sk_buff *skb,
50 u16 family,
51 u32 *type,
52 u32 *sid);
50 53
51void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock); 54void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock);
52int selinux_netlbl_socket_post_create(struct socket *sock); 55int selinux_netlbl_socket_post_create(struct socket *sock);
53int selinux_netlbl_inode_permission(struct inode *inode, int mask); 56int selinux_netlbl_inode_permission(struct inode *inode, int mask);
54int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, 57int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
55 struct sk_buff *skb, 58 struct sk_buff *skb,
59 u16 family,
56 struct avc_audit_data *ad); 60 struct avc_audit_data *ad);
57int selinux_netlbl_socket_setsockopt(struct socket *sock, 61int selinux_netlbl_socket_setsockopt(struct socket *sock,
58 int level, 62 int level,
@@ -83,9 +87,11 @@ static inline void selinux_netlbl_sk_security_clone(
83} 87}
84 88
85static inline int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, 89static inline int selinux_netlbl_skbuff_getsid(struct sk_buff *skb,
86 u32 base_sid, 90 u16 family,
91 u32 *type,
87 u32 *sid) 92 u32 *sid)
88{ 93{
94 *type = NETLBL_NLTYPE_NONE;
89 *sid = SECSID_NULL; 95 *sid = SECSID_NULL;
90 return 0; 96 return 0;
91} 97}
@@ -106,6 +112,7 @@ static inline int selinux_netlbl_inode_permission(struct inode *inode,
106} 112}
107static inline int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, 113static inline int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
108 struct sk_buff *skb, 114 struct sk_buff *skb,
115 u16 family,
109 struct avc_audit_data *ad) 116 struct avc_audit_data *ad)
110{ 117{
111 return 0; 118 return 0;
diff --git a/security/selinux/include/netnode.h b/security/selinux/include/netnode.h
new file mode 100644
index 000000000000..1b94450d11d2
--- /dev/null
+++ b/security/selinux/include/netnode.h
@@ -0,0 +1,32 @@
1/*
2 * Network node table
3 *
4 * SELinux must keep a mapping of network nodes to labels/SIDs. This
5 * mapping is maintained as part of the normal policy but a fast cache is
6 * needed to reduce the lookup overhead since most of these queries happen on
7 * a per-packet basis.
8 *
9 * Author: Paul Moore <paul.moore@hp.com>
10 *
11 */
12
13/*
14 * (c) Copyright Hewlett-Packard Development Company, L.P., 2007
15 *
16 * This program is free software: you can redistribute it and/or modify
17 * it under the terms of version 2 of the GNU General Public License as
18 * published by the Free Software Foundation.
19 *
20 * This program is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 */
26
27#ifndef _SELINUX_NETNODE_H
28#define _SELINUX_NETNODE_H
29
30int sel_netnode_sid(void *addr, u16 family, u32 *sid);
31
32#endif
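As with the interface table, the node table exposes a single lookup routine. A short sketch of the call pattern used by the postroute hook above; addrp, family, peer_sid and ad are assumed to come from the caller, as they do in hooks.c:

/* Hypothetical illustration only: addrp points at the IPv4/IPv6 address in
 * network byte order, exactly as selinux_parse_skb() produces it. */
u32 node_sid;
int err;

err = sel_netnode_sid(addrp, family, &node_sid);
if (err == 0)
	err = avc_has_perm(peer_sid, node_sid, SECCLASS_NODE,
			   NODE__SENDTO, ad);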
diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h
index 4138a80f8e27..c6c2bb4ebacc 100644
--- a/security/selinux/include/objsec.h
+++ b/security/selinux/include/objsec.h
@@ -96,17 +96,25 @@ struct bprm_security_struct {
96}; 96};
97 97
98struct netif_security_struct { 98struct netif_security_struct {
99 struct net_device *dev; /* back pointer */ 99 int ifindex; /* device index */
100 u32 if_sid; /* SID for this interface */ 100 u32 sid; /* SID for this interface */
101 u32 msg_sid; /* default SID for messages received on this interface */ 101};
102
103struct netnode_security_struct {
104 union {
105 __be32 ipv4; /* IPv4 node address */
106 struct in6_addr ipv6; /* IPv6 node address */
107 } addr;
108 u32 sid; /* SID for this node */
109 u16 family; /* address family */
102}; 110};
103 111
104struct sk_security_struct { 112struct sk_security_struct {
105 struct sock *sk; /* back pointer to sk object */ 113 struct sock *sk; /* back pointer to sk object */
106 u32 sid; /* SID of this object */ 114 u32 sid; /* SID of this object */
107 u32 peer_sid; /* SID of peer */ 115 u32 peer_sid; /* SID of peer */
108#ifdef CONFIG_NETLABEL
109 u16 sclass; /* sock security class */ 116 u16 sclass; /* sock security class */
117#ifdef CONFIG_NETLABEL
110 enum { /* NetLabel state */ 118 enum { /* NetLabel state */
111 NLBL_UNSET = 0, 119 NLBL_UNSET = 0,
112 NLBL_REQUIRE, 120 NLBL_REQUIRE,
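A hypothetical sketch of how one cached IPv4 entry maps onto the new netnode_security_struct; the iph pointer and node_sid value are placeholders, not code from the patch:

/* Hypothetical illustration only */
struct netnode_security_struct nsec = {
	.addr.ipv4 = iph->daddr,	/* IPv4 address, network byte order */
	.family    = PF_INET,		/* selects which union member is valid */
	.sid       = node_sid,		/* SID assigned by the security server */
};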
diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h
index 39337afffec2..23137c17f917 100644
--- a/security/selinux/include/security.h
+++ b/security/selinux/include/security.h
@@ -25,13 +25,14 @@
25#define POLICYDB_VERSION_MLS 19 25#define POLICYDB_VERSION_MLS 19
26#define POLICYDB_VERSION_AVTAB 20 26#define POLICYDB_VERSION_AVTAB 20
27#define POLICYDB_VERSION_RANGETRANS 21 27#define POLICYDB_VERSION_RANGETRANS 21
28#define POLICYDB_VERSION_POLCAP 22
28 29
29/* Range of policy versions we understand*/ 30/* Range of policy versions we understand*/
30#define POLICYDB_VERSION_MIN POLICYDB_VERSION_BASE 31#define POLICYDB_VERSION_MIN POLICYDB_VERSION_BASE
31#ifdef CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX 32#ifdef CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX
32#define POLICYDB_VERSION_MAX CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX_VALUE 33#define POLICYDB_VERSION_MAX CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX_VALUE
33#else 34#else
34#define POLICYDB_VERSION_MAX POLICYDB_VERSION_RANGETRANS 35#define POLICYDB_VERSION_MAX POLICYDB_VERSION_POLCAP
35#endif 36#endif
36 37
37struct netlbl_lsm_secattr; 38struct netlbl_lsm_secattr;
@@ -39,8 +40,19 @@ struct netlbl_lsm_secattr;
39extern int selinux_enabled; 40extern int selinux_enabled;
40extern int selinux_mls_enabled; 41extern int selinux_mls_enabled;
41 42
43/* Policy capabilities */
44enum {
45 POLICYDB_CAPABILITY_NETPEER,
46 __POLICYDB_CAPABILITY_MAX
47};
48#define POLICYDB_CAPABILITY_MAX (__POLICYDB_CAPABILITY_MAX - 1)
49
50extern int selinux_policycap_netpeer;
51
42int security_load_policy(void * data, size_t len); 52int security_load_policy(void * data, size_t len);
43 53
54int security_policycap_supported(unsigned int req_cap);
55
44#define SEL_VEC_MAX 32 56#define SEL_VEC_MAX 32
45struct av_decision { 57struct av_decision {
46 u32 allowed; 58 u32 allowed;
@@ -77,8 +89,7 @@ int security_get_user_sids(u32 callsid, char *username,
77int security_port_sid(u16 domain, u16 type, u8 protocol, u16 port, 89int security_port_sid(u16 domain, u16 type, u8 protocol, u16 port,
78 u32 *out_sid); 90 u32 *out_sid);
79 91
80int security_netif_sid(char *name, u32 *if_sid, 92int security_netif_sid(char *name, u32 *if_sid);
81 u32 *msg_sid);
82 93
83int security_node_sid(u16 domain, void *addr, u32 addrlen, 94int security_node_sid(u16 domain, void *addr, u32 addrlen,
84 u32 *out_sid); 95 u32 *out_sid);
@@ -88,10 +99,15 @@ int security_validate_transition(u32 oldsid, u32 newsid, u32 tasksid,
88 99
89int security_sid_mls_copy(u32 sid, u32 mls_sid, u32 *new_sid); 100int security_sid_mls_copy(u32 sid, u32 mls_sid, u32 *new_sid);
90 101
102int security_net_peersid_resolve(u32 nlbl_sid, u32 nlbl_type,
103 u32 xfrm_sid,
104 u32 *peer_sid);
105
91int security_get_classes(char ***classes, int *nclasses); 106int security_get_classes(char ***classes, int *nclasses);
92int security_get_permissions(char *class, char ***perms, int *nperms); 107int security_get_permissions(char *class, char ***perms, int *nperms);
93int security_get_reject_unknown(void); 108int security_get_reject_unknown(void);
94int security_get_allow_unknown(void); 109int security_get_allow_unknown(void);
110int security_get_policycaps(int *len, int **values);
95 111
96#define SECURITY_FS_USE_XATTR 1 /* use xattr */ 112#define SECURITY_FS_USE_XATTR 1 /* use xattr */
97#define SECURITY_FS_USE_TRANS 2 /* use transition SIDs, e.g. devpts/tmpfs */ 113#define SECURITY_FS_USE_TRANS 2 /* use transition SIDs, e.g. devpts/tmpfs */
@@ -108,7 +124,6 @@ int security_genfs_sid(const char *fstype, char *name, u16 sclass,
108 124
109#ifdef CONFIG_NETLABEL 125#ifdef CONFIG_NETLABEL
110int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, 126int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr,
111 u32 base_sid,
112 u32 *sid); 127 u32 *sid);
113 128
114int security_netlbl_sid_to_secattr(u32 sid, 129int security_netlbl_sid_to_secattr(u32 sid,
@@ -116,7 +131,6 @@ int security_netlbl_sid_to_secattr(u32 sid,
116#else 131#else
117static inline int security_netlbl_secattr_to_sid( 132static inline int security_netlbl_secattr_to_sid(
118 struct netlbl_lsm_secattr *secattr, 133 struct netlbl_lsm_secattr *secattr,
119 u32 base_sid,
120 u32 *sid) 134 u32 *sid)
121{ 135{
122 return -EIDRM; 136 return -EIDRM;
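The netpeer policy capability is what lets existing policies keep the legacy behaviour. A minimal sketch of how the exported flag gates the new code paths, taken from the pattern used in hooks.c above; the security server decides whether to set the flag at policy-load time, and security_policycap_supported() exposes the same information to other callers:

/* Pattern used by the new netfilter hooks: skip the netpeer-based checks
 * entirely when the loaded policy does not advertise the capability. */
if (!selinux_policycap_netpeer)
	return NF_ACCEPT;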
diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h
index 31929e39f5ca..36b0510efa7b 100644
--- a/security/selinux/include/xfrm.h
+++ b/security/selinux/include/xfrm.h
@@ -32,6 +32,13 @@ static inline struct inode_security_struct *get_sock_isec(struct sock *sk)
32} 32}
33 33
34#ifdef CONFIG_SECURITY_NETWORK_XFRM 34#ifdef CONFIG_SECURITY_NETWORK_XFRM
35extern atomic_t selinux_xfrm_refcount;
36
37static inline int selinux_xfrm_enabled(void)
38{
39 return (atomic_read(&selinux_xfrm_refcount) > 0);
40}
41
35int selinux_xfrm_sock_rcv_skb(u32 sid, struct sk_buff *skb, 42int selinux_xfrm_sock_rcv_skb(u32 sid, struct sk_buff *skb,
36 struct avc_audit_data *ad); 43 struct avc_audit_data *ad);
37int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb, 44int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb,
@@ -43,6 +50,11 @@ static inline void selinux_xfrm_notify_policyload(void)
43 atomic_inc(&flow_cache_genid); 50 atomic_inc(&flow_cache_genid);
44} 51}
45#else 52#else
53static inline int selinux_xfrm_enabled(void)
54{
55 return 0;
56}
57
46static inline int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb, 58static inline int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb,
47 struct avc_audit_data *ad) 59 struct avc_audit_data *ad)
48{ 60{
diff --git a/security/selinux/netif.c b/security/selinux/netif.c
index e87ab948104c..013d3117a86b 100644
--- a/security/selinux/netif.c
+++ b/security/selinux/netif.c
@@ -7,6 +7,8 @@
7 * Author: James Morris <jmorris@redhat.com> 7 * Author: James Morris <jmorris@redhat.com>
8 * 8 *
9 * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com> 9 * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
10 * Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
11 * Paul Moore <paul.moore@hp.com>
10 * 12 *
11 * This program is free software; you can redistribute it and/or modify 13 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License version 2, 14 * it under the terms of the GNU General Public License version 2,
@@ -29,14 +31,6 @@
29#define SEL_NETIF_HASH_SIZE 64 31#define SEL_NETIF_HASH_SIZE 64
30#define SEL_NETIF_HASH_MAX 1024 32#define SEL_NETIF_HASH_MAX 1024
31 33
32#undef DEBUG
33
34#ifdef DEBUG
35#define DEBUGP printk
36#else
37#define DEBUGP(format, args...)
38#endif
39
40struct sel_netif 34struct sel_netif
41{ 35{
42 struct list_head list; 36 struct list_head list;
@@ -49,174 +43,226 @@ static LIST_HEAD(sel_netif_list);
49static DEFINE_SPINLOCK(sel_netif_lock); 43static DEFINE_SPINLOCK(sel_netif_lock);
50static struct list_head sel_netif_hash[SEL_NETIF_HASH_SIZE]; 44static struct list_head sel_netif_hash[SEL_NETIF_HASH_SIZE];
51 45
52static inline u32 sel_netif_hasfn(struct net_device *dev) 46/**
47 * sel_netif_hashfn - Hashing function for the interface table
48 * @ifindex: the network interface
49 *
50 * Description:
 51 * This is the hashing function for the network interface table; it returns the
52 * bucket number for the given interface.
53 *
54 */
55static inline u32 sel_netif_hashfn(int ifindex)
53{ 56{
54 return (dev->ifindex & (SEL_NETIF_HASH_SIZE - 1)); 57 return (ifindex & (SEL_NETIF_HASH_SIZE - 1));
55} 58}
56 59
57/* 60/**
58 * All of the devices should normally fit in the hash, so we optimize 61 * sel_netif_find - Search for an interface record
59 * for that case. 62 * @ifindex: the network interface
63 *
64 * Description:
65 * Search the network interface table and return the record matching @ifindex.
 66 * If an entry cannot be found in the table, return NULL.
67 *
60 */ 68 */
61static inline struct sel_netif *sel_netif_find(struct net_device *dev) 69static inline struct sel_netif *sel_netif_find(int ifindex)
62{ 70{
63 struct list_head *pos; 71 int idx = sel_netif_hashfn(ifindex);
64 int idx = sel_netif_hasfn(dev); 72 struct sel_netif *netif;
65 73
66 __list_for_each_rcu(pos, &sel_netif_hash[idx]) { 74 list_for_each_entry_rcu(netif, &sel_netif_hash[idx], list)
67 struct sel_netif *netif = list_entry(pos, 75 /* all of the devices should normally fit in the hash, so we
68 struct sel_netif, list); 76 * optimize for that case */
69 if (likely(netif->nsec.dev == dev)) 77 if (likely(netif->nsec.ifindex == ifindex))
70 return netif; 78 return netif;
71 } 79
72 return NULL; 80 return NULL;
73} 81}
74 82
83/**
84 * sel_netif_insert - Insert a new interface into the table
85 * @netif: the new interface record
86 *
87 * Description:
88 * Add a new interface record to the network interface hash table. Returns
89 * zero on success, negative values on failure.
90 *
91 */
75static int sel_netif_insert(struct sel_netif *netif) 92static int sel_netif_insert(struct sel_netif *netif)
76{ 93{
77 int idx, ret = 0; 94 int idx;
78 95
79 if (sel_netif_total >= SEL_NETIF_HASH_MAX) { 96 if (sel_netif_total >= SEL_NETIF_HASH_MAX)
80 ret = -ENOSPC; 97 return -ENOSPC;
81 goto out;
82 }
83 98
84 idx = sel_netif_hasfn(netif->nsec.dev); 99 idx = sel_netif_hashfn(netif->nsec.ifindex);
85 list_add_rcu(&netif->list, &sel_netif_hash[idx]); 100 list_add_rcu(&netif->list, &sel_netif_hash[idx]);
86 sel_netif_total++; 101 sel_netif_total++;
87out: 102
88 return ret; 103 return 0;
89} 104}
90 105
106/**
107 * sel_netif_free - Frees an interface entry
108 * @p: the entry's RCU field
109 *
110 * Description:
111 * This function is designed to be used as a callback to the call_rcu()
112 * function so that memory allocated to a hash table interface entry can be
113 * released safely.
114 *
115 */
91static void sel_netif_free(struct rcu_head *p) 116static void sel_netif_free(struct rcu_head *p)
92{ 117{
93 struct sel_netif *netif = container_of(p, struct sel_netif, rcu_head); 118 struct sel_netif *netif = container_of(p, struct sel_netif, rcu_head);
94
95 DEBUGP("%s: %s\n", __FUNCTION__, netif->nsec.dev->name);
96 kfree(netif); 119 kfree(netif);
97} 120}
98 121
122/**
123 * sel_netif_destroy - Remove an interface record from the table
124 * @netif: the existing interface record
125 *
126 * Description:
127 * Remove an existing interface record from the network interface table.
128 *
129 */
99static void sel_netif_destroy(struct sel_netif *netif) 130static void sel_netif_destroy(struct sel_netif *netif)
100{ 131{
101 DEBUGP("%s: %s\n", __FUNCTION__, netif->nsec.dev->name);
102
103 list_del_rcu(&netif->list); 132 list_del_rcu(&netif->list);
104 sel_netif_total--; 133 sel_netif_total--;
105 call_rcu(&netif->rcu_head, sel_netif_free); 134 call_rcu(&netif->rcu_head, sel_netif_free);
106} 135}
107 136
108static struct sel_netif *sel_netif_lookup(struct net_device *dev) 137/**
138 * sel_netif_sid_slow - Lookup the SID of a network interface using the policy
139 * @ifindex: the network interface
140 * @sid: interface SID
141 *
142 * Description:
143 * This function determines the SID of a network interface by quering the
144 * security policy. The result is added to the network interface table to
145 * speedup future queries. Returns zero on success, negative values on
146 * failure.
147 *
148 */
149static int sel_netif_sid_slow(int ifindex, u32 *sid)
109{ 150{
110 int ret; 151 int ret;
111 struct sel_netif *netif, *new; 152 struct sel_netif *netif;
112 struct netif_security_struct *nsec; 153 struct sel_netif *new = NULL;
113 154 struct net_device *dev;
114 netif = sel_netif_find(dev); 155
115 if (likely(netif != NULL)) 156 /* NOTE: we always use init's network namespace since we don't
116 goto out; 157 * currently support containers */
117 158
118 new = kzalloc(sizeof(*new), GFP_ATOMIC); 159 dev = dev_get_by_index(&init_net, ifindex);
119 if (!new) { 160 if (unlikely(dev == NULL)) {
120 netif = ERR_PTR(-ENOMEM); 161 printk(KERN_WARNING
121 goto out; 162 "SELinux: failure in sel_netif_sid_slow(),"
163 " invalid network interface (%d)\n", ifindex);
164 return -ENOENT;
122 } 165 }
123
124 nsec = &new->nsec;
125 166
126 ret = security_netif_sid(dev->name, &nsec->if_sid, &nsec->msg_sid); 167 spin_lock_bh(&sel_netif_lock);
127 if (ret < 0) { 168 netif = sel_netif_find(ifindex);
128 kfree(new); 169 if (netif != NULL) {
129 netif = ERR_PTR(ret); 170 *sid = netif->nsec.sid;
171 ret = 0;
130 goto out; 172 goto out;
131 } 173 }
132 174 new = kzalloc(sizeof(*new), GFP_ATOMIC);
133 nsec->dev = dev; 175 if (new == NULL) {
134 176 ret = -ENOMEM;
135 spin_lock_bh(&sel_netif_lock);
136
137 netif = sel_netif_find(dev);
138 if (netif) {
139 spin_unlock_bh(&sel_netif_lock);
140 kfree(new);
141 goto out; 177 goto out;
142 } 178 }
143 179 ret = security_netif_sid(dev->name, &new->nsec.sid);
180 if (ret != 0)
181 goto out;
182 new->nsec.ifindex = ifindex;
144 ret = sel_netif_insert(new); 183 ret = sel_netif_insert(new);
145 spin_unlock_bh(&sel_netif_lock); 184 if (ret != 0)
146
147 if (ret) {
148 kfree(new);
149 netif = ERR_PTR(ret);
150 goto out; 185 goto out;
151 } 186 *sid = new->nsec.sid;
152 187
153 netif = new;
154
155 DEBUGP("new: ifindex=%u name=%s if_sid=%u msg_sid=%u\n", dev->ifindex, dev->name,
156 nsec->if_sid, nsec->msg_sid);
157out: 188out:
158 return netif; 189 spin_unlock_bh(&sel_netif_lock);
159} 190 dev_put(dev);
160 191 if (unlikely(ret)) {
161static void sel_netif_assign_sids(u32 if_sid_in, u32 msg_sid_in, u32 *if_sid_out, u32 *msg_sid_out) 192 printk(KERN_WARNING
162{ 193 "SELinux: failure in sel_netif_sid_slow(),"
163 if (if_sid_out) 194 " unable to determine network interface label (%d)\n",
164 *if_sid_out = if_sid_in; 195 ifindex);
165 if (msg_sid_out) 196 kfree(new);
166 *msg_sid_out = msg_sid_in; 197 }
167}
168
169static int sel_netif_sids_slow(struct net_device *dev, u32 *if_sid, u32 *msg_sid)
170{
171 int ret = 0;
172 u32 tmp_if_sid, tmp_msg_sid;
173
174 ret = security_netif_sid(dev->name, &tmp_if_sid, &tmp_msg_sid);
175 if (!ret)
176 sel_netif_assign_sids(tmp_if_sid, tmp_msg_sid, if_sid, msg_sid);
177 return ret; 198 return ret;
178} 199}
179 200
180int sel_netif_sids(struct net_device *dev, u32 *if_sid, u32 *msg_sid) 201/**
202 * sel_netif_sid - Lookup the SID of a network interface
203 * @ifindex: the network interface
204 * @sid: interface SID
205 *
206 * Description:
207 * This function determines the SID of a network interface using the fastest
208 * method possible. First the interface table is queried, but if an entry
209 * can't be found then the policy is queried and the result is added to the
 210 * table to speed up future queries. Returns zero on success, negative values
211 * on failure.
212 *
213 */
214int sel_netif_sid(int ifindex, u32 *sid)
181{ 215{
182 int ret = 0;
183 struct sel_netif *netif; 216 struct sel_netif *netif;
184 217
185 rcu_read_lock(); 218 rcu_read_lock();
186 netif = sel_netif_lookup(dev); 219 netif = sel_netif_find(ifindex);
187 if (IS_ERR(netif)) { 220 if (likely(netif != NULL)) {
221 *sid = netif->nsec.sid;
188 rcu_read_unlock(); 222 rcu_read_unlock();
189 ret = sel_netif_sids_slow(dev, if_sid, msg_sid); 223 return 0;
190 goto out;
191 } 224 }
192 sel_netif_assign_sids(netif->nsec.if_sid, netif->nsec.msg_sid, if_sid, msg_sid);
193 rcu_read_unlock(); 225 rcu_read_unlock();
194out: 226
195 return ret; 227 return sel_netif_sid_slow(ifindex, sid);
196} 228}
197 229
198static void sel_netif_kill(struct net_device *dev) 230/**
231 * sel_netif_kill - Remove an entry from the network interface table
232 * @ifindex: the network interface
233 *
234 * Description:
235 * This function removes the entry matching @ifindex from the network interface
236 * table if it exists.
237 *
238 */
239static void sel_netif_kill(int ifindex)
199{ 240{
200 struct sel_netif *netif; 241 struct sel_netif *netif;
201 242
202 spin_lock_bh(&sel_netif_lock); 243 spin_lock_bh(&sel_netif_lock);
203 netif = sel_netif_find(dev); 244 netif = sel_netif_find(ifindex);
204 if (netif) 245 if (netif)
205 sel_netif_destroy(netif); 246 sel_netif_destroy(netif);
206 spin_unlock_bh(&sel_netif_lock); 247 spin_unlock_bh(&sel_netif_lock);
207} 248}
208 249
250/**
251 * sel_netif_flush - Flush the entire network interface table
252 *
253 * Description:
254 * Remove all entries from the network interface table.
255 *
256 */
209static void sel_netif_flush(void) 257static void sel_netif_flush(void)
210{ 258{
211 int idx; 259 int idx;
260 struct sel_netif *netif;
212 261
213 spin_lock_bh(&sel_netif_lock); 262 spin_lock_bh(&sel_netif_lock);
214 for (idx = 0; idx < SEL_NETIF_HASH_SIZE; idx++) { 263 for (idx = 0; idx < SEL_NETIF_HASH_SIZE; idx++)
215 struct sel_netif *netif;
216
217 list_for_each_entry(netif, &sel_netif_hash[idx], list) 264 list_for_each_entry(netif, &sel_netif_hash[idx], list)
218 sel_netif_destroy(netif); 265 sel_netif_destroy(netif);
219 }
220 spin_unlock_bh(&sel_netif_lock); 266 spin_unlock_bh(&sel_netif_lock);
221} 267}
222 268
@@ -239,7 +285,7 @@ static int sel_netif_netdev_notifier_handler(struct notifier_block *this,
239 return NOTIFY_DONE; 285 return NOTIFY_DONE;
240 286
241 if (event == NETDEV_DOWN) 287 if (event == NETDEV_DOWN)
242 sel_netif_kill(dev); 288 sel_netif_kill(dev->ifindex);
243 289
244 return NOTIFY_DONE; 290 return NOTIFY_DONE;
245} 291}
@@ -250,10 +296,10 @@ static struct notifier_block sel_netif_netdev_notifier = {
250 296
251static __init int sel_netif_init(void) 297static __init int sel_netif_init(void)
252{ 298{
253 int i, err = 0; 299 int i, err;
254 300
255 if (!selinux_enabled) 301 if (!selinux_enabled)
256 goto out; 302 return 0;
257 303
258 for (i = 0; i < SEL_NETIF_HASH_SIZE; i++) 304 for (i = 0; i < SEL_NETIF_HASH_SIZE; i++)
259 INIT_LIST_HEAD(&sel_netif_hash[i]); 305 INIT_LIST_HEAD(&sel_netif_hash[i]);
@@ -265,7 +311,6 @@ static __init int sel_netif_init(void)
265 if (err) 311 if (err)
266 panic("avc_add_callback() failed, error %d\n", err); 312 panic("avc_add_callback() failed, error %d\n", err);
267 313
268out:
269 return err; 314 return err;
270} 315}
271 316
diff --git a/security/selinux/netlabel.c b/security/selinux/netlabel.c
index 66e013d6f6f6..0fa2be4149e8 100644
--- a/security/selinux/netlabel.c
+++ b/security/selinux/netlabel.c
@@ -36,6 +36,33 @@
36#include "security.h" 36#include "security.h"
37 37
38/** 38/**
39 * selinux_netlbl_sidlookup_cached - Cache a SID lookup
40 * @skb: the packet
41 * @secattr: the NetLabel security attributes
42 * @sid: the SID
43 *
44 * Description:
 45 * Query the SELinux security server to look up the correct SID for the given
46 * security attributes. If the query is successful, cache the result to speed
47 * up future lookups. Returns zero on success, negative values on failure.
48 *
49 */
50static int selinux_netlbl_sidlookup_cached(struct sk_buff *skb,
51 struct netlbl_lsm_secattr *secattr,
52 u32 *sid)
53{
54 int rc;
55
56 rc = security_netlbl_secattr_to_sid(secattr, sid);
57 if (rc == 0 &&
58 (secattr->flags & NETLBL_SECATTR_CACHEABLE) &&
59 (secattr->flags & NETLBL_SECATTR_CACHE))
60 netlbl_cache_add(skb, secattr);
61
62 return rc;
63}
64
65/**
39 * selinux_netlbl_sock_setsid - Label a socket using the NetLabel mechanism 66 * selinux_netlbl_sock_setsid - Label a socket using the NetLabel mechanism
40 * @sk: the socket to label 67 * @sk: the socket to label
41 * @sid: the SID to use 68 * @sid: the SID to use
@@ -137,14 +164,14 @@ void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec,
137 * lock as other threads could have access to ssec */ 164 * lock as other threads could have access to ssec */
138 rcu_read_lock(); 165 rcu_read_lock();
139 selinux_netlbl_sk_security_reset(newssec, ssec->sk->sk_family); 166 selinux_netlbl_sk_security_reset(newssec, ssec->sk->sk_family);
140 newssec->sclass = ssec->sclass;
141 rcu_read_unlock(); 167 rcu_read_unlock();
142} 168}
143 169
144/** 170/**
145 * selinux_netlbl_skbuff_getsid - Get the sid of a packet using NetLabel 171 * selinux_netlbl_skbuff_getsid - Get the sid of a packet using NetLabel
146 * @skb: the packet 172 * @skb: the packet
147 * @base_sid: the SELinux SID to use as a context for MLS only attributes 173 * @family: protocol family
174 * @type: NetLabel labeling protocol type
148 * @sid: the SID 175 * @sid: the SID
149 * 176 *
150 * Description: 177 * Description:
@@ -153,7 +180,10 @@ void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec,
153 * assign to the packet. Returns zero on success, negative values on failure. 180 * assign to the packet. Returns zero on success, negative values on failure.
154 * 181 *
155 */ 182 */
156int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid) 183int selinux_netlbl_skbuff_getsid(struct sk_buff *skb,
184 u16 family,
185 u32 *type,
186 u32 *sid)
157{ 187{
158 int rc; 188 int rc;
159 struct netlbl_lsm_secattr secattr; 189 struct netlbl_lsm_secattr secattr;
@@ -164,15 +194,12 @@ int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid)
164 } 194 }
165 195
166 netlbl_secattr_init(&secattr); 196 netlbl_secattr_init(&secattr);
167 rc = netlbl_skbuff_getattr(skb, &secattr); 197 rc = netlbl_skbuff_getattr(skb, family, &secattr);
168 if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) { 198 if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE)
169 rc = security_netlbl_secattr_to_sid(&secattr, base_sid, sid); 199 rc = selinux_netlbl_sidlookup_cached(skb, &secattr, sid);
170 if (rc == 0 && 200 else
171 (secattr.flags & NETLBL_SECATTR_CACHEABLE) &&
172 (secattr.flags & NETLBL_SECATTR_CACHE))
173 netlbl_cache_add(skb, &secattr);
174 } else
175 *sid = SECSID_NULL; 201 *sid = SECSID_NULL;
202 *type = secattr.type;
176 netlbl_secattr_destroy(&secattr); 203 netlbl_secattr_destroy(&secattr);
177 204
178 return rc; 205 return rc;
@@ -190,13 +217,10 @@ int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid)
190 */ 217 */
191void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock) 218void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock)
192{ 219{
193 struct inode_security_struct *isec = SOCK_INODE(sock)->i_security;
194 struct sk_security_struct *sksec = sk->sk_security; 220 struct sk_security_struct *sksec = sk->sk_security;
195 struct netlbl_lsm_secattr secattr; 221 struct netlbl_lsm_secattr secattr;
196 u32 nlbl_peer_sid; 222 u32 nlbl_peer_sid;
197 223
198 sksec->sclass = isec->sclass;
199
200 rcu_read_lock(); 224 rcu_read_lock();
201 225
202 if (sksec->nlbl_state != NLBL_REQUIRE) { 226 if (sksec->nlbl_state != NLBL_REQUIRE) {
@@ -207,9 +231,7 @@ void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock)
207 netlbl_secattr_init(&secattr); 231 netlbl_secattr_init(&secattr);
208 if (netlbl_sock_getattr(sk, &secattr) == 0 && 232 if (netlbl_sock_getattr(sk, &secattr) == 0 &&
209 secattr.flags != NETLBL_SECATTR_NONE && 233 secattr.flags != NETLBL_SECATTR_NONE &&
210 security_netlbl_secattr_to_sid(&secattr, 234 security_netlbl_secattr_to_sid(&secattr, &nlbl_peer_sid) == 0)
211 SECINITSID_NETMSG,
212 &nlbl_peer_sid) == 0)
213 sksec->peer_sid = nlbl_peer_sid; 235 sksec->peer_sid = nlbl_peer_sid;
214 netlbl_secattr_destroy(&secattr); 236 netlbl_secattr_destroy(&secattr);
215 237
@@ -234,11 +256,8 @@ int selinux_netlbl_socket_post_create(struct socket *sock)
234{ 256{
235 int rc = 0; 257 int rc = 0;
236 struct sock *sk = sock->sk; 258 struct sock *sk = sock->sk;
237 struct inode_security_struct *isec = SOCK_INODE(sock)->i_security;
238 struct sk_security_struct *sksec = sk->sk_security; 259 struct sk_security_struct *sksec = sk->sk_security;
239 260
240 sksec->sclass = isec->sclass;
241
242 rcu_read_lock(); 261 rcu_read_lock();
243 if (sksec->nlbl_state == NLBL_REQUIRE) 262 if (sksec->nlbl_state == NLBL_REQUIRE)
244 rc = selinux_netlbl_sock_setsid(sk, sksec->sid); 263 rc = selinux_netlbl_sock_setsid(sk, sksec->sid);
@@ -292,6 +311,7 @@ int selinux_netlbl_inode_permission(struct inode *inode, int mask)
292 * selinux_netlbl_sock_rcv_skb - Do an inbound access check using NetLabel 311 * selinux_netlbl_sock_rcv_skb - Do an inbound access check using NetLabel
293 * @sksec: the sock's sk_security_struct 312 * @sksec: the sock's sk_security_struct
294 * @skb: the packet 313 * @skb: the packet
314 * @family: protocol family
295 * @ad: the audit data 315 * @ad: the audit data
296 * 316 *
297 * Description: 317 * Description:
@@ -302,6 +322,7 @@ int selinux_netlbl_inode_permission(struct inode *inode, int mask)
302 */ 322 */
303int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, 323int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
304 struct sk_buff *skb, 324 struct sk_buff *skb,
325 u16 family,
305 struct avc_audit_data *ad) 326 struct avc_audit_data *ad)
306{ 327{
307 int rc; 328 int rc;
@@ -313,16 +334,10 @@ int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
313 return 0; 334 return 0;
314 335
315 netlbl_secattr_init(&secattr); 336 netlbl_secattr_init(&secattr);
316 rc = netlbl_skbuff_getattr(skb, &secattr); 337 rc = netlbl_skbuff_getattr(skb, family, &secattr);
317 if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) { 338 if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE)
318 rc = security_netlbl_secattr_to_sid(&secattr, 339 rc = selinux_netlbl_sidlookup_cached(skb, &secattr, &nlbl_sid);
319 SECINITSID_NETMSG, 340 else
320 &nlbl_sid);
321 if (rc == 0 &&
322 (secattr.flags & NETLBL_SECATTR_CACHEABLE) &&
323 (secattr.flags & NETLBL_SECATTR_CACHE))
324 netlbl_cache_add(skb, &secattr);
325 } else
326 nlbl_sid = SECINITSID_UNLABELED; 341 nlbl_sid = SECINITSID_UNLABELED;
327 netlbl_secattr_destroy(&secattr); 342 netlbl_secattr_destroy(&secattr);
328 if (rc != 0) 343 if (rc != 0)
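
The two hunks above replace open-coded conversion-plus-caching with the new selinux_netlbl_sidlookup_cached() helper, so skbuff_getsid() and sock_rcv_skb() can no longer drift apart. A small compilable userspace model of the helper's decision logic follows; the flag values, struct layout and the stubbed conversion routine are invented for illustration and are not the NetLabel API.

  #include <stdio.h>

  #define ATTR_CACHE     0x01u   /* stand-in for NETLBL_SECATTR_CACHE */
  #define ATTR_CACHEABLE 0x02u   /* stand-in for NETLBL_SECATTR_CACHEABLE */

  struct secattr {
  	unsigned int flags;
  };

  static int cache_adds;  /* counts how often netlbl_cache_add() would be called */

  /* stub: pretend the security server maps every secattr to SID 7 */
  static int secattr_to_sid(const struct secattr *attr, unsigned int *sid)
  {
  	(void)attr;
  	*sid = 7;
  	return 0;
  }

  /* model of selinux_netlbl_sidlookup_cached(): convert first, then cache the
   * result only if the conversion succeeded and the packet is marked both
   * cacheable and able to hold a cache entry */
  static int sidlookup_cached(struct secattr *attr, unsigned int *sid)
  {
  	int rc = secattr_to_sid(attr, sid);

  	if (rc == 0 &&
  	    (attr->flags & ATTR_CACHEABLE) &&
  	    (attr->flags & ATTR_CACHE))
  		cache_adds++;
  	return rc;
  }

  int main(void)
  {
  	unsigned int sid;
  	struct secattr a = { ATTR_CACHE | ATTR_CACHEABLE };
  	struct secattr b = { ATTR_CACHEABLE };

  	sidlookup_cached(&a, &sid);   /* conversion + cache add */
  	sidlookup_cached(&b, &sid);   /* conversion only: no cache slot flag */
  	printf("sid=%u cache_adds=%d\n", sid, cache_adds);
  	return 0;
  }
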
diff --git a/security/selinux/netnode.c b/security/selinux/netnode.c
new file mode 100644
index 000000000000..f3c526f2cacb
--- /dev/null
+++ b/security/selinux/netnode.c
@@ -0,0 +1,354 @@
1/*
2 * Network node table
3 *
4 * SELinux must keep a mapping of network nodes to labels/SIDs. This
5 * mapping is maintained as part of the normal policy but a fast cache is
6 * needed to reduce the lookup overhead since most of these queries happen on
7 * a per-packet basis.
8 *
9 * Author: Paul Moore <paul.moore@hp.com>
10 *
11 * This code is heavily based on the "netif" concept originally developed by
12 * James Morris <jmorris@redhat.com>
13 * (see security/selinux/netif.c for more information)
14 *
15 */
16
17/*
18 * (c) Copyright Hewlett-Packard Development Company, L.P., 2007
19 *
20 * This program is free software: you can redistribute it and/or modify
21 * it under the terms of version 2 of the GNU General Public License as
22 * published by the Free Software Foundation.
23 *
24 * This program is distributed in the hope that it will be useful,
25 * but WITHOUT ANY WARRANTY; without even the implied warranty of
26 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 * GNU General Public License for more details.
28 *
29 */
30
31#include <linux/types.h>
32#include <linux/rcupdate.h>
33#include <linux/list.h>
34#include <linux/spinlock.h>
35#include <linux/in.h>
36#include <linux/in6.h>
37#include <linux/ip.h>
38#include <linux/ipv6.h>
39#include <net/ip.h>
40#include <net/ipv6.h>
41#include <asm/bug.h>
42
43#include "objsec.h"
44
45#define SEL_NETNODE_HASH_SIZE 256
46#define SEL_NETNODE_HASH_BKT_LIMIT 16
47
48struct sel_netnode {
49 struct netnode_security_struct nsec;
50
51 struct list_head list;
52 struct rcu_head rcu;
53};
54
55/* NOTE: we are using a combined hash table for both IPv4 and IPv6, the reason
56 * for this is that I suspect most users will not make heavy use of both
57 * address families at the same time so one table will usually end up wasted,
58 * if this becomes a problem we can always add a hash table for each address
59 * family later */
60
61static LIST_HEAD(sel_netnode_list);
62static DEFINE_SPINLOCK(sel_netnode_lock);
63static struct list_head sel_netnode_hash[SEL_NETNODE_HASH_SIZE];
64
65/**
66 * sel_netnode_free - Frees a node entry
67 * @p: the entry's RCU field
68 *
69 * Description:
70 * This function is designed to be used as a callback to the call_rcu()
71 * function so that memory allocated to a hash table node entry can be
72 * released safely.
73 *
74 */
75static void sel_netnode_free(struct rcu_head *p)
76{
77 struct sel_netnode *node = container_of(p, struct sel_netnode, rcu);
78 kfree(node);
79}
80
81/**
82 * sel_netnode_hashfn_ipv4 - IPv4 hashing function for the node table
83 * @addr: IPv4 address
84 *
85 * Description:
 86 * This is the IPv4 hashing function for the network node table; it returns
 87 * the bucket number for the given IP address.
88 *
89 */
90static u32 sel_netnode_hashfn_ipv4(__be32 addr)
91{
92 /* at some point we should determine if the mismatch in byte order
93 * affects the hash function dramatically */
94 return (addr & (SEL_NETNODE_HASH_SIZE - 1));
95}
96
97/**
98 * sel_netnode_hashfn_ipv6 - IPv6 hashing function for the node table
99 * @addr: IPv6 address
100 *
101 * Description:
102 * This is the IPv6 hashing function for the network node table; it returns
103 * the bucket number for the given IP address.
104 *
105 */
106static u32 sel_netnode_hashfn_ipv6(const struct in6_addr *addr)
107{
108 /* just hash the least significant 32 bits to keep things fast (they
109 * are the most likely to be different anyway), we can revisit this
110 * later if needed */
111 return (addr->s6_addr32[3] & (SEL_NETNODE_HASH_SIZE - 1));
112}
113
114/**
115 * sel_netnode_find - Search for a node record
116 * @addr: IP address
117 * @family: address family
118 *
119 * Description:
120 * Search the network node table and return the record matching @addr. If an
121 * entry cannot be found in the table, return NULL.
122 *
123 */
124static struct sel_netnode *sel_netnode_find(const void *addr, u16 family)
125{
126 u32 idx;
127 struct sel_netnode *node;
128
129 switch (family) {
130 case PF_INET:
131 idx = sel_netnode_hashfn_ipv4(*(__be32 *)addr);
132 break;
133 case PF_INET6:
134 idx = sel_netnode_hashfn_ipv6(addr);
135 break;
136 default:
137 BUG();
138 }
139
140 list_for_each_entry_rcu(node, &sel_netnode_hash[idx], list)
141 if (node->nsec.family == family)
142 switch (family) {
143 case PF_INET:
144 if (node->nsec.addr.ipv4 == *(__be32 *)addr)
145 return node;
146 break;
147 case PF_INET6:
148 if (ipv6_addr_equal(&node->nsec.addr.ipv6,
149 addr))
150 return node;
151 break;
152 }
153
154 return NULL;
155}
156
157/**
158 * sel_netnode_insert - Insert a new node into the table
159 * @node: the new node record
160 *
161 * Description:
162 * Add a new node record to the network address hash table. Returns zero on
163 * success, negative values on failure.
164 *
165 */
166static int sel_netnode_insert(struct sel_netnode *node)
167{
168 u32 idx;
169 u32 count = 0;
170 struct sel_netnode *iter;
171
172 switch (node->nsec.family) {
173 case PF_INET:
174 idx = sel_netnode_hashfn_ipv4(node->nsec.addr.ipv4);
175 break;
176 case PF_INET6:
177 idx = sel_netnode_hashfn_ipv6(&node->nsec.addr.ipv6);
178 break;
179 default:
180 BUG();
181 }
182 list_add_rcu(&node->list, &sel_netnode_hash[idx]);
183
184 /* we need to impose a limit on the growth of the hash table so check
185 * this bucket to make sure it is within the specified bounds */
186 list_for_each_entry(iter, &sel_netnode_hash[idx], list)
187 if (++count > SEL_NETNODE_HASH_BKT_LIMIT) {
188 list_del_rcu(&iter->list);
189 call_rcu(&iter->rcu, sel_netnode_free);
190 break;
191 }
192
193 return 0;
194}
195
196/**
197 * sel_netnode_destroy - Remove a node record from the table
198 * @node: the existing node record
199 *
200 * Description:
201 * Remove an existing node record from the network address table.
202 *
203 */
204static void sel_netnode_destroy(struct sel_netnode *node)
205{
206 list_del_rcu(&node->list);
207 call_rcu(&node->rcu, sel_netnode_free);
208}
209
210/**
211 * sel_netnode_sid_slow - Lookup the SID of a network address using the policy
212 * @addr: the IP address
213 * @family: the address family
214 * @sid: node SID
215 *
216 * Description:
217 * This function determines the SID of a network address by querying the
218 * security policy.  The result is added to the network address table to
219 * speed up future queries.  Returns zero on success, negative values on
220 * failure.
221 *
222 */
223static int sel_netnode_sid_slow(void *addr, u16 family, u32 *sid)
224{
225 int ret;
226 struct sel_netnode *node;
227 struct sel_netnode *new = NULL;
228
229 spin_lock_bh(&sel_netnode_lock);
230 node = sel_netnode_find(addr, family);
231 if (node != NULL) {
232 *sid = node->nsec.sid;
233 ret = 0;
234 goto out;
235 }
236 new = kzalloc(sizeof(*new), GFP_ATOMIC);
237 if (new == NULL) {
238 ret = -ENOMEM;
239 goto out;
240 }
241 switch (family) {
242 case PF_INET:
243 ret = security_node_sid(PF_INET,
244 addr, sizeof(struct in_addr),
245 &new->nsec.sid);
246 new->nsec.addr.ipv4 = *(__be32 *)addr;
247 break;
248 case PF_INET6:
249 ret = security_node_sid(PF_INET6,
250 addr, sizeof(struct in6_addr),
251 &new->nsec.sid);
252 ipv6_addr_copy(&new->nsec.addr.ipv6, addr);
253 break;
254 default:
255 BUG();
256 }
257 if (ret != 0)
258 goto out;
259 new->nsec.family = family;
260 ret = sel_netnode_insert(new);
261 if (ret != 0)
262 goto out;
263 *sid = new->nsec.sid;
264
265out:
266 spin_unlock_bh(&sel_netnode_lock);
267 if (unlikely(ret)) {
268 printk(KERN_WARNING
269 "SELinux: failure in sel_netnode_sid_slow(),"
270 " unable to determine network node label\n");
271 kfree(new);
272 }
273 return ret;
274}
275
276/**
277 * sel_netnode_sid - Lookup the SID of a network address
278 * @addr: the IP address
279 * @family: the address family
280 * @sid: node SID
281 *
282 * Description:
283 * This function determines the SID of a network address using the fastest
284 * method possible. First the address table is queried, but if an entry
285 * can't be found then the policy is queried and the result is added to the
286 * table to speed up future queries.  Returns zero on success, negative values
287 * on failure.
288 *
289 */
290int sel_netnode_sid(void *addr, u16 family, u32 *sid)
291{
292 struct sel_netnode *node;
293
294 rcu_read_lock();
295 node = sel_netnode_find(addr, family);
296 if (node != NULL) {
297 *sid = node->nsec.sid;
298 rcu_read_unlock();
299 return 0;
300 }
301 rcu_read_unlock();
302
303 return sel_netnode_sid_slow(addr, family, sid);
304}
305
306/**
307 * sel_netnode_flush - Flush the entire network address table
308 *
309 * Description:
310 * Remove all entries from the network address table.
311 *
312 */
313static void sel_netnode_flush(void)
314{
315 u32 idx;
316 struct sel_netnode *node;
317
318 spin_lock_bh(&sel_netnode_lock);
319 for (idx = 0; idx < SEL_NETNODE_HASH_SIZE; idx++)
320 list_for_each_entry(node, &sel_netnode_hash[idx], list)
321 sel_netnode_destroy(node);
322 spin_unlock_bh(&sel_netnode_lock);
323}
324
325static int sel_netnode_avc_callback(u32 event, u32 ssid, u32 tsid,
326 u16 class, u32 perms, u32 *retained)
327{
328 if (event == AVC_CALLBACK_RESET) {
329 sel_netnode_flush();
330 synchronize_net();
331 }
332 return 0;
333}
334
335static __init int sel_netnode_init(void)
336{
337 int iter;
338 int ret;
339
340 if (!selinux_enabled)
341 return 0;
342
343 for (iter = 0; iter < SEL_NETNODE_HASH_SIZE; iter++)
344 INIT_LIST_HEAD(&sel_netnode_hash[iter]);
345
346 ret = avc_add_callback(sel_netnode_avc_callback, AVC_CALLBACK_RESET,
347 SECSID_NULL, SECSID_NULL, SECCLASS_NULL, 0);
348 if (ret != 0)
349 panic("avc_add_callback() failed, error %d\n", ret);
350
351 return ret;
352}
353
354__initcall(sel_netnode_init);
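
netnode.c above implements a two-level lookup: an RCU-protected hash table consulted first, and a spinlock-protected slow path that queries the policy with security_node_sid() and inserts the answer for next time. The sketch below models that flow in single-threaded userspace C (no RCU, no locking, no per-bucket size limit, IPv4 only); all names and the stubbed policy lookup are illustrative assumptions, not kernel interfaces.

  #include <stdint.h>
  #include <stdio.h>
  #include <stdlib.h>

  #define HASH_SIZE 256

  struct node {
  	uint32_t addr;          /* IPv4 address (the kernel also handles IPv6) */
  	uint32_t sid;           /* cached label */
  	struct node *next;
  };

  static struct node *table[HASH_SIZE];

  /* stub for security_node_sid(): pretend the policy labels everything 42 */
  static int policy_node_sid(uint32_t addr, uint32_t *sid)
  {
  	(void)addr;
  	*sid = 42;
  	return 0;
  }

  /* same bucket selection idea as sel_netnode_hashfn_ipv4() */
  static unsigned int hashfn(uint32_t addr)
  {
  	return addr & (HASH_SIZE - 1);
  }

  static struct node *find(uint32_t addr)
  {
  	struct node *n;

  	for (n = table[hashfn(addr)]; n != NULL; n = n->next)
  		if (n->addr == addr)
  			return n;
  	return NULL;
  }

  /* fast path: hash table hit; slow path: ask the policy and cache the answer */
  static int node_sid(uint32_t addr, uint32_t *sid)
  {
  	struct node *n = find(addr);

  	if (n != NULL) {
  		*sid = n->sid;
  		return 0;
  	}
  	n = calloc(1, sizeof(*n));
  	if (n == NULL)
  		return -1;
  	if (policy_node_sid(addr, &n->sid) != 0) {
  		free(n);
  		return -1;
  	}
  	n->addr = addr;
  	n->next = table[hashfn(addr)];
  	table[hashfn(addr)] = n;
  	*sid = n->sid;
  	return 0;
  }

  int main(void)
  {
  	uint32_t sid;

  	node_sid(0x0a000001, &sid);     /* miss: policy query + insert */
  	node_sid(0x0a000001, &sid);     /* hit: served from the table */
  	printf("sid=%u\n", sid);
  	return 0;
  }
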
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 397fd4955fe1..a85740530afc 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -2,6 +2,11 @@
2 * 2 *
3 * Added conditional policy language extensions 3 * Added conditional policy language extensions
4 * 4 *
5 * Updated: Hewlett-Packard <paul.moore@hp.com>
6 *
7 * Added support for the policy capability bitmap
8 *
9 * Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
5 * Copyright (C) 2003 - 2004 Tresys Technology, LLC 10 * Copyright (C) 2003 - 2004 Tresys Technology, LLC
6 * Copyright (C) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> 11 * Copyright (C) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
7 * This program is free software; you can redistribute it and/or modify 12 * This program is free software; you can redistribute it and/or modify
@@ -35,6 +40,11 @@
35#include "objsec.h" 40#include "objsec.h"
36#include "conditional.h" 41#include "conditional.h"
37 42
43/* Policy capability filenames */
44static char *policycap_names[] = {
45 "network_peer_controls"
46};
47
38unsigned int selinux_checkreqprot = CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE; 48unsigned int selinux_checkreqprot = CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE;
39 49
40#ifdef CONFIG_SECURITY_SELINUX_ENABLE_SECMARK_DEFAULT 50#ifdef CONFIG_SECURITY_SELINUX_ENABLE_SECMARK_DEFAULT
@@ -72,6 +82,9 @@ static int *bool_pending_values = NULL;
72static struct dentry *class_dir = NULL; 82static struct dentry *class_dir = NULL;
73static unsigned long last_class_ino; 83static unsigned long last_class_ino;
74 84
85/* global data for policy capabilities */
86static struct dentry *policycap_dir = NULL;
87
75extern void selnl_notify_setenforce(int val); 88extern void selnl_notify_setenforce(int val);
76 89
77/* Check whether a task is allowed to use a security operation. */ 90/* Check whether a task is allowed to use a security operation. */
@@ -111,10 +124,11 @@ enum sel_inos {
111 124
112static unsigned long sel_last_ino = SEL_INO_NEXT - 1; 125static unsigned long sel_last_ino = SEL_INO_NEXT - 1;
113 126
114#define SEL_INITCON_INO_OFFSET 0x01000000 127#define SEL_INITCON_INO_OFFSET 0x01000000
115#define SEL_BOOL_INO_OFFSET 0x02000000 128#define SEL_BOOL_INO_OFFSET 0x02000000
116#define SEL_CLASS_INO_OFFSET 0x04000000 129#define SEL_CLASS_INO_OFFSET 0x04000000
117#define SEL_INO_MASK 0x00ffffff 130#define SEL_POLICYCAP_INO_OFFSET 0x08000000
131#define SEL_INO_MASK 0x00ffffff
118 132
119#define TMPBUFLEN 12 133#define TMPBUFLEN 12
120static ssize_t sel_read_enforce(struct file *filp, char __user *buf, 134static ssize_t sel_read_enforce(struct file *filp, char __user *buf,
@@ -263,6 +277,7 @@ static const struct file_operations sel_policyvers_ops = {
263/* declaration for sel_write_load */ 277/* declaration for sel_write_load */
264static int sel_make_bools(void); 278static int sel_make_bools(void);
265static int sel_make_classes(void); 279static int sel_make_classes(void);
280static int sel_make_policycap(void);
266 281
267/* declaration for sel_make_class_dirs */ 282/* declaration for sel_make_class_dirs */
268static int sel_make_dir(struct inode *dir, struct dentry *dentry, 283static int sel_make_dir(struct inode *dir, struct dentry *dentry,
@@ -323,6 +338,12 @@ static ssize_t sel_write_load(struct file * file, const char __user * buf,
323 } 338 }
324 339
325 ret = sel_make_classes(); 340 ret = sel_make_classes();
341 if (ret) {
342 length = ret;
343 goto out1;
344 }
345
346 ret = sel_make_policycap();
326 if (ret) 347 if (ret)
327 length = ret; 348 length = ret;
328 else 349 else
@@ -1399,6 +1420,24 @@ static const struct file_operations sel_perm_ops = {
1399 .read = sel_read_perm, 1420 .read = sel_read_perm,
1400}; 1421};
1401 1422
1423static ssize_t sel_read_policycap(struct file *file, char __user *buf,
1424 size_t count, loff_t *ppos)
1425{
1426 int value;
1427 char tmpbuf[TMPBUFLEN];
1428 ssize_t length;
1429 unsigned long i_ino = file->f_path.dentry->d_inode->i_ino;
1430
1431 value = security_policycap_supported(i_ino & SEL_INO_MASK);
1432 length = scnprintf(tmpbuf, TMPBUFLEN, "%d", value);
1433
1434 return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
1435}
1436
1437static const struct file_operations sel_policycap_ops = {
1438 .read = sel_read_policycap,
1439};
1440
1402static int sel_make_perm_files(char *objclass, int classvalue, 1441static int sel_make_perm_files(char *objclass, int classvalue,
1403 struct dentry *dir) 1442 struct dentry *dir)
1404{ 1443{
@@ -1545,6 +1584,36 @@ out:
1545 return rc; 1584 return rc;
1546} 1585}
1547 1586
1587static int sel_make_policycap(void)
1588{
1589 unsigned int iter;
1590 struct dentry *dentry = NULL;
1591 struct inode *inode = NULL;
1592
1593 sel_remove_entries(policycap_dir);
1594
1595 for (iter = 0; iter <= POLICYDB_CAPABILITY_MAX; iter++) {
1596 if (iter < ARRAY_SIZE(policycap_names))
1597 dentry = d_alloc_name(policycap_dir,
1598 policycap_names[iter]);
1599 else
1600 dentry = d_alloc_name(policycap_dir, "unknown");
1601
1602 if (dentry == NULL)
1603 return -ENOMEM;
1604
1605 inode = sel_make_inode(policycap_dir->d_sb, S_IFREG | S_IRUGO);
1606 if (inode == NULL)
1607 return -ENOMEM;
1608
1609 inode->i_fop = &sel_policycap_ops;
1610 inode->i_ino = iter | SEL_POLICYCAP_INO_OFFSET;
1611 d_add(dentry, inode);
1612 }
1613
1614 return 0;
1615}
1616
1548static int sel_make_dir(struct inode *dir, struct dentry *dentry, 1617static int sel_make_dir(struct inode *dir, struct dentry *dentry,
1549 unsigned long *ino) 1618 unsigned long *ino)
1550{ 1619{
@@ -1673,6 +1742,18 @@ static int sel_fill_super(struct super_block * sb, void * data, int silent)
1673 1742
1674 class_dir = dentry; 1743 class_dir = dentry;
1675 1744
1745 dentry = d_alloc_name(sb->s_root, "policy_capabilities");
1746 if (!dentry) {
1747 ret = -ENOMEM;
1748 goto err;
1749 }
1750
1751 ret = sel_make_dir(root_inode, dentry, &sel_last_ino);
1752 if (ret)
1753 goto err;
1754
1755 policycap_dir = dentry;
1756
1676out: 1757out:
1677 return ret; 1758 return ret;
1678err: 1759err:
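
With the selinuxfs changes above, each policy capability becomes a read-only file under the new policy_capabilities directory whose content is the 0/1 result of security_policycap_supported(). A minimal userspace check might look like the sketch below; it assumes selinuxfs is mounted at /selinux (the traditional location for kernels of this era), which is an assumption rather than something guaranteed by the patch.

  #include <stdio.h>

  int main(void)
  {
  	char buf[16];
  	FILE *f;

  	/* assumed mount point; later kernels use /sys/fs/selinux instead */
  	f = fopen("/selinux/policy_capabilities/network_peer_controls", "r");
  	if (f == NULL) {
  		perror("fopen");
  		return 1;
  	}
  	if (fgets(buf, sizeof(buf), f) != NULL)
  		printf("network_peer_controls: %s\n", buf);
  	fclose(f);
  	return 0;
  }
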
diff --git a/security/selinux/ss/mls.c b/security/selinux/ss/mls.c
index 3bbcb5369af9..feaf0a5b828f 100644
--- a/security/selinux/ss/mls.c
+++ b/security/selinux/ss/mls.c
@@ -562,7 +562,7 @@ void mls_export_netlbl_lvl(struct context *context,
562 if (!selinux_mls_enabled) 562 if (!selinux_mls_enabled)
563 return; 563 return;
564 564
565 secattr->mls_lvl = context->range.level[0].sens - 1; 565 secattr->attr.mls.lvl = context->range.level[0].sens - 1;
566 secattr->flags |= NETLBL_SECATTR_MLS_LVL; 566 secattr->flags |= NETLBL_SECATTR_MLS_LVL;
567} 567}
568 568
@@ -582,7 +582,7 @@ void mls_import_netlbl_lvl(struct context *context,
582 if (!selinux_mls_enabled) 582 if (!selinux_mls_enabled)
583 return; 583 return;
584 584
585 context->range.level[0].sens = secattr->mls_lvl + 1; 585 context->range.level[0].sens = secattr->attr.mls.lvl + 1;
586 context->range.level[1].sens = context->range.level[0].sens; 586 context->range.level[1].sens = context->range.level[0].sens;
587} 587}
588 588
@@ -605,8 +605,8 @@ int mls_export_netlbl_cat(struct context *context,
605 return 0; 605 return 0;
606 606
607 rc = ebitmap_netlbl_export(&context->range.level[0].cat, 607 rc = ebitmap_netlbl_export(&context->range.level[0].cat,
608 &secattr->mls_cat); 608 &secattr->attr.mls.cat);
609 if (rc == 0 && secattr->mls_cat != NULL) 609 if (rc == 0 && secattr->attr.mls.cat != NULL)
610 secattr->flags |= NETLBL_SECATTR_MLS_CAT; 610 secattr->flags |= NETLBL_SECATTR_MLS_CAT;
611 611
612 return rc; 612 return rc;
@@ -633,7 +633,7 @@ int mls_import_netlbl_cat(struct context *context,
633 return 0; 633 return 0;
634 634
635 rc = ebitmap_netlbl_import(&context->range.level[0].cat, 635 rc = ebitmap_netlbl_import(&context->range.level[0].cat,
636 secattr->mls_cat); 636 secattr->attr.mls.cat);
637 if (rc != 0) 637 if (rc != 0)
638 goto import_netlbl_cat_failure; 638 goto import_netlbl_cat_failure;
639 639
diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c
index b582aae3c62c..bd7d6a00342d 100644
--- a/security/selinux/ss/policydb.c
+++ b/security/selinux/ss/policydb.c
@@ -13,6 +13,11 @@
13 * 13 *
14 * Added conditional policy language extensions 14 * Added conditional policy language extensions
15 * 15 *
16 * Updated: Hewlett-Packard <paul.moore@hp.com>
17 *
18 * Added support for the policy capability bitmap
19 *
20 * Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
16 * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc. 21 * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc.
17 * Copyright (C) 2003 - 2004 Tresys Technology, LLC 22 * Copyright (C) 2003 - 2004 Tresys Technology, LLC
18 * This program is free software; you can redistribute it and/or modify 23 * This program is free software; you can redistribute it and/or modify
@@ -102,6 +107,11 @@ static struct policydb_compat_info policydb_compat[] = {
102 .sym_num = SYM_NUM, 107 .sym_num = SYM_NUM,
103 .ocon_num = OCON_NUM, 108 .ocon_num = OCON_NUM,
104 }, 109 },
110 {
111 .version = POLICYDB_VERSION_POLCAP,
112 .sym_num = SYM_NUM,
113 .ocon_num = OCON_NUM,
114 }
105}; 115};
106 116
107static struct policydb_compat_info *policydb_lookup_compat(int version) 117static struct policydb_compat_info *policydb_lookup_compat(int version)
@@ -183,6 +193,8 @@ static int policydb_init(struct policydb *p)
183 if (rc) 193 if (rc)
184 goto out_free_symtab; 194 goto out_free_symtab;
185 195
196 ebitmap_init(&p->policycaps);
197
186out: 198out:
187 return rc; 199 return rc;
188 200
@@ -673,8 +685,8 @@ void policydb_destroy(struct policydb *p)
673 ebitmap_destroy(&p->type_attr_map[i]); 685 ebitmap_destroy(&p->type_attr_map[i]);
674 } 686 }
675 kfree(p->type_attr_map); 687 kfree(p->type_attr_map);
676
677 kfree(p->undefined_perms); 688 kfree(p->undefined_perms);
689 ebitmap_destroy(&p->policycaps);
678 690
679 return; 691 return;
680} 692}
@@ -1554,6 +1566,10 @@ int policydb_read(struct policydb *p, void *fp)
1554 p->reject_unknown = !!(le32_to_cpu(buf[1]) & REJECT_UNKNOWN); 1566 p->reject_unknown = !!(le32_to_cpu(buf[1]) & REJECT_UNKNOWN);
1555 p->allow_unknown = !!(le32_to_cpu(buf[1]) & ALLOW_UNKNOWN); 1567 p->allow_unknown = !!(le32_to_cpu(buf[1]) & ALLOW_UNKNOWN);
1556 1568
1569 if (p->policyvers >= POLICYDB_VERSION_POLCAP &&
1570 ebitmap_read(&p->policycaps, fp) != 0)
1571 goto bad;
1572
1557 info = policydb_lookup_compat(p->policyvers); 1573 info = policydb_lookup_compat(p->policyvers);
1558 if (!info) { 1574 if (!info) {
1559 printk(KERN_ERR "security: unable to find policy compat info " 1575 printk(KERN_ERR "security: unable to find policy compat info "
diff --git a/security/selinux/ss/policydb.h b/security/selinux/ss/policydb.h
index ed6fc687c66f..c4ce996e202c 100644
--- a/security/selinux/ss/policydb.h
+++ b/security/selinux/ss/policydb.h
@@ -241,6 +241,8 @@ struct policydb {
241 /* type -> attribute reverse mapping */ 241 /* type -> attribute reverse mapping */
242 struct ebitmap *type_attr_map; 242 struct ebitmap *type_attr_map;
243 243
244 struct ebitmap policycaps;
245
244 unsigned int policyvers; 246 unsigned int policyvers;
245 247
246 unsigned int reject_unknown : 1; 248 unsigned int reject_unknown : 1;
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index 4bf715d4cf29..f96dec1f9258 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -16,12 +16,13 @@
16 * Updated: Hewlett-Packard <paul.moore@hp.com> 16 * Updated: Hewlett-Packard <paul.moore@hp.com>
17 * 17 *
18 * Added support for NetLabel 18 * Added support for NetLabel
19 * Added support for the policy capability bitmap
19 * 20 *
20 * Updated: Chad Sellers <csellers@tresys.com> 21 * Updated: Chad Sellers <csellers@tresys.com>
21 * 22 *
22 * Added validation of kernel classes and permissions 23 * Added validation of kernel classes and permissions
23 * 24 *
24 * Copyright (C) 2006 Hewlett-Packard Development Company, L.P. 25 * Copyright (C) 2006, 2007 Hewlett-Packard Development Company, L.P.
25 * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc. 26 * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc.
26 * Copyright (C) 2003 - 2004, 2006 Tresys Technology, LLC 27 * Copyright (C) 2003 - 2004, 2006 Tresys Technology, LLC
27 * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com> 28 * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
@@ -59,6 +60,8 @@
59extern void selnl_notify_policyload(u32 seqno); 60extern void selnl_notify_policyload(u32 seqno);
60unsigned int policydb_loaded_version; 61unsigned int policydb_loaded_version;
61 62
63int selinux_policycap_netpeer;
64
62/* 65/*
63 * This is declared in avc.c 66 * This is declared in avc.c
64 */ 67 */
@@ -1299,6 +1302,12 @@ bad:
1299 goto out; 1302 goto out;
1300} 1303}
1301 1304
1305static void security_load_policycaps(void)
1306{
1307 selinux_policycap_netpeer = ebitmap_get_bit(&policydb.policycaps,
1308 POLICYDB_CAPABILITY_NETPEER);
1309}
1310
1302extern void selinux_complete_init(void); 1311extern void selinux_complete_init(void);
1303static int security_preserve_bools(struct policydb *p); 1312static int security_preserve_bools(struct policydb *p);
1304 1313
@@ -1346,6 +1355,7 @@ int security_load_policy(void *data, size_t len)
1346 avtab_cache_destroy(); 1355 avtab_cache_destroy();
1347 return -EINVAL; 1356 return -EINVAL;
1348 } 1357 }
1358 security_load_policycaps();
1349 policydb_loaded_version = policydb.policyvers; 1359 policydb_loaded_version = policydb.policyvers;
1350 ss_initialized = 1; 1360 ss_initialized = 1;
1351 seqno = ++latest_granting; 1361 seqno = ++latest_granting;
@@ -1404,6 +1414,7 @@ int security_load_policy(void *data, size_t len)
1404 POLICY_WRLOCK; 1414 POLICY_WRLOCK;
1405 memcpy(&policydb, &newpolicydb, sizeof policydb); 1415 memcpy(&policydb, &newpolicydb, sizeof policydb);
1406 sidtab_set(&sidtab, &newsidtab); 1416 sidtab_set(&sidtab, &newsidtab);
1417 security_load_policycaps();
1407 seqno = ++latest_granting; 1418 seqno = ++latest_granting;
1408 policydb_loaded_version = policydb.policyvers; 1419 policydb_loaded_version = policydb.policyvers;
1409 POLICY_WRUNLOCK; 1420 POLICY_WRUNLOCK;
@@ -1478,11 +1489,8 @@ out:
1478 * security_netif_sid - Obtain the SID for a network interface. 1489 * security_netif_sid - Obtain the SID for a network interface.
1479 * @name: interface name 1490 * @name: interface name
1480 * @if_sid: interface SID 1491 * @if_sid: interface SID
1481 * @msg_sid: default SID for received packets
1482 */ 1492 */
1483int security_netif_sid(char *name, 1493int security_netif_sid(char *name, u32 *if_sid)
1484 u32 *if_sid,
1485 u32 *msg_sid)
1486{ 1494{
1487 int rc = 0; 1495 int rc = 0;
1488 struct ocontext *c; 1496 struct ocontext *c;
@@ -1510,11 +1518,8 @@ int security_netif_sid(char *name,
1510 goto out; 1518 goto out;
1511 } 1519 }
1512 *if_sid = c->sid[0]; 1520 *if_sid = c->sid[0];
1513 *msg_sid = c->sid[1]; 1521 } else
1514 } else {
1515 *if_sid = SECINITSID_NETIF; 1522 *if_sid = SECINITSID_NETIF;
1516 *msg_sid = SECINITSID_NETMSG;
1517 }
1518 1523
1519out: 1524out:
1520 POLICY_RDUNLOCK; 1525 POLICY_RDUNLOCK;
@@ -2049,6 +2054,91 @@ out:
2049 return rc; 2054 return rc;
2050} 2055}
2051 2056
2057/**
2058 * security_net_peersid_resolve - Compare and resolve two network peer SIDs
2059 * @nlbl_sid: NetLabel SID
2060 * @nlbl_type: NetLabel labeling protocol type
2061 * @xfrm_sid: XFRM SID
2062 *
2063 * Description:
2064 * Compare the @nlbl_sid and @xfrm_sid values and if the two SIDs can be
2065 * resolved into a single SID it is returned via @peer_sid and the function
2066 * returns zero. Otherwise @peer_sid is set to SECSID_NULL and the function
2067 * returns a negative value. A table summarizing the behavior is below:
2068 *
2069 *                               | function return |      @peer_sid
2070 * ------------------------------+-----------------+-----------------
2071 * no peer labels | 0 | SECSID_NULL
2072 * single peer label | 0 | <peer_label>
2073 * multiple, consistent labels | 0 | <peer_label>
2074 * multiple, inconsistent labels | -<errno> | SECSID_NULL
2075 *
2076 */
2077int security_net_peersid_resolve(u32 nlbl_sid, u32 nlbl_type,
2078 u32 xfrm_sid,
2079 u32 *peer_sid)
2080{
2081 int rc;
2082 struct context *nlbl_ctx;
2083 struct context *xfrm_ctx;
2084
2085 /* handle the common (which also happens to be the set of easy) cases
2086 * right away, these two if statements catch everything involving a
2087 * single or absent peer SID/label */
2088 if (xfrm_sid == SECSID_NULL) {
2089 *peer_sid = nlbl_sid;
2090 return 0;
2091 }
2092 /* NOTE: an nlbl_type == NETLBL_NLTYPE_UNLABELED is a "fallback" label
2093 * and is treated as if nlbl_sid == SECSID_NULL when a XFRM SID/label
2094 * is present */
2095 if (nlbl_sid == SECSID_NULL || nlbl_type == NETLBL_NLTYPE_UNLABELED) {
2096 *peer_sid = xfrm_sid;
2097 return 0;
2098 }
2099
2100 /* we don't need to check ss_initialized here since the only way both
2101 * nlbl_sid and xfrm_sid are not equal to SECSID_NULL would be if the
2102 * security server was initialized and ss_initialized was true */
2103 if (!selinux_mls_enabled) {
2104 *peer_sid = SECSID_NULL;
2105 return 0;
2106 }
2107
2108 POLICY_RDLOCK;
2109
2110 nlbl_ctx = sidtab_search(&sidtab, nlbl_sid);
2111 if (!nlbl_ctx) {
2112 printk(KERN_ERR
2113 "security_sid_mls_cmp: unrecognized SID %d\n",
2114 nlbl_sid);
2115 rc = -EINVAL;
2116 goto out_slowpath;
2117 }
2118 xfrm_ctx = sidtab_search(&sidtab, xfrm_sid);
2119 if (!xfrm_ctx) {
2120 printk(KERN_ERR
2121 "security_sid_mls_cmp: unrecognized SID %d\n",
2122 xfrm_sid);
2123 rc = -EINVAL;
2124 goto out_slowpath;
2125 }
2126 rc = (mls_context_cmp(nlbl_ctx, xfrm_ctx) ? 0 : -EACCES);
2127
2128out_slowpath:
2129 POLICY_RDUNLOCK;
2130 if (rc == 0)
2131 /* at present NetLabel SIDs/labels really only carry MLS
2132 * information so if the MLS portion of the NetLabel SID
2133 * matches the MLS portion of the labeled XFRM SID/label
2134 * then pass along the XFRM SID as it is the most
2135 * expressive */
2136 *peer_sid = xfrm_sid;
2137 else
2138 *peer_sid = SECSID_NULL;
2139 return rc;
2140}
2141
2052static int get_classes_callback(void *k, void *d, void *args) 2142static int get_classes_callback(void *k, void *d, void *args)
2053{ 2143{
2054 struct class_datum *datum = d; 2144 struct class_datum *datum = d;
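
security_net_peersid_resolve() above encodes the precedence described in its comment block: an absent XFRM label defers to NetLabel, an absent or fallback NetLabel label defers to XFRM, and when both are present their MLS attributes must agree, in which case the more expressive XFRM SID wins. A compilable userspace model of that precedence, with the MLS comparison stubbed out and stand-in constants, is sketched here.

  #include <stdio.h>

  #define SECSID_NULL    0u
  #define TYPE_UNLABELED 1u   /* stand-in for NETLBL_NLTYPE_UNLABELED */

  /* stub: pretend the MLS ranges of the two labels always match */
  static int mls_equal(unsigned int a, unsigned int b)
  {
  	(void)a; (void)b;
  	return 1;
  }

  static int peersid_resolve(unsigned int nlbl_sid, unsigned int nlbl_type,
  			   unsigned int xfrm_sid, unsigned int *peer_sid)
  {
  	if (xfrm_sid == SECSID_NULL) {          /* only a NetLabel label (or none) */
  		*peer_sid = nlbl_sid;
  		return 0;
  	}
  	if (nlbl_sid == SECSID_NULL || nlbl_type == TYPE_UNLABELED) {
  		*peer_sid = xfrm_sid;           /* only an XFRM label */
  		return 0;
  	}
  	if (mls_equal(nlbl_sid, xfrm_sid)) {    /* both present and consistent */
  		*peer_sid = xfrm_sid;           /* XFRM label is the more expressive */
  		return 0;
  	}
  	*peer_sid = SECSID_NULL;                /* inconsistent labels */
  	return -1;
  }

  int main(void)
  {
  	unsigned int sid;
  	int rc = peersid_resolve(0, 0, 7, &sid);

  	printf("rc=%d sid=%u\n", rc, sid);
  	return 0;
  }
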
@@ -2154,6 +2244,60 @@ int security_get_allow_unknown(void)
2154 return policydb.allow_unknown; 2244 return policydb.allow_unknown;
2155} 2245}
2156 2246
2247/**
2248 * security_get_policycaps - Query the loaded policy for its capabilities
2249 * @len: the number of capability bits
2250 * @values: the capability bit array
2251 *
2252 * Description:
2253 * Get an array of the policy capabilities in @values where each entry in
2254 * @values is either true (1) or false (0) depending on the policy's support of
2255 * that feature. The policy capabilities are defined by the
2256 * POLICYDB_CAPABILITY_* enums. The size of the array is stored in @len and it
2257 * is up to the caller to free the array in @values. Returns zero on success,
2258 * negative values on failure.
2259 *
2260 */
2261int security_get_policycaps(int *len, int **values)
2262{
2263 int rc = -ENOMEM;
2264 unsigned int iter;
2265
2266 POLICY_RDLOCK;
2267
2268 *values = kcalloc(POLICYDB_CAPABILITY_MAX, sizeof(int), GFP_ATOMIC);
2269 if (*values == NULL)
2270 goto out;
2271 for (iter = 0; iter < POLICYDB_CAPABILITY_MAX; iter++)
2272 (*values)[iter] = ebitmap_get_bit(&policydb.policycaps, iter);
2273	*len = POLICYDB_CAPABILITY_MAX;
2274	rc = 0;
2275out:
2276 POLICY_RDUNLOCK;
2277 return rc;
2278}
2279
2280/**
2281 * security_policycap_supported - Check for a specific policy capability
2282 * @req_cap: capability
2283 *
2284 * Description:
2285 * This function queries the currently loaded policy to see if it supports the
2286 * capability specified by @req_cap. Returns true (1) if the capability is
2287 * supported, false (0) if it isn't supported.
2288 *
2289 */
2290int security_policycap_supported(unsigned int req_cap)
2291{
2292 int rc;
2293
2294 POLICY_RDLOCK;
2295 rc = ebitmap_get_bit(&policydb.policycaps, req_cap);
2296 POLICY_RDUNLOCK;
2297
2298 return rc;
2299}
2300
2157struct selinux_audit_rule { 2301struct selinux_audit_rule {
2158 u32 au_seqno; 2302 u32 au_seqno;
2159 struct context au_ctxt; 2303 struct context au_ctxt;
@@ -2403,50 +2547,10 @@ void selinux_audit_set_callback(int (*callback)(void))
2403} 2547}
2404 2548
2405#ifdef CONFIG_NETLABEL 2549#ifdef CONFIG_NETLABEL
2406/*
2407 * NetLabel cache structure
2408 */
2409#define NETLBL_CACHE(x) ((struct selinux_netlbl_cache *)(x))
2410#define NETLBL_CACHE_T_NONE 0
2411#define NETLBL_CACHE_T_SID 1
2412#define NETLBL_CACHE_T_MLS 2
2413struct selinux_netlbl_cache {
2414 u32 type;
2415 union {
2416 u32 sid;
2417 struct mls_range mls_label;
2418 } data;
2419};
2420
2421/**
2422 * security_netlbl_cache_free - Free the NetLabel cached data
2423 * @data: the data to free
2424 *
2425 * Description:
2426 * This function is intended to be used as the free() callback inside the
2427 * netlbl_lsm_cache structure.
2428 *
2429 */
2430static void security_netlbl_cache_free(const void *data)
2431{
2432 struct selinux_netlbl_cache *cache;
2433
2434 if (data == NULL)
2435 return;
2436
2437 cache = NETLBL_CACHE(data);
2438 switch (cache->type) {
2439 case NETLBL_CACHE_T_MLS:
2440 ebitmap_destroy(&cache->data.mls_label.level[0].cat);
2441 break;
2442 }
2443 kfree(data);
2444}
2445
2446/** 2550/**
2447 * security_netlbl_cache_add - Add an entry to the NetLabel cache 2551 * security_netlbl_cache_add - Add an entry to the NetLabel cache
2448 * @secattr: the NetLabel packet security attributes 2552 * @secattr: the NetLabel packet security attributes
2449 * @ctx: the SELinux context 2553 * @sid: the SELinux SID
2450 * 2554 *
2451 * Description: 2555 * Description:
2452 * Attempt to cache the context in @ctx, which was derived from the packet in 2556 * Attempt to cache the context in @ctx, which was derived from the packet in
@@ -2455,60 +2559,46 @@ static void security_netlbl_cache_free(const void *data)
2455 * 2559 *
2456 */ 2560 */
2457static void security_netlbl_cache_add(struct netlbl_lsm_secattr *secattr, 2561static void security_netlbl_cache_add(struct netlbl_lsm_secattr *secattr,
2458 struct context *ctx) 2562 u32 sid)
2459{ 2563{
2460 struct selinux_netlbl_cache *cache = NULL; 2564 u32 *sid_cache;
2461 2565
2462 secattr->cache = netlbl_secattr_cache_alloc(GFP_ATOMIC); 2566 sid_cache = kmalloc(sizeof(*sid_cache), GFP_ATOMIC);
2463 if (secattr->cache == NULL) 2567 if (sid_cache == NULL)
2464 return;
2465
2466 cache = kzalloc(sizeof(*cache), GFP_ATOMIC);
2467 if (cache == NULL)
2468 return; 2568 return;
2469 2569 secattr->cache = netlbl_secattr_cache_alloc(GFP_ATOMIC);
2470 cache->type = NETLBL_CACHE_T_MLS; 2570 if (secattr->cache == NULL) {
2471 if (ebitmap_cpy(&cache->data.mls_label.level[0].cat, 2571 kfree(sid_cache);
2472 &ctx->range.level[0].cat) != 0) {
2473 kfree(cache);
2474 return; 2572 return;
2475 } 2573 }
2476 cache->data.mls_label.level[1].cat.highbit =
2477 cache->data.mls_label.level[0].cat.highbit;
2478 cache->data.mls_label.level[1].cat.node =
2479 cache->data.mls_label.level[0].cat.node;
2480 cache->data.mls_label.level[0].sens = ctx->range.level[0].sens;
2481 cache->data.mls_label.level[1].sens = ctx->range.level[0].sens;
2482 2574
2483 secattr->cache->free = security_netlbl_cache_free; 2575 *sid_cache = sid;
2484 secattr->cache->data = (void *)cache; 2576 secattr->cache->free = kfree;
2577 secattr->cache->data = sid_cache;
2485 secattr->flags |= NETLBL_SECATTR_CACHE; 2578 secattr->flags |= NETLBL_SECATTR_CACHE;
2486} 2579}
2487 2580
2488/** 2581/**
2489 * security_netlbl_secattr_to_sid - Convert a NetLabel secattr to a SELinux SID 2582 * security_netlbl_secattr_to_sid - Convert a NetLabel secattr to a SELinux SID
2490 * @secattr: the NetLabel packet security attributes 2583 * @secattr: the NetLabel packet security attributes
2491 * @base_sid: the SELinux SID to use as a context for MLS only attributes
2492 * @sid: the SELinux SID 2584 * @sid: the SELinux SID
2493 * 2585 *
2494 * Description: 2586 * Description:
2495 * Convert the given NetLabel security attributes in @secattr into a 2587 * Convert the given NetLabel security attributes in @secattr into a
2496 * SELinux SID. If the @secattr field does not contain a full SELinux 2588 * SELinux SID. If the @secattr field does not contain a full SELinux
2497 * SID/context then use the context in @base_sid as the foundation.  If 2589 * SID/context then use SECINITSID_NETMSG as the foundation.  If possible the
2498 * possible the 'cache' field of @secattr is set and the CACHE flag is set; 2590 * 'cache' field of @secattr is set and the CACHE flag is set; this is to
2499 * this is to allow the @secattr to be used by NetLabel to cache the secattr to 2591 * allow the @secattr to be used by NetLabel to cache the secattr to SID
2500 * SID conversion for future lookups. Returns zero on success, negative 2592 * conversion for future lookups. Returns zero on success, negative values on
2501 * values on failure. 2593 * failure.
2502 * 2594 *
2503 */ 2595 */
2504int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, 2596int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr,
2505 u32 base_sid,
2506 u32 *sid) 2597 u32 *sid)
2507{ 2598{
2508 int rc = -EIDRM; 2599 int rc = -EIDRM;
2509 struct context *ctx; 2600 struct context *ctx;
2510 struct context ctx_new; 2601 struct context ctx_new;
2511 struct selinux_netlbl_cache *cache;
2512 2602
2513 if (!ss_initialized) { 2603 if (!ss_initialized) {
2514 *sid = SECSID_NULL; 2604 *sid = SECSID_NULL;
@@ -2518,40 +2608,13 @@ int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr,
2518 POLICY_RDLOCK; 2608 POLICY_RDLOCK;
2519 2609
2520 if (secattr->flags & NETLBL_SECATTR_CACHE) { 2610 if (secattr->flags & NETLBL_SECATTR_CACHE) {
2521 cache = NETLBL_CACHE(secattr->cache->data); 2611 *sid = *(u32 *)secattr->cache->data;
2522 switch (cache->type) { 2612 rc = 0;
2523 case NETLBL_CACHE_T_SID: 2613 } else if (secattr->flags & NETLBL_SECATTR_SECID) {
2524 *sid = cache->data.sid; 2614 *sid = secattr->attr.secid;
2525 rc = 0; 2615 rc = 0;
2526 break;
2527 case NETLBL_CACHE_T_MLS:
2528 ctx = sidtab_search(&sidtab, base_sid);
2529 if (ctx == NULL)
2530 goto netlbl_secattr_to_sid_return;
2531
2532 ctx_new.user = ctx->user;
2533 ctx_new.role = ctx->role;
2534 ctx_new.type = ctx->type;
2535 ctx_new.range.level[0].sens =
2536 cache->data.mls_label.level[0].sens;
2537 ctx_new.range.level[0].cat.highbit =
2538 cache->data.mls_label.level[0].cat.highbit;
2539 ctx_new.range.level[0].cat.node =
2540 cache->data.mls_label.level[0].cat.node;
2541 ctx_new.range.level[1].sens =
2542 cache->data.mls_label.level[1].sens;
2543 ctx_new.range.level[1].cat.highbit =
2544 cache->data.mls_label.level[1].cat.highbit;
2545 ctx_new.range.level[1].cat.node =
2546 cache->data.mls_label.level[1].cat.node;
2547
2548 rc = sidtab_context_to_sid(&sidtab, &ctx_new, sid);
2549 break;
2550 default:
2551 goto netlbl_secattr_to_sid_return;
2552 }
2553 } else if (secattr->flags & NETLBL_SECATTR_MLS_LVL) { 2616 } else if (secattr->flags & NETLBL_SECATTR_MLS_LVL) {
2554 ctx = sidtab_search(&sidtab, base_sid); 2617 ctx = sidtab_search(&sidtab, SECINITSID_NETMSG);
2555 if (ctx == NULL) 2618 if (ctx == NULL)
2556 goto netlbl_secattr_to_sid_return; 2619 goto netlbl_secattr_to_sid_return;
2557 2620
@@ -2561,7 +2624,7 @@ int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr,
2561 mls_import_netlbl_lvl(&ctx_new, secattr); 2624 mls_import_netlbl_lvl(&ctx_new, secattr);
2562 if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { 2625 if (secattr->flags & NETLBL_SECATTR_MLS_CAT) {
2563 if (ebitmap_netlbl_import(&ctx_new.range.level[0].cat, 2626 if (ebitmap_netlbl_import(&ctx_new.range.level[0].cat,
2564 secattr->mls_cat) != 0) 2627 secattr->attr.mls.cat) != 0)
2565 goto netlbl_secattr_to_sid_return; 2628 goto netlbl_secattr_to_sid_return;
2566 ctx_new.range.level[1].cat.highbit = 2629 ctx_new.range.level[1].cat.highbit =
2567 ctx_new.range.level[0].cat.highbit; 2630 ctx_new.range.level[0].cat.highbit;
@@ -2578,7 +2641,7 @@ int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr,
2578 if (rc != 0) 2641 if (rc != 0)
2579 goto netlbl_secattr_to_sid_return_cleanup; 2642 goto netlbl_secattr_to_sid_return_cleanup;
2580 2643
2581 security_netlbl_cache_add(secattr, &ctx_new); 2644 security_netlbl_cache_add(secattr, *sid);
2582 2645
2583 ebitmap_destroy(&ctx_new.range.level[0].cat); 2646 ebitmap_destroy(&ctx_new.range.level[0].cat);
2584 } else { 2647 } else {
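
The services.c hunks above shrink the NetLabel cache entry from a copied MLS range to a single u32 SID, which lets the cache destructor simply be kfree(). The userspace sketch below models that idea with malloc()/free(); the struct is an illustrative stand-in, not the real netlbl_lsm_cache.

  #include <stdint.h>
  #include <stdio.h>
  #include <stdlib.h>

  /* minimal model of the cache idea: opaque data plus a destructor; caching a
   * bare u32 SID means the destructor can be plain free() instead of a
   * callback that has to tear down a copied MLS structure */
  struct cache {
  	void (*free_fn)(void *data);
  	void *data;
  };

  static int cache_sid(struct cache *c, uint32_t sid)
  {
  	uint32_t *p = malloc(sizeof(*p));

  	if (p == NULL)
  		return -1;
  	*p = sid;
  	c->data = p;
  	c->free_fn = free;
  	return 0;
  }

  int main(void)
  {
  	struct cache c;

  	if (cache_sid(&c, 1234) == 0) {
  		printf("cached sid %u\n", *(uint32_t *)c.data);
  		c.free_fn(c.data);
  	}
  	return 0;
  }
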
diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c
index e07603969033..7e158205d081 100644
--- a/security/selinux/xfrm.c
+++ b/security/selinux/xfrm.c
@@ -46,11 +46,14 @@
46#include <net/checksum.h> 46#include <net/checksum.h>
47#include <net/udp.h> 47#include <net/udp.h>
48#include <asm/semaphore.h> 48#include <asm/semaphore.h>
49#include <asm/atomic.h>
49 50
50#include "avc.h" 51#include "avc.h"
51#include "objsec.h" 52#include "objsec.h"
52#include "xfrm.h" 53#include "xfrm.h"
53 54
55/* Labeled XFRM instance counter */
56atomic_t selinux_xfrm_refcount = ATOMIC_INIT(0);
54 57
55/* 58/*
56 * Returns true if an LSM/SELinux context 59 * Returns true if an LSM/SELinux context
@@ -293,6 +296,9 @@ int selinux_xfrm_policy_alloc(struct xfrm_policy *xp,
293 BUG_ON(!uctx); 296 BUG_ON(!uctx);
294 297
295 err = selinux_xfrm_sec_ctx_alloc(&xp->security, uctx, 0); 298 err = selinux_xfrm_sec_ctx_alloc(&xp->security, uctx, 0);
299 if (err == 0)
300 atomic_inc(&selinux_xfrm_refcount);
301
296 return err; 302 return err;
297} 303}
298 304
@@ -340,10 +346,13 @@ int selinux_xfrm_policy_delete(struct xfrm_policy *xp)
340 struct xfrm_sec_ctx *ctx = xp->security; 346 struct xfrm_sec_ctx *ctx = xp->security;
341 int rc = 0; 347 int rc = 0;
342 348
343 if (ctx) 349 if (ctx) {
344 rc = avc_has_perm(tsec->sid, ctx->ctx_sid, 350 rc = avc_has_perm(tsec->sid, ctx->ctx_sid,
345 SECCLASS_ASSOCIATION, 351 SECCLASS_ASSOCIATION,
346 ASSOCIATION__SETCONTEXT, NULL); 352 ASSOCIATION__SETCONTEXT, NULL);
353 if (rc == 0)
354 atomic_dec(&selinux_xfrm_refcount);
355 }
347 356
348 return rc; 357 return rc;
349} 358}
@@ -360,6 +369,8 @@ int selinux_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *uct
360 BUG_ON(!x); 369 BUG_ON(!x);
361 370
362 err = selinux_xfrm_sec_ctx_alloc(&x->security, uctx, secid); 371 err = selinux_xfrm_sec_ctx_alloc(&x->security, uctx, secid);
372 if (err == 0)
373 atomic_inc(&selinux_xfrm_refcount);
363 return err; 374 return err;
364} 375}
365 376
@@ -382,10 +393,13 @@ int selinux_xfrm_state_delete(struct xfrm_state *x)
382 struct xfrm_sec_ctx *ctx = x->security; 393 struct xfrm_sec_ctx *ctx = x->security;
383 int rc = 0; 394 int rc = 0;
384 395
385 if (ctx) 396 if (ctx) {
386 rc = avc_has_perm(tsec->sid, ctx->ctx_sid, 397 rc = avc_has_perm(tsec->sid, ctx->ctx_sid,
387 SECCLASS_ASSOCIATION, 398 SECCLASS_ASSOCIATION,
388 ASSOCIATION__SETCONTEXT, NULL); 399 ASSOCIATION__SETCONTEXT, NULL);
400 if (rc == 0)
401 atomic_dec(&selinux_xfrm_refcount);
402 }
389 403
390 return rc; 404 return rc;
391} 405}
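
The xfrm.c changes keep a global count of labeled XFRM policies and states: it is incremented only when the security context allocation succeeded and decremented only when the delete permission check passed. A minimal C11 model of that accounting pattern is below; the function names are invented for the example.

  #include <stdatomic.h>
  #include <stdio.h>

  static atomic_int labeled_count;   /* model of selinux_xfrm_refcount */

  /* bump the count only if the labeled context was really created */
  static int ctx_alloc(int fail)
  {
  	if (fail)
  		return -1;
  	atomic_fetch_add(&labeled_count, 1);
  	return 0;
  }

  /* drop the count only if the delete permission check passed */
  static int ctx_delete(int denied)
  {
  	if (denied)
  		return -1;
  	atomic_fetch_sub(&labeled_count, 1);
  	return 0;
  }

  int main(void)
  {
  	ctx_alloc(0);
  	ctx_alloc(0);
  	ctx_delete(0);
  	ctx_delete(1);                  /* denied: count unchanged */
  	printf("labeled entries: %d\n", atomic_load(&labeled_count));   /* 1 */
  	return 0;
  }
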
diff --git a/sound/pci/intel8x0.c b/sound/pci/intel8x0.c
index b4a38a3d855b..4bb97646a67a 100644
--- a/sound/pci/intel8x0.c
+++ b/sound/pci/intel8x0.c
@@ -711,11 +711,13 @@ static void snd_intel8x0_setup_periods(struct intel8x0 *chip, struct ichdev *ich
711static void fill_nocache(void *buf, int size, int nocache) 711static void fill_nocache(void *buf, int size, int nocache)
712{ 712{
713 size = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; 713 size = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
714 change_page_attr(virt_to_page(buf), size, nocache ? PAGE_KERNEL_NOCACHE : PAGE_KERNEL); 714 if (nocache)
715 global_flush_tlb(); 715 set_pages_uc(virt_to_page(buf), size);
716 else
717 set_pages_wb(virt_to_page(buf), size);
716} 718}
717#else 719#else
718#define fill_nocache(buf,size,nocache) 720#define fill_nocache(buf, size, nocache) do { ; } while (0)
719#endif 721#endif
720 722
721/* 723/*
diff --git a/drivers/kvm/ioapic.c b/virt/kvm/ioapic.c
index c7992e667fdb..317f8e211cd2 100644
--- a/drivers/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -26,7 +26,7 @@
26 * Based on Xen 3.1 code. 26 * Based on Xen 3.1 code.
27 */ 27 */
28 28
29#include "kvm.h" 29#include <linux/kvm_host.h>
30#include <linux/kvm.h> 30#include <linux/kvm.h>
31#include <linux/mm.h> 31#include <linux/mm.h>
32#include <linux/highmem.h> 32#include <linux/highmem.h>
@@ -34,14 +34,17 @@
34#include <linux/hrtimer.h> 34#include <linux/hrtimer.h>
35#include <linux/io.h> 35#include <linux/io.h>
36#include <asm/processor.h> 36#include <asm/processor.h>
37#include <asm/msr.h>
38#include <asm/page.h> 37#include <asm/page.h>
39#include <asm/current.h> 38#include <asm/current.h>
40#include <asm/apicdef.h> 39
41#include <asm/io_apic.h> 40#include "ioapic.h"
42#include "irq.h" 41#include "lapic.h"
43/* #define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */ 42
43#if 0
44#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg)
45#else
44#define ioapic_debug(fmt, arg...) 46#define ioapic_debug(fmt, arg...)
47#endif
45static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq); 48static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq);
46 49
47static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, 50static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
@@ -113,7 +116,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
113 default: 116 default:
114 index = (ioapic->ioregsel - 0x10) >> 1; 117 index = (ioapic->ioregsel - 0x10) >> 1;
115 118
116 ioapic_debug("change redir index %x val %x", index, val); 119 ioapic_debug("change redir index %x val %x\n", index, val);
117 if (index >= IOAPIC_NUM_PINS) 120 if (index >= IOAPIC_NUM_PINS)
118 return; 121 return;
119 if (ioapic->ioregsel & 1) { 122 if (ioapic->ioregsel & 1) {
@@ -131,16 +134,16 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
131} 134}
132 135
133static void ioapic_inj_irq(struct kvm_ioapic *ioapic, 136static void ioapic_inj_irq(struct kvm_ioapic *ioapic,
134 struct kvm_lapic *target, 137 struct kvm_vcpu *vcpu,
135 u8 vector, u8 trig_mode, u8 delivery_mode) 138 u8 vector, u8 trig_mode, u8 delivery_mode)
136{ 139{
137 ioapic_debug("irq %d trig %d deliv %d", vector, trig_mode, 140 ioapic_debug("irq %d trig %d deliv %d\n", vector, trig_mode,
138 delivery_mode); 141 delivery_mode);
139 142
140 ASSERT((delivery_mode == dest_Fixed) || 143 ASSERT((delivery_mode == IOAPIC_FIXED) ||
141 (delivery_mode == dest_LowestPrio)); 144 (delivery_mode == IOAPIC_LOWEST_PRIORITY));
142 145
143 kvm_apic_set_irq(target, vector, trig_mode); 146 kvm_apic_set_irq(vcpu, vector, trig_mode);
144} 147}
145 148
146static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, 149static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
@@ -151,12 +154,12 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
151 struct kvm *kvm = ioapic->kvm; 154 struct kvm *kvm = ioapic->kvm;
152 struct kvm_vcpu *vcpu; 155 struct kvm_vcpu *vcpu;
153 156
154 ioapic_debug("dest %d dest_mode %d", dest, dest_mode); 157 ioapic_debug("dest %d dest_mode %d\n", dest, dest_mode);
155 158
156 if (dest_mode == 0) { /* Physical mode. */ 159 if (dest_mode == 0) { /* Physical mode. */
157 if (dest == 0xFF) { /* Broadcast. */ 160 if (dest == 0xFF) { /* Broadcast. */
158 for (i = 0; i < KVM_MAX_VCPUS; ++i) 161 for (i = 0; i < KVM_MAX_VCPUS; ++i)
159 if (kvm->vcpus[i] && kvm->vcpus[i]->apic) 162 if (kvm->vcpus[i] && kvm->vcpus[i]->arch.apic)
160 mask |= 1 << i; 163 mask |= 1 << i;
161 return mask; 164 return mask;
162 } 165 }
@@ -164,8 +167,8 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
164 vcpu = kvm->vcpus[i]; 167 vcpu = kvm->vcpus[i];
165 if (!vcpu) 168 if (!vcpu)
166 continue; 169 continue;
167 if (kvm_apic_match_physical_addr(vcpu->apic, dest)) { 170 if (kvm_apic_match_physical_addr(vcpu->arch.apic, dest)) {
168 if (vcpu->apic) 171 if (vcpu->arch.apic)
169 mask = 1 << i; 172 mask = 1 << i;
170 break; 173 break;
171 } 174 }
@@ -175,11 +178,11 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
175 vcpu = kvm->vcpus[i]; 178 vcpu = kvm->vcpus[i];
176 if (!vcpu) 179 if (!vcpu)
177 continue; 180 continue;
178 if (vcpu->apic && 181 if (vcpu->arch.apic &&
179 kvm_apic_match_logical_addr(vcpu->apic, dest)) 182 kvm_apic_match_logical_addr(vcpu->arch.apic, dest))
180 mask |= 1 << vcpu->vcpu_id; 183 mask |= 1 << vcpu->vcpu_id;
181 } 184 }
182 ioapic_debug("mask %x", mask); 185 ioapic_debug("mask %x\n", mask);
183 return mask; 186 return mask;
184} 187}
185 188
@@ -191,41 +194,39 @@ static void ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
191 u8 vector = ioapic->redirtbl[irq].fields.vector; 194 u8 vector = ioapic->redirtbl[irq].fields.vector;
192 u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode; 195 u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode;
193 u32 deliver_bitmask; 196 u32 deliver_bitmask;
194 struct kvm_lapic *target;
195 struct kvm_vcpu *vcpu; 197 struct kvm_vcpu *vcpu;
196 int vcpu_id; 198 int vcpu_id;
197 199
198 ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " 200 ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
199 "vector=%x trig_mode=%x", 201 "vector=%x trig_mode=%x\n",
200 dest, dest_mode, delivery_mode, vector, trig_mode); 202 dest, dest_mode, delivery_mode, vector, trig_mode);
201 203
202 deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode); 204 deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode);
203 if (!deliver_bitmask) { 205 if (!deliver_bitmask) {
204 ioapic_debug("no target on destination"); 206 ioapic_debug("no target on destination\n");
205 return; 207 return;
206 } 208 }
207 209
208 switch (delivery_mode) { 210 switch (delivery_mode) {
209 case dest_LowestPrio: 211 case IOAPIC_LOWEST_PRIORITY:
210 target = 212 vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector,
211 kvm_apic_round_robin(ioapic->kvm, vector, deliver_bitmask); 213 deliver_bitmask);
212 if (target != NULL) 214 if (vcpu != NULL)
213 ioapic_inj_irq(ioapic, target, vector, 215 ioapic_inj_irq(ioapic, vcpu, vector,
214 trig_mode, delivery_mode); 216 trig_mode, delivery_mode);
215 else 217 else
216 ioapic_debug("null round robin: " 218 ioapic_debug("null lowest prio vcpu: "
217 "mask=%x vector=%x delivery_mode=%x", 219 "mask=%x vector=%x delivery_mode=%x\n",
218 deliver_bitmask, vector, dest_LowestPrio); 220 deliver_bitmask, vector, IOAPIC_LOWEST_PRIORITY);
219 break; 221 break;
220 case dest_Fixed: 222 case IOAPIC_FIXED:
221 for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { 223 for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
222 if (!(deliver_bitmask & (1 << vcpu_id))) 224 if (!(deliver_bitmask & (1 << vcpu_id)))
223 continue; 225 continue;
224 deliver_bitmask &= ~(1 << vcpu_id); 226 deliver_bitmask &= ~(1 << vcpu_id);
225 vcpu = ioapic->kvm->vcpus[vcpu_id]; 227 vcpu = ioapic->kvm->vcpus[vcpu_id];
226 if (vcpu) { 228 if (vcpu) {
227 target = vcpu->apic; 229 ioapic_inj_irq(ioapic, vcpu, vector,
228 ioapic_inj_irq(ioapic, target, vector,
229 trig_mode, delivery_mode); 230 trig_mode, delivery_mode);
230 } 231 }
231 } 232 }
@@ -271,7 +272,7 @@ static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector)
271 272
272void kvm_ioapic_update_eoi(struct kvm *kvm, int vector) 273void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
273{ 274{
274 struct kvm_ioapic *ioapic = kvm->vioapic; 275 struct kvm_ioapic *ioapic = kvm->arch.vioapic;
275 union ioapic_redir_entry *ent; 276 union ioapic_redir_entry *ent;
276 int gsi; 277 int gsi;
277 278
@@ -304,7 +305,7 @@ static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
304 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; 305 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
305 u32 result; 306 u32 result;
306 307
307 ioapic_debug("addr %lx", (unsigned long)addr); 308 ioapic_debug("addr %lx\n", (unsigned long)addr);
308 ASSERT(!(addr & 0xf)); /* check alignment */ 309 ASSERT(!(addr & 0xf)); /* check alignment */
309 310
310 addr &= 0xff; 311 addr &= 0xff;
@@ -341,8 +342,8 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
341 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; 342 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
342 u32 data; 343 u32 data;
343 344
344 ioapic_debug("ioapic_mmio_write addr=%lx len=%d val=%p\n", 345 ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n",
345 addr, len, val); 346 (void*)addr, len, val);
346 ASSERT(!(addr & 0xf)); /* check alignment */ 347 ASSERT(!(addr & 0xf)); /* check alignment */
347 if (len == 4 || len == 8) 348 if (len == 4 || len == 8)
348 data = *(u32 *) val; 349 data = *(u32 *) val;
@@ -360,24 +361,38 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
360 case IOAPIC_REG_WINDOW: 361 case IOAPIC_REG_WINDOW:
361 ioapic_write_indirect(ioapic, data); 362 ioapic_write_indirect(ioapic, data);
362 break; 363 break;
364#ifdef CONFIG_IA64
365 case IOAPIC_REG_EOI:
366 kvm_ioapic_update_eoi(ioapic->kvm, data);
367 break;
368#endif
363 369
364 default: 370 default:
365 break; 371 break;
366 } 372 }
367} 373}
368 374
375void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
376{
377 int i;
378
379 for (i = 0; i < IOAPIC_NUM_PINS; i++)
380 ioapic->redirtbl[i].fields.mask = 1;
381 ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
382 ioapic->ioregsel = 0;
383 ioapic->irr = 0;
384 ioapic->id = 0;
385}
386
369int kvm_ioapic_init(struct kvm *kvm) 387int kvm_ioapic_init(struct kvm *kvm)
370{ 388{
371 struct kvm_ioapic *ioapic; 389 struct kvm_ioapic *ioapic;
372 int i;
373 390
374 ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL); 391 ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
375 if (!ioapic) 392 if (!ioapic)
376 return -ENOMEM; 393 return -ENOMEM;
377 kvm->vioapic = ioapic; 394 kvm->arch.vioapic = ioapic;
378 for (i = 0; i < IOAPIC_NUM_PINS; i++) 395 kvm_ioapic_reset(ioapic);
379 ioapic->redirtbl[i].fields.mask = 1;
380 ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
381 ioapic->dev.read = ioapic_mmio_read; 396 ioapic->dev.read = ioapic_mmio_read;
382 ioapic->dev.write = ioapic_mmio_write; 397 ioapic->dev.write = ioapic_mmio_write;
383 ioapic->dev.in_range = ioapic_in_range; 398 ioapic->dev.in_range = ioapic_in_range;
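
ioapic_get_delivery_bitmask() above decides which virtual CPUs receive an interrupt: in physical mode 0xFF broadcasts to every present vCPU and any other value targets the vCPU whose APIC ID matches, while logical mode ORs in every vCPU whose logical ID overlaps the destination. The sketch below is a toy userspace model of that selection using flat, made-up per-vCPU ID tables rather than KVM's APIC state.

  #include <stdio.h>

  #define MAX_VCPUS 4

  /* made-up per-vCPU identifiers, not KVM structures */
  static unsigned int apic_id[MAX_VCPUS]    = { 0, 1, 2, 3 };
  static unsigned int logical_id[MAX_VCPUS] = { 0x01, 0x02, 0x04, 0x08 };

  static unsigned int delivery_bitmask(unsigned int dest, int logical_mode)
  {
  	unsigned int mask = 0;
  	int i;

  	if (!logical_mode) {                    /* physical mode */
  		if (dest == 0xFF) {             /* broadcast to every vCPU */
  			for (i = 0; i < MAX_VCPUS; i++)
  				mask |= 1u << i;
  			return mask;
  		}
  		for (i = 0; i < MAX_VCPUS; i++) /* single APIC ID match */
  			if (apic_id[i] == dest)
  				return 1u << i;
  		return 0;
  	}
  	for (i = 0; i < MAX_VCPUS; i++)         /* logical mode: flat overlap */
  		if (logical_id[i] & dest)
  			mask |= 1u << i;
  	return mask;
  }

  int main(void)
  {
  	printf("phys 0xFF -> %#x\n", delivery_bitmask(0xFF, 0));
  	printf("phys 2    -> %#x\n", delivery_bitmask(2, 0));
  	printf("log  0x06 -> %#x\n", delivery_bitmask(0x06, 1));
  	return 0;
  }
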
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
new file mode 100644
index 000000000000..7f16675fe783
--- /dev/null
+++ b/virt/kvm/ioapic.h
@@ -0,0 +1,95 @@
1#ifndef __KVM_IO_APIC_H
2#define __KVM_IO_APIC_H
3
4#include <linux/kvm_host.h>
5
6#include "iodev.h"
7
8struct kvm;
9struct kvm_vcpu;
10
11#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS
12#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
13#define IOAPIC_EDGE_TRIG 0
14#define IOAPIC_LEVEL_TRIG 1
15
16#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000
17#define IOAPIC_MEM_LENGTH 0x100
18
19/* Direct registers. */
20#define IOAPIC_REG_SELECT 0x00
21#define IOAPIC_REG_WINDOW 0x10
22#define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */
23
24/* Indirect registers. */
25#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */
26#define IOAPIC_REG_VERSION 0x01
27#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */
28
29/*ioapic delivery mode*/
30#define IOAPIC_FIXED 0x0
31#define IOAPIC_LOWEST_PRIORITY 0x1
32#define IOAPIC_PMI 0x2
33#define IOAPIC_NMI 0x4
34#define IOAPIC_INIT 0x5
35#define IOAPIC_EXTINT 0x7
36
37struct kvm_ioapic {
38 u64 base_address;
39 u32 ioregsel;
40 u32 id;
41 u32 irr;
42 u32 pad;
43 union ioapic_redir_entry {
44 u64 bits;
45 struct {
46 u8 vector;
47 u8 delivery_mode:3;
48 u8 dest_mode:1;
49 u8 delivery_status:1;
50 u8 polarity:1;
51 u8 remote_irr:1;
52 u8 trig_mode:1;
53 u8 mask:1;
54 u8 reserve:7;
55 u8 reserved[4];
56 u8 dest_id;
57 } fields;
58 } redirtbl[IOAPIC_NUM_PINS];
59 struct kvm_io_device dev;
60 struct kvm *kvm;
61};
62
63#ifdef DEBUG
64#define ASSERT(x) \
65do { \
66 if (!(x)) { \
67 printk(KERN_EMERG "assertion failed %s: %d: %s\n", \
68 __FILE__, __LINE__, #x); \
69 BUG(); \
70 } \
71} while (0)
72#else
73#define ASSERT(x) do { } while (0)
74#endif
75
76static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
77{
78 return kvm->arch.vioapic;
79}
80
81#ifdef CONFIG_IA64
82static inline int irqchip_in_kernel(struct kvm *kvm)
83{
84 return 1;
85}
86#endif
87
88struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
89 unsigned long bitmap);
90void kvm_ioapic_update_eoi(struct kvm *kvm, int vector);
91int kvm_ioapic_init(struct kvm *kvm);
92void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
93void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
94
95#endif
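
The ioapic_redir_entry union above mirrors the layout of a hardware I/O APIC redirection table entry, so one 64-bit register value can be viewed either as raw bits or through the individual fields. As a rough illustration (not part of the patch), a hypothetical helper that masks a pin and dumps what is currently programmed there might look like this, assuming a struct kvm_ioapic * is already at hand:

/* Hypothetical helper, for illustration only: mask one redirection
 * table pin and report its currently programmed vector/trigger mode. */
static void ioapic_mask_pin_example(struct kvm_ioapic *ioapic, int pin)
{
	union ioapic_redir_entry *e = &ioapic->redirtbl[pin];

	e->fields.mask = 1;	/* pin no longer delivers interrupts */

	printk(KERN_DEBUG "pin %d: vector 0x%x, %s triggered, raw %llx\n",
	       pin, e->fields.vector,
	       e->fields.trig_mode == IOAPIC_LEVEL_TRIG ? "level" : "edge",
	       (unsigned long long)e->bits);
}

This is the same view of the table that kvm_ioapic_reset() uses when it sets fields.mask on every pin at init time.
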
diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h
new file mode 100644
index 000000000000..c14e642027b2
--- /dev/null
+++ b/virt/kvm/iodev.h
@@ -0,0 +1,63 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License.
5 *
6 * This program is distributed in the hope that it will be useful,
7 * but WITHOUT ANY WARRANTY; without even the implied warranty of
8 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9 * GNU General Public License for more details.
10 *
11 * You should have received a copy of the GNU General Public License
12 * along with this program; if not, write to the Free Software
13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
14 */
15
16#ifndef __KVM_IODEV_H__
17#define __KVM_IODEV_H__
18
19#include <linux/kvm_types.h>
20
21struct kvm_io_device {
22 void (*read)(struct kvm_io_device *this,
23 gpa_t addr,
24 int len,
25 void *val);
26 void (*write)(struct kvm_io_device *this,
27 gpa_t addr,
28 int len,
29 const void *val);
30 int (*in_range)(struct kvm_io_device *this, gpa_t addr);
31 void (*destructor)(struct kvm_io_device *this);
32
33 void *private;
34};
35
36static inline void kvm_iodevice_read(struct kvm_io_device *dev,
37 gpa_t addr,
38 int len,
39 void *val)
40{
41 dev->read(dev, addr, len, val);
42}
43
44static inline void kvm_iodevice_write(struct kvm_io_device *dev,
45 gpa_t addr,
46 int len,
47 const void *val)
48{
49 dev->write(dev, addr, len, val);
50}
51
52static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr)
53{
54 return dev->in_range(dev, addr);
55}
56
57static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
58{
59 if (dev->destructor)
60 dev->destructor(dev);
61}
62
63#endif /* __KVM_IODEV_H__ */
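
struct kvm_io_device is a tiny vtable: a device supplies read/write/in_range callbacks plus an optional destructor, and the kvm_iodevice_*() wrappers dispatch through them. As a rough sketch (not from this patch), a dummy MMIO device covering a fixed guest-physical window could be wired up as below; the base address, length and device name are illustrative assumptions:

#define DUMMY_BASE  0xfed00000ULL	/* hypothetical guest-physical window */
#define DUMMY_LEN   0x100

static void dummy_read(struct kvm_io_device *this, gpa_t addr, int len, void *val)
{
	memset(val, 0xff, len);		/* reads return all-ones */
}

static void dummy_write(struct kvm_io_device *this, gpa_t addr, int len,
			const void *val)
{
	/* writes are silently dropped */
}

static int dummy_in_range(struct kvm_io_device *this, gpa_t addr)
{
	return addr >= DUMMY_BASE && addr < DUMMY_BASE + DUMMY_LEN;
}

static struct kvm_io_device dummy_dev = {
	.read     = dummy_read,
	.write    = dummy_write,
	.in_range = dummy_in_range,
};

/* registration while setting up a VM, e.g.:
 *	kvm_io_bus_register_dev(&kvm->mmio_bus, &dummy_dev);
 */

A real device would normally stash its own state behind the ->private pointer, which is exactly how the in-kernel IOAPIC above finds its struct kvm_ioapic from the callback's "this" argument.
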
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
new file mode 100644
index 000000000000..3c4fe26096fc
--- /dev/null
+++ b/virt/kvm/kvm_main.c
@@ -0,0 +1,1400 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * Copyright (C) 2006 Qumranet, Inc.
8 *
9 * Authors:
10 * Avi Kivity <avi@qumranet.com>
11 * Yaniv Kamay <yaniv@qumranet.com>
12 *
13 * This work is licensed under the terms of the GNU GPL, version 2. See
14 * the COPYING file in the top-level directory.
15 *
16 */
17
18#include "iodev.h"
19
20#include <linux/kvm_host.h>
21#include <linux/kvm.h>
22#include <linux/module.h>
23#include <linux/errno.h>
24#include <linux/percpu.h>
25#include <linux/gfp.h>
26#include <linux/mm.h>
27#include <linux/miscdevice.h>
28#include <linux/vmalloc.h>
29#include <linux/reboot.h>
30#include <linux/debugfs.h>
31#include <linux/highmem.h>
32#include <linux/file.h>
33#include <linux/sysdev.h>
34#include <linux/cpu.h>
35#include <linux/sched.h>
36#include <linux/cpumask.h>
37#include <linux/smp.h>
38#include <linux/anon_inodes.h>
39#include <linux/profile.h>
40#include <linux/kvm_para.h>
41#include <linux/pagemap.h>
42#include <linux/mman.h>
43
44#include <asm/processor.h>
45#include <asm/io.h>
46#include <asm/uaccess.h>
47#include <asm/pgtable.h>
48
49MODULE_AUTHOR("Qumranet");
50MODULE_LICENSE("GPL");
51
52DEFINE_SPINLOCK(kvm_lock);
53LIST_HEAD(vm_list);
54
55static cpumask_t cpus_hardware_enabled;
56
57struct kmem_cache *kvm_vcpu_cache;
58EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
59
60static __read_mostly struct preempt_ops kvm_preempt_ops;
61
62static struct dentry *debugfs_dir;
63
64static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
65 unsigned long arg);
66
67static inline int valid_vcpu(int n)
68{
69 return likely(n >= 0 && n < KVM_MAX_VCPUS);
70}
71
72/*
73 * Switches to specified vcpu, until a matching vcpu_put()
74 */
75void vcpu_load(struct kvm_vcpu *vcpu)
76{
77 int cpu;
78
79 mutex_lock(&vcpu->mutex);
80 cpu = get_cpu();
81 preempt_notifier_register(&vcpu->preempt_notifier);
82 kvm_arch_vcpu_load(vcpu, cpu);
83 put_cpu();
84}
85
86void vcpu_put(struct kvm_vcpu *vcpu)
87{
88 preempt_disable();
89 kvm_arch_vcpu_put(vcpu);
90 preempt_notifier_unregister(&vcpu->preempt_notifier);
91 preempt_enable();
92 mutex_unlock(&vcpu->mutex);
93}
94
95static void ack_flush(void *_completed)
96{
97}
98
99void kvm_flush_remote_tlbs(struct kvm *kvm)
100{
101 int i, cpu;
102 cpumask_t cpus;
103 struct kvm_vcpu *vcpu;
104
105 cpus_clear(cpus);
106 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
107 vcpu = kvm->vcpus[i];
108 if (!vcpu)
109 continue;
110 if (test_and_set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
111 continue;
112 cpu = vcpu->cpu;
113 if (cpu != -1 && cpu != raw_smp_processor_id())
114 cpu_set(cpu, cpus);
115 }
116 if (cpus_empty(cpus))
117 return;
118 ++kvm->stat.remote_tlb_flush;
119 smp_call_function_mask(cpus, ack_flush, NULL, 1);
120}
121
122int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
123{
124 struct page *page;
125 int r;
126
127 mutex_init(&vcpu->mutex);
128 vcpu->cpu = -1;
129 vcpu->kvm = kvm;
130 vcpu->vcpu_id = id;
131 init_waitqueue_head(&vcpu->wq);
132
133 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
134 if (!page) {
135 r = -ENOMEM;
136 goto fail;
137 }
138 vcpu->run = page_address(page);
139
140 r = kvm_arch_vcpu_init(vcpu);
141 if (r < 0)
142 goto fail_free_run;
143 return 0;
144
145fail_free_run:
146 free_page((unsigned long)vcpu->run);
147fail:
148 return r;
149}
150EXPORT_SYMBOL_GPL(kvm_vcpu_init);
151
152void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
153{
154 kvm_arch_vcpu_uninit(vcpu);
155 free_page((unsigned long)vcpu->run);
156}
157EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
158
159static struct kvm *kvm_create_vm(void)
160{
161 struct kvm *kvm = kvm_arch_create_vm();
162
163 if (IS_ERR(kvm))
164 goto out;
165
166 kvm->mm = current->mm;
167 atomic_inc(&kvm->mm->mm_count);
168 spin_lock_init(&kvm->mmu_lock);
169 kvm_io_bus_init(&kvm->pio_bus);
170 mutex_init(&kvm->lock);
171 kvm_io_bus_init(&kvm->mmio_bus);
172 spin_lock(&kvm_lock);
173 list_add(&kvm->vm_list, &vm_list);
174 spin_unlock(&kvm_lock);
175out:
176 return kvm;
177}
178
179/*
180 * Free any memory in @free but not in @dont.
181 */
182static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
183 struct kvm_memory_slot *dont)
184{
185 if (!dont || free->rmap != dont->rmap)
186 vfree(free->rmap);
187
188 if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
189 vfree(free->dirty_bitmap);
190
191 free->npages = 0;
192 free->dirty_bitmap = NULL;
193 free->rmap = NULL;
194}
195
196void kvm_free_physmem(struct kvm *kvm)
197{
198 int i;
199
200 for (i = 0; i < kvm->nmemslots; ++i)
201 kvm_free_physmem_slot(&kvm->memslots[i], NULL);
202}
203
204static void kvm_destroy_vm(struct kvm *kvm)
205{
206 struct mm_struct *mm = kvm->mm;
207
208 spin_lock(&kvm_lock);
209 list_del(&kvm->vm_list);
210 spin_unlock(&kvm_lock);
211 kvm_io_bus_destroy(&kvm->pio_bus);
212 kvm_io_bus_destroy(&kvm->mmio_bus);
213 kvm_arch_destroy_vm(kvm);
214 mmdrop(mm);
215}
216
217static int kvm_vm_release(struct inode *inode, struct file *filp)
218{
219 struct kvm *kvm = filp->private_data;
220
221 kvm_destroy_vm(kvm);
222 return 0;
223}
224
225/*
226 * Allocate some memory and give it an address in the guest physical address
227 * space.
228 *
229 * Discontiguous memory is allowed, mostly for framebuffers.
230 *
231 * Must be called holding mmap_sem for write.
232 */
233int __kvm_set_memory_region(struct kvm *kvm,
234 struct kvm_userspace_memory_region *mem,
235 int user_alloc)
236{
237 int r;
238 gfn_t base_gfn;
239 unsigned long npages;
240 unsigned long i;
241 struct kvm_memory_slot *memslot;
242 struct kvm_memory_slot old, new;
243
244 r = -EINVAL;
245 /* General sanity checks */
246 if (mem->memory_size & (PAGE_SIZE - 1))
247 goto out;
248 if (mem->guest_phys_addr & (PAGE_SIZE - 1))
249 goto out;
250 if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
251 goto out;
252 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
253 goto out;
254
255 memslot = &kvm->memslots[mem->slot];
256 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
257 npages = mem->memory_size >> PAGE_SHIFT;
258
259 if (!npages)
260 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
261
262 new = old = *memslot;
263
264 new.base_gfn = base_gfn;
265 new.npages = npages;
266 new.flags = mem->flags;
267
268 /* Disallow changing a memory slot's size. */
269 r = -EINVAL;
270 if (npages && old.npages && npages != old.npages)
271 goto out_free;
272
273 /* Check for overlaps */
274 r = -EEXIST;
275 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
276 struct kvm_memory_slot *s = &kvm->memslots[i];
277
278 if (s == memslot)
279 continue;
280 if (!((base_gfn + npages <= s->base_gfn) ||
281 (base_gfn >= s->base_gfn + s->npages)))
282 goto out_free;
283 }
284
285 /* Free page dirty bitmap if unneeded */
286 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
287 new.dirty_bitmap = NULL;
288
289 r = -ENOMEM;
290
291 /* Allocate if a slot is being created */
292 if (npages && !new.rmap) {
293 new.rmap = vmalloc(npages * sizeof(struct page *));
294
295 if (!new.rmap)
296 goto out_free;
297
298 memset(new.rmap, 0, npages * sizeof(*new.rmap));
299
300 new.user_alloc = user_alloc;
301 new.userspace_addr = mem->userspace_addr;
302 }
303
304 /* Allocate page dirty bitmap if needed */
305 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
306 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
307
308 new.dirty_bitmap = vmalloc(dirty_bytes);
309 if (!new.dirty_bitmap)
310 goto out_free;
311 memset(new.dirty_bitmap, 0, dirty_bytes);
312 }
313
314 if (mem->slot >= kvm->nmemslots)
315 kvm->nmemslots = mem->slot + 1;
316
317 *memslot = new;
318
319 r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc);
320 if (r) {
321 *memslot = old;
322 goto out_free;
323 }
324
325 kvm_free_physmem_slot(&old, &new);
326 return 0;
327
328out_free:
329 kvm_free_physmem_slot(&new, &old);
330out:
331 return r;
332
333}
334EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
335
336int kvm_set_memory_region(struct kvm *kvm,
337 struct kvm_userspace_memory_region *mem,
338 int user_alloc)
339{
340 int r;
341
342 down_write(&current->mm->mmap_sem);
343 r = __kvm_set_memory_region(kvm, mem, user_alloc);
344 up_write(&current->mm->mmap_sem);
345 return r;
346}
347EXPORT_SYMBOL_GPL(kvm_set_memory_region);
348
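
The two functions above are reached from userspace through the KVM_SET_USER_MEMORY_REGION ioctl (dispatched by kvm_vm_ioctl_set_memory_region() just below), so the sanity checks on memory_size, guest_phys_addr and slot correspond one-to-one to fields the caller fills in. A minimal userspace sketch, not part of this patch, assuming vm_fd came from ioctl(kvm_fd, KVM_CREATE_VM, 0):

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

/* Back 16 MB of guest RAM at guest physical address 0 with an
 * anonymous mapping in the caller's address space. */
static int set_guest_ram(int vm_fd)
{
	size_t size = 16 << 20;
	void *mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct kvm_userspace_memory_region region = {
		.slot            = 0,
		.flags           = 0,	/* or KVM_MEM_LOG_DIRTY_PAGES */
		.guest_phys_addr = 0,
		.memory_size     = size,
		.userspace_addr  = (__u64)(unsigned long)mem,
	};

	if (mem == MAP_FAILED)
		return -1;
	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}
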
349int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
350 struct
351 kvm_userspace_memory_region *mem,
352 int user_alloc)
353{
354 if (mem->slot >= KVM_MEMORY_SLOTS)
355 return -EINVAL;
356 return kvm_set_memory_region(kvm, mem, user_alloc);
357}
358
359int kvm_get_dirty_log(struct kvm *kvm,
360 struct kvm_dirty_log *log, int *is_dirty)
361{
362 struct kvm_memory_slot *memslot;
363 int r, i;
364 int n;
365 unsigned long any = 0;
366
367 r = -EINVAL;
368 if (log->slot >= KVM_MEMORY_SLOTS)
369 goto out;
370
371 memslot = &kvm->memslots[log->slot];
372 r = -ENOENT;
373 if (!memslot->dirty_bitmap)
374 goto out;
375
376 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
377
378 for (i = 0; !any && i < n/sizeof(long); ++i)
379 any = memslot->dirty_bitmap[i];
380
381 r = -EFAULT;
382 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
383 goto out;
384
385 if (any)
386 *is_dirty = 1;
387
388 r = 0;
389out:
390 return r;
391}
392
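
kvm_get_dirty_log() above is the generic helper behind the KVM_GET_DIRTY_LOG ioctl (dispatched from kvm_vm_ioctl() further down via the arch code): userspace names a slot and supplies a buffer with one bit per page in that slot, and gets the accumulated dirty bitmap copied out. A rough userspace sketch, with vm_fd, slot and npages assumed known from the earlier KVM_SET_USER_MEMORY_REGION call:

#include <linux/kvm.h>
#include <stdlib.h>
#include <sys/ioctl.h>

static int fetch_dirty_bitmap(int vm_fd, unsigned int slot,
			      unsigned long npages, unsigned long **bitmap_out)
{
	/* one bit per page, rounded up to a multiple of 64 bits to be safe */
	size_t bytes = ((npages + 63) / 64) * 8;
	unsigned long *bitmap = calloc(1, bytes);
	struct kvm_dirty_log log = {
		.slot = slot,
		.dirty_bitmap = bitmap,
	};

	if (!bitmap)
		return -1;
	if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log) < 0) {
		free(bitmap);
		return -1;
	}
	*bitmap_out = bitmap;
	return 0;
}
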
393int is_error_page(struct page *page)
394{
395 return page == bad_page;
396}
397EXPORT_SYMBOL_GPL(is_error_page);
398
399static inline unsigned long bad_hva(void)
400{
401 return PAGE_OFFSET;
402}
403
404int kvm_is_error_hva(unsigned long addr)
405{
406 return addr == bad_hva();
407}
408EXPORT_SYMBOL_GPL(kvm_is_error_hva);
409
410static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
411{
412 int i;
413
414 for (i = 0; i < kvm->nmemslots; ++i) {
415 struct kvm_memory_slot *memslot = &kvm->memslots[i];
416
417 if (gfn >= memslot->base_gfn
418 && gfn < memslot->base_gfn + memslot->npages)
419 return memslot;
420 }
421 return NULL;
422}
423
424struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
425{
426 gfn = unalias_gfn(kvm, gfn);
427 return __gfn_to_memslot(kvm, gfn);
428}
429
430int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
431{
432 int i;
433
434 gfn = unalias_gfn(kvm, gfn);
435 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
436 struct kvm_memory_slot *memslot = &kvm->memslots[i];
437
438 if (gfn >= memslot->base_gfn
439 && gfn < memslot->base_gfn + memslot->npages)
440 return 1;
441 }
442 return 0;
443}
444EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
445
446static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
447{
448 struct kvm_memory_slot *slot;
449
450 gfn = unalias_gfn(kvm, gfn);
451 slot = __gfn_to_memslot(kvm, gfn);
452 if (!slot)
453 return bad_hva();
454 return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
455}
456
457/*
458 * Requires current->mm->mmap_sem to be held
459 */
460struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
461{
462 struct page *page[1];
463 unsigned long addr;
464 int npages;
465
466 might_sleep();
467
468 addr = gfn_to_hva(kvm, gfn);
469 if (kvm_is_error_hva(addr)) {
470 get_page(bad_page);
471 return bad_page;
472 }
473
474 npages = get_user_pages(current, current->mm, addr, 1, 1, 1, page,
475 NULL);
476
477 if (npages != 1) {
478 get_page(bad_page);
479 return bad_page;
480 }
481
482 return page[0];
483}
484
485EXPORT_SYMBOL_GPL(gfn_to_page);
486
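
The locking comment above gfn_to_page() matters in practice: the get_user_pages() call it makes requires current->mm->mmap_sem to be held for read, and the returned page always carries a reference, even when it is bad_page. A hedged in-kernel sketch of a well-formed call site (the helper name and the use of the page are illustrative only):

/* Illustration only, not part of the patch. */
static void touch_guest_frame(struct kvm *kvm, gfn_t gfn)
{
	struct page *page;

	down_read(&current->mm->mmap_sem);
	page = gfn_to_page(kvm, gfn);
	up_read(&current->mm->mmap_sem);

	if (is_error_page(page)) {
		kvm_release_page_clean(page);	/* bad_page holds a reference too */
		return;
	}
	/* ... use the page (kmap(), memcpy(), ...) ... */
	kvm_release_page_clean(page);	/* or kvm_release_page_dirty() if written */
}
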
487void kvm_release_page_clean(struct page *page)
488{
489 put_page(page);
490}
491EXPORT_SYMBOL_GPL(kvm_release_page_clean);
492
493void kvm_release_page_dirty(struct page *page)
494{
495 if (!PageReserved(page))
496 SetPageDirty(page);
497 put_page(page);
498}
499EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
500
501static int next_segment(unsigned long len, int offset)
502{
503 if (len > PAGE_SIZE - offset)
504 return PAGE_SIZE - offset;
505 else
506 return len;
507}
508
509int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
510 int len)
511{
512 int r;
513 unsigned long addr;
514
515 addr = gfn_to_hva(kvm, gfn);
516 if (kvm_is_error_hva(addr))
517 return -EFAULT;
518 r = copy_from_user(data, (void __user *)addr + offset, len);
519 if (r)
520 return -EFAULT;
521 return 0;
522}
523EXPORT_SYMBOL_GPL(kvm_read_guest_page);
524
525int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
526{
527 gfn_t gfn = gpa >> PAGE_SHIFT;
528 int seg;
529 int offset = offset_in_page(gpa);
530 int ret;
531
532 while ((seg = next_segment(len, offset)) != 0) {
533 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
534 if (ret < 0)
535 return ret;
536 offset = 0;
537 len -= seg;
538 data += seg;
539 ++gfn;
540 }
541 return 0;
542}
543EXPORT_SYMBOL_GPL(kvm_read_guest);
544
545int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
546 unsigned long len)
547{
548 int r;
549 unsigned long addr;
550 gfn_t gfn = gpa >> PAGE_SHIFT;
551 int offset = offset_in_page(gpa);
552
553 addr = gfn_to_hva(kvm, gfn);
554 if (kvm_is_error_hva(addr))
555 return -EFAULT;
556 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
557 if (r)
558 return -EFAULT;
559 return 0;
560}
561EXPORT_SYMBOL(kvm_read_guest_atomic);
562
563int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
564 int offset, int len)
565{
566 int r;
567 unsigned long addr;
568
569 addr = gfn_to_hva(kvm, gfn);
570 if (kvm_is_error_hva(addr))
571 return -EFAULT;
572 r = copy_to_user((void __user *)addr + offset, data, len);
573 if (r)
574 return -EFAULT;
575 mark_page_dirty(kvm, gfn);
576 return 0;
577}
578EXPORT_SYMBOL_GPL(kvm_write_guest_page);
579
580int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
581 unsigned long len)
582{
583 gfn_t gfn = gpa >> PAGE_SHIFT;
584 int seg;
585 int offset = offset_in_page(gpa);
586 int ret;
587
588 while ((seg = next_segment(len, offset)) != 0) {
589 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
590 if (ret < 0)
591 return ret;
592 offset = 0;
593 len -= seg;
594 data += seg;
595 ++gfn;
596 }
597 return 0;
598}
599
600int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
601{
602 return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len);
603}
604EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
605
606int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
607{
608 gfn_t gfn = gpa >> PAGE_SHIFT;
609 int seg;
610 int offset = offset_in_page(gpa);
611 int ret;
612
613 while ((seg = next_segment(len, offset)) != 0) {
614 ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
615 if (ret < 0)
616 return ret;
617 offset = 0;
618 len -= seg;
619 ++gfn;
620 }
621 return 0;
622}
623EXPORT_SYMBOL_GPL(kvm_clear_guest);
624
625void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
626{
627 struct kvm_memory_slot *memslot;
628
629 gfn = unalias_gfn(kvm, gfn);
630 memslot = __gfn_to_memslot(kvm, gfn);
631 if (memslot && memslot->dirty_bitmap) {
632 unsigned long rel_gfn = gfn - memslot->base_gfn;
633
634 /* avoid RMW */
635 if (!test_bit(rel_gfn, memslot->dirty_bitmap))
636 set_bit(rel_gfn, memslot->dirty_bitmap);
637 }
638}
639
640/*
641 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
642 */
643void kvm_vcpu_block(struct kvm_vcpu *vcpu)
644{
645 DECLARE_WAITQUEUE(wait, current);
646
647 add_wait_queue(&vcpu->wq, &wait);
648
649 /*
650 * We will block until either an interrupt or a signal wakes us up
651 */
652 while (!kvm_cpu_has_interrupt(vcpu)
653 && !signal_pending(current)
654 && !kvm_arch_vcpu_runnable(vcpu)) {
655 set_current_state(TASK_INTERRUPTIBLE);
656 vcpu_put(vcpu);
657 schedule();
658 vcpu_load(vcpu);
659 }
660
661 __set_current_state(TASK_RUNNING);
662 remove_wait_queue(&vcpu->wq, &wait);
663}
664
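
kvm_vcpu_block() above parks the vcpu thread on vcpu->wq until an interrupt, a pending signal, or an architecture-specific runnable condition shows up, re-checking the condition after every wakeup. The complementary side, whatever path injects the interrupt, only needs to poke that wait queue; a hedged sketch (the function name is made up for illustration, the real injection paths live in the arch code):

/* Illustration only: what an interrupt-injection path might do to end the HLT. */
static void example_wake_halted_vcpu(struct kvm_vcpu *vcpu)
{
	/* ... record the pending interrupt in the architecture state ... */

	if (waitqueue_active(&vcpu->wq))
		wake_up_interruptible(&vcpu->wq);
}
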
665void kvm_resched(struct kvm_vcpu *vcpu)
666{
667 if (!need_resched())
668 return;
669 cond_resched();
670}
671EXPORT_SYMBOL_GPL(kvm_resched);
672
673static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
674{
675 struct kvm_vcpu *vcpu = vma->vm_file->private_data;
676 struct page *page;
677
678 if (vmf->pgoff == 0)
679 page = virt_to_page(vcpu->run);
680 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
681 page = virt_to_page(vcpu->arch.pio_data);
682 else
683 return VM_FAULT_SIGBUS;
684 get_page(page);
685 vmf->page = page;
686 return 0;
687}
688
689static struct vm_operations_struct kvm_vcpu_vm_ops = {
690 .fault = kvm_vcpu_fault,
691};
692
693static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
694{
695 vma->vm_ops = &kvm_vcpu_vm_ops;
696 return 0;
697}
698
699static int kvm_vcpu_release(struct inode *inode, struct file *filp)
700{
701 struct kvm_vcpu *vcpu = filp->private_data;
702
703 fput(vcpu->kvm->filp);
704 return 0;
705}
706
707static struct file_operations kvm_vcpu_fops = {
708 .release = kvm_vcpu_release,
709 .unlocked_ioctl = kvm_vcpu_ioctl,
710 .compat_ioctl = kvm_vcpu_ioctl,
711 .mmap = kvm_vcpu_mmap,
712};
713
714/*
715 * Allocates an inode for the vcpu.
716 */
717static int create_vcpu_fd(struct kvm_vcpu *vcpu)
718{
719 int fd, r;
720 struct inode *inode;
721 struct file *file;
722
723 r = anon_inode_getfd(&fd, &inode, &file,
724 "kvm-vcpu", &kvm_vcpu_fops, vcpu);
725 if (r)
726 return r;
727 atomic_inc(&vcpu->kvm->filp->f_count);
728 return fd;
729}
730
731/*
732 * Creates some virtual cpus. Good luck creating more than one.
733 */
734static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
735{
736 int r;
737 struct kvm_vcpu *vcpu;
738
739 if (!valid_vcpu(n))
740 return -EINVAL;
741
742 vcpu = kvm_arch_vcpu_create(kvm, n);
743 if (IS_ERR(vcpu))
744 return PTR_ERR(vcpu);
745
746 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
747
748 r = kvm_arch_vcpu_setup(vcpu);
749 if (r)
750 goto vcpu_destroy;
751
752 mutex_lock(&kvm->lock);
753 if (kvm->vcpus[n]) {
754 r = -EEXIST;
755 mutex_unlock(&kvm->lock);
756 goto vcpu_destroy;
757 }
758 kvm->vcpus[n] = vcpu;
759 mutex_unlock(&kvm->lock);
760
761 /* Now it's all set up, let userspace reach it */
762 r = create_vcpu_fd(vcpu);
763 if (r < 0)
764 goto unlink;
765 return r;
766
767unlink:
768 mutex_lock(&kvm->lock);
769 kvm->vcpus[n] = NULL;
770 mutex_unlock(&kvm->lock);
771vcpu_destroy:
772 kvm_arch_vcpu_destroy(vcpu);
773 return r;
774}
775
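
kvm_vm_ioctl_create_vcpu() above is driven by the KVM_CREATE_VCPU ioctl, and the fd it hands back is what kvm_vcpu_mmap() serves: offset 0 of that fd maps the shared struct kvm_run page, sized by KVM_GET_VCPU_MMAP_SIZE (two pages here, run area plus pio data). A rough userspace sketch, with kvm_fd and vm_fd assumed from the earlier steps:

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

/* Create vcpu 0 and map its run area; returns NULL on failure. */
static struct kvm_run *create_vcpu0(int kvm_fd, int vm_fd, int *vcpu_fd_out)
{
	int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
	long mmap_size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
	struct kvm_run *run;

	if (vcpu_fd < 0 || mmap_size <= 0)
		return NULL;
	run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
		   MAP_SHARED, vcpu_fd, 0);
	if (run == MAP_FAILED)
		return NULL;
	*vcpu_fd_out = vcpu_fd;
	return run;	/* ready for ioctl(vcpu_fd, KVM_RUN, 0) loops */
}
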
776static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
777{
778 if (sigset) {
779 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
780 vcpu->sigset_active = 1;
781 vcpu->sigset = *sigset;
782 } else
783 vcpu->sigset_active = 0;
784 return 0;
785}
786
787static long kvm_vcpu_ioctl(struct file *filp,
788 unsigned int ioctl, unsigned long arg)
789{
790 struct kvm_vcpu *vcpu = filp->private_data;
791 void __user *argp = (void __user *)arg;
792 int r;
793
794 if (vcpu->kvm->mm != current->mm)
795 return -EIO;
796 switch (ioctl) {
797 case KVM_RUN:
798 r = -EINVAL;
799 if (arg)
800 goto out;
801 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
802 break;
803 case KVM_GET_REGS: {
804 struct kvm_regs kvm_regs;
805
806 memset(&kvm_regs, 0, sizeof kvm_regs);
807 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
808 if (r)
809 goto out;
810 r = -EFAULT;
811 if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
812 goto out;
813 r = 0;
814 break;
815 }
816 case KVM_SET_REGS: {
817 struct kvm_regs kvm_regs;
818
819 r = -EFAULT;
820 if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
821 goto out;
822 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
823 if (r)
824 goto out;
825 r = 0;
826 break;
827 }
828 case KVM_GET_SREGS: {
829 struct kvm_sregs kvm_sregs;
830
831 memset(&kvm_sregs, 0, sizeof kvm_sregs);
832 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
833 if (r)
834 goto out;
835 r = -EFAULT;
836 if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
837 goto out;
838 r = 0;
839 break;
840 }
841 case KVM_SET_SREGS: {
842 struct kvm_sregs kvm_sregs;
843
844 r = -EFAULT;
845 if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
846 goto out;
847 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
848 if (r)
849 goto out;
850 r = 0;
851 break;
852 }
853 case KVM_TRANSLATE: {
854 struct kvm_translation tr;
855
856 r = -EFAULT;
857 if (copy_from_user(&tr, argp, sizeof tr))
858 goto out;
859 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
860 if (r)
861 goto out;
862 r = -EFAULT;
863 if (copy_to_user(argp, &tr, sizeof tr))
864 goto out;
865 r = 0;
866 break;
867 }
868 case KVM_DEBUG_GUEST: {
869 struct kvm_debug_guest dbg;
870
871 r = -EFAULT;
872 if (copy_from_user(&dbg, argp, sizeof dbg))
873 goto out;
874 r = kvm_arch_vcpu_ioctl_debug_guest(vcpu, &dbg);
875 if (r)
876 goto out;
877 r = 0;
878 break;
879 }
880 case KVM_SET_SIGNAL_MASK: {
881 struct kvm_signal_mask __user *sigmask_arg = argp;
882 struct kvm_signal_mask kvm_sigmask;
883 sigset_t sigset, *p;
884
885 p = NULL;
886 if (argp) {
887 r = -EFAULT;
888 if (copy_from_user(&kvm_sigmask, argp,
889 sizeof kvm_sigmask))
890 goto out;
891 r = -EINVAL;
892 if (kvm_sigmask.len != sizeof sigset)
893 goto out;
894 r = -EFAULT;
895 if (copy_from_user(&sigset, sigmask_arg->sigset,
896 sizeof sigset))
897 goto out;
898 p = &sigset;
899 }
900 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
901 break;
902 }
903 case KVM_GET_FPU: {
904 struct kvm_fpu fpu;
905
906 memset(&fpu, 0, sizeof fpu);
907 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, &fpu);
908 if (r)
909 goto out;
910 r = -EFAULT;
911 if (copy_to_user(argp, &fpu, sizeof fpu))
912 goto out;
913 r = 0;
914 break;
915 }
916 case KVM_SET_FPU: {
917 struct kvm_fpu fpu;
918
919 r = -EFAULT;
920 if (copy_from_user(&fpu, argp, sizeof fpu))
921 goto out;
922 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, &fpu);
923 if (r)
924 goto out;
925 r = 0;
926 break;
927 }
928 default:
929 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
930 }
931out:
932 return r;
933}
934
935static long kvm_vm_ioctl(struct file *filp,
936 unsigned int ioctl, unsigned long arg)
937{
938 struct kvm *kvm = filp->private_data;
939 void __user *argp = (void __user *)arg;
940 int r;
941
942 if (kvm->mm != current->mm)
943 return -EIO;
944 switch (ioctl) {
945 case KVM_CREATE_VCPU:
946 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
947 if (r < 0)
948 goto out;
949 break;
950 case KVM_SET_USER_MEMORY_REGION: {
951 struct kvm_userspace_memory_region kvm_userspace_mem;
952
953 r = -EFAULT;
954 if (copy_from_user(&kvm_userspace_mem, argp,
955 sizeof kvm_userspace_mem))
956 goto out;
957
958 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
959 if (r)
960 goto out;
961 break;
962 }
963 case KVM_GET_DIRTY_LOG: {
964 struct kvm_dirty_log log;
965
966 r = -EFAULT;
967 if (copy_from_user(&log, argp, sizeof log))
968 goto out;
969 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
970 if (r)
971 goto out;
972 break;
973 }
974 default:
975 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
976 }
977out:
978 return r;
979}
980
981static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
982{
983 struct kvm *kvm = vma->vm_file->private_data;
984 struct page *page;
985
986 if (!kvm_is_visible_gfn(kvm, vmf->pgoff))
987 return VM_FAULT_SIGBUS;
988 page = gfn_to_page(kvm, vmf->pgoff);
989 if (is_error_page(page)) {
990 kvm_release_page_clean(page);
991 return VM_FAULT_SIGBUS;
992 }
993 vmf->page = page;
994 return 0;
995}
996
997static struct vm_operations_struct kvm_vm_vm_ops = {
998 .fault = kvm_vm_fault,
999};
1000
1001static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
1002{
1003 vma->vm_ops = &kvm_vm_vm_ops;
1004 return 0;
1005}
1006
1007static struct file_operations kvm_vm_fops = {
1008 .release = kvm_vm_release,
1009 .unlocked_ioctl = kvm_vm_ioctl,
1010 .compat_ioctl = kvm_vm_ioctl,
1011 .mmap = kvm_vm_mmap,
1012};
1013
1014static int kvm_dev_ioctl_create_vm(void)
1015{
1016 int fd, r;
1017 struct inode *inode;
1018 struct file *file;
1019 struct kvm *kvm;
1020
1021 kvm = kvm_create_vm();
1022 if (IS_ERR(kvm))
1023 return PTR_ERR(kvm);
1024 r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
1025 if (r) {
1026 kvm_destroy_vm(kvm);
1027 return r;
1028 }
1029
1030 kvm->filp = file;
1031
1032 return fd;
1033}
1034
1035static long kvm_dev_ioctl(struct file *filp,
1036 unsigned int ioctl, unsigned long arg)
1037{
1038 void __user *argp = (void __user *)arg;
1039 long r = -EINVAL;
1040
1041 switch (ioctl) {
1042 case KVM_GET_API_VERSION:
1043 r = -EINVAL;
1044 if (arg)
1045 goto out;
1046 r = KVM_API_VERSION;
1047 break;
1048 case KVM_CREATE_VM:
1049 r = -EINVAL;
1050 if (arg)
1051 goto out;
1052 r = kvm_dev_ioctl_create_vm();
1053 break;
1054 case KVM_CHECK_EXTENSION:
1055 r = kvm_dev_ioctl_check_extension((long)argp);
1056 break;
1057 case KVM_GET_VCPU_MMAP_SIZE:
1058 r = -EINVAL;
1059 if (arg)
1060 goto out;
1061 r = 2 * PAGE_SIZE;
1062 break;
1063 default:
1064 return kvm_arch_dev_ioctl(filp, ioctl, arg);
1065 }
1066out:
1067 return r;
1068}
1069
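
kvm_dev_ioctl() above is the handler behind /dev/kvm itself: the misc character device registered just below answers KVM_GET_API_VERSION, KVM_CHECK_EXTENSION and KVM_CREATE_VM before any VM exists, and KVM_CREATE_VM returns the anon-inode fd served by kvm_vm_fops. A minimal userspace opening sequence, as a sketch rather than anything taken from the patch:

#include <fcntl.h>
#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Open the module's misc device and create one VM; returns the VM fd. */
static int open_vm(void)
{
	int kvm_fd = open("/dev/kvm", O_RDWR);
	int vm_fd;

	if (kvm_fd < 0)
		return -1;
	if (ioctl(kvm_fd, KVM_GET_API_VERSION, 0) != KVM_API_VERSION)
		return -1;	/* built against a different ABI */
	vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);
	return vm_fd;
}
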
1070static struct file_operations kvm_chardev_ops = {
1071 .unlocked_ioctl = kvm_dev_ioctl,
1072 .compat_ioctl = kvm_dev_ioctl,
1073};
1074
1075static struct miscdevice kvm_dev = {
1076 KVM_MINOR,
1077 "kvm",
1078 &kvm_chardev_ops,
1079};
1080
1081static void hardware_enable(void *junk)
1082{
1083 int cpu = raw_smp_processor_id();
1084
1085 if (cpu_isset(cpu, cpus_hardware_enabled))
1086 return;
1087 cpu_set(cpu, cpus_hardware_enabled);
1088 kvm_arch_hardware_enable(NULL);
1089}
1090
1091static void hardware_disable(void *junk)
1092{
1093 int cpu = raw_smp_processor_id();
1094
1095 if (!cpu_isset(cpu, cpus_hardware_enabled))
1096 return;
1097 cpu_clear(cpu, cpus_hardware_enabled);
1098 decache_vcpus_on_cpu(cpu);
1099 kvm_arch_hardware_disable(NULL);
1100}
1101
1102static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
1103 void *v)
1104{
1105 int cpu = (long)v;
1106
1107 val &= ~CPU_TASKS_FROZEN;
1108 switch (val) {
1109 case CPU_DYING:
1110 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
1111 cpu);
1112 hardware_disable(NULL);
1113 break;
1114 case CPU_UP_CANCELED:
1115 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
1116 cpu);
1117 smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
1118 break;
1119 case CPU_ONLINE:
1120 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
1121 cpu);
1122 smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
1123 break;
1124 }
1125 return NOTIFY_OK;
1126}
1127
1128static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
1129 void *v)
1130{
1131 if (val == SYS_RESTART) {
1132 /*
1133 * Some (well, at least mine) BIOSes hang on reboot if
1134 * in vmx root mode.
1135 */
1136 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
1137 on_each_cpu(hardware_disable, NULL, 0, 1);
1138 }
1139 return NOTIFY_OK;
1140}
1141
1142static struct notifier_block kvm_reboot_notifier = {
1143 .notifier_call = kvm_reboot,
1144 .priority = 0,
1145};
1146
1147void kvm_io_bus_init(struct kvm_io_bus *bus)
1148{
1149 memset(bus, 0, sizeof(*bus));
1150}
1151
1152void kvm_io_bus_destroy(struct kvm_io_bus *bus)
1153{
1154 int i;
1155
1156 for (i = 0; i < bus->dev_count; i++) {
1157 struct kvm_io_device *pos = bus->devs[i];
1158
1159 kvm_iodevice_destructor(pos);
1160 }
1161}
1162
1163struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
1164{
1165 int i;
1166
1167 for (i = 0; i < bus->dev_count; i++) {
1168 struct kvm_io_device *pos = bus->devs[i];
1169
1170 if (pos->in_range(pos, addr))
1171 return pos;
1172 }
1173
1174 return NULL;
1175}
1176
1177void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
1178{
1179 BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
1180
1181 bus->devs[bus->dev_count++] = dev;
1182}
1183
1184static struct notifier_block kvm_cpu_notifier = {
1185 .notifier_call = kvm_cpu_hotplug,
1186 .priority = 20, /* must be > scheduler priority */
1187};
1188
1189static u64 vm_stat_get(void *_offset)
1190{
1191 unsigned offset = (long)_offset;
1192 u64 total = 0;
1193 struct kvm *kvm;
1194
1195 spin_lock(&kvm_lock);
1196 list_for_each_entry(kvm, &vm_list, vm_list)
1197 total += *(u32 *)((void *)kvm + offset);
1198 spin_unlock(&kvm_lock);
1199 return total;
1200}
1201
1202DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");
1203
1204static u64 vcpu_stat_get(void *_offset)
1205{
1206 unsigned offset = (long)_offset;
1207 u64 total = 0;
1208 struct kvm *kvm;
1209 struct kvm_vcpu *vcpu;
1210 int i;
1211
1212 spin_lock(&kvm_lock);
1213 list_for_each_entry(kvm, &vm_list, vm_list)
1214 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
1215 vcpu = kvm->vcpus[i];
1216 if (vcpu)
1217 total += *(u32 *)((void *)vcpu + offset);
1218 }
1219 spin_unlock(&kvm_lock);
1220 return total;
1221}
1222
1223DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");
1224
1225static struct file_operations *stat_fops[] = {
1226 [KVM_STAT_VCPU] = &vcpu_stat_fops,
1227 [KVM_STAT_VM] = &vm_stat_fops,
1228};
1229
1230static void kvm_init_debug(void)
1231{
1232 struct kvm_stats_debugfs_item *p;
1233
1234 debugfs_dir = debugfs_create_dir("kvm", NULL);
1235 for (p = debugfs_entries; p->name; ++p)
1236 p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
1237 (void *)(long)p->offset,
1238 stat_fops[p->kind]);
1239}
1240
1241static void kvm_exit_debug(void)
1242{
1243 struct kvm_stats_debugfs_item *p;
1244
1245 for (p = debugfs_entries; p->name; ++p)
1246 debugfs_remove(p->dentry);
1247 debugfs_remove(debugfs_dir);
1248}
1249
1250static int kvm_suspend(struct sys_device *dev, pm_message_t state)
1251{
1252 hardware_disable(NULL);
1253 return 0;
1254}
1255
1256static int kvm_resume(struct sys_device *dev)
1257{
1258 hardware_enable(NULL);
1259 return 0;
1260}
1261
1262static struct sysdev_class kvm_sysdev_class = {
1263 .name = "kvm",
1264 .suspend = kvm_suspend,
1265 .resume = kvm_resume,
1266};
1267
1268static struct sys_device kvm_sysdev = {
1269 .id = 0,
1270 .cls = &kvm_sysdev_class,
1271};
1272
1273struct page *bad_page;
1274
1275static inline
1276struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
1277{
1278 return container_of(pn, struct kvm_vcpu, preempt_notifier);
1279}
1280
1281static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
1282{
1283 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
1284
1285 kvm_arch_vcpu_load(vcpu, cpu);
1286}
1287
1288static void kvm_sched_out(struct preempt_notifier *pn,
1289 struct task_struct *next)
1290{
1291 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
1292
1293 kvm_arch_vcpu_put(vcpu);
1294}
1295
1296int kvm_init(void *opaque, unsigned int vcpu_size,
1297 struct module *module)
1298{
1299 int r;
1300 int cpu;
1301
1302 kvm_init_debug();
1303
1304 r = kvm_arch_init(opaque);
1305 if (r)
1306 goto out_fail;
1307
1308 bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
1309
1310 if (bad_page == NULL) {
1311 r = -ENOMEM;
1312 goto out;
1313 }
1314
1315 r = kvm_arch_hardware_setup();
1316 if (r < 0)
1317 goto out_free_0;
1318
1319 for_each_online_cpu(cpu) {
1320 smp_call_function_single(cpu,
1321 kvm_arch_check_processor_compat,
1322 &r, 0, 1);
1323 if (r < 0)
1324 goto out_free_1;
1325 }
1326
1327 on_each_cpu(hardware_enable, NULL, 0, 1);
1328 r = register_cpu_notifier(&kvm_cpu_notifier);
1329 if (r)
1330 goto out_free_2;
1331 register_reboot_notifier(&kvm_reboot_notifier);
1332
1333 r = sysdev_class_register(&kvm_sysdev_class);
1334 if (r)
1335 goto out_free_3;
1336
1337 r = sysdev_register(&kvm_sysdev);
1338 if (r)
1339 goto out_free_4;
1340
1341 /* A kmem cache lets us meet the alignment requirements of fx_save. */
1342 kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
1343 __alignof__(struct kvm_vcpu),
1344 0, NULL);
1345 if (!kvm_vcpu_cache) {
1346 r = -ENOMEM;
1347 goto out_free_5;
1348 }
1349
1350 kvm_chardev_ops.owner = module;
1351
1352 r = misc_register(&kvm_dev);
1353 if (r) {
1354 printk(KERN_ERR "kvm: misc device register failed\n");
1355 goto out_free;
1356 }
1357
1358 kvm_preempt_ops.sched_in = kvm_sched_in;
1359 kvm_preempt_ops.sched_out = kvm_sched_out;
1360
1361 return 0;
1362
1363out_free:
1364 kmem_cache_destroy(kvm_vcpu_cache);
1365out_free_5:
1366 sysdev_unregister(&kvm_sysdev);
1367out_free_4:
1368 sysdev_class_unregister(&kvm_sysdev_class);
1369out_free_3:
1370 unregister_reboot_notifier(&kvm_reboot_notifier);
1371 unregister_cpu_notifier(&kvm_cpu_notifier);
1372out_free_2:
1373 on_each_cpu(hardware_disable, NULL, 0, 1);
1374out_free_1:
1375 kvm_arch_hardware_unsetup();
1376out_free_0:
1377 __free_page(bad_page);
1378out:
1379 kvm_arch_exit();
1380 kvm_exit_debug();
1381out_fail:
1382 return r;
1383}
1384EXPORT_SYMBOL_GPL(kvm_init);
1385
1386void kvm_exit(void)
1387{
1388 misc_deregister(&kvm_dev);
1389 kmem_cache_destroy(kvm_vcpu_cache);
1390 sysdev_unregister(&kvm_sysdev);
1391 sysdev_class_unregister(&kvm_sysdev_class);
1392 unregister_reboot_notifier(&kvm_reboot_notifier);
1393 unregister_cpu_notifier(&kvm_cpu_notifier);
1394 on_each_cpu(hardware_disable, NULL, 0, 1);
1395 kvm_arch_hardware_unsetup();
1396 kvm_arch_exit();
1397 kvm_exit_debug();
1398 __free_page(bad_page);
1399}
1400EXPORT_SYMBOL_GPL(kvm_exit);